In [1]:
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time
from IPython.display import clear_output

from sklearn import pipeline      # Pipeline, make_pipeline
from sklearn import preprocessing # StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import inspection      # permutation_importance
import scikitplot as skplt

In [2]:
########################################################### CLASSIFIERS

#### MULT
from sklearn.linear_model   import LogisticRegression
from sklearn.linear_model   import RidgeClassifier
from sklearn.svm            import SVC
from sklearn.svm            import NuSVC
from sklearn.svm            import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors      import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes    import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble       import StackingClassifier

#### TREE
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier, plot_tree
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
#from ngboost               import NGBClassifier
#from rgf.sklearn           import RGFClassifier, FastRGFClassifier

########################################################### REGRESSORS
from sklearn.linear_model  import ElasticNet, Ridge, Lasso, BayesianRidge, ARDRegression, TweedieRegressor
from sklearn.svm           import LinearSVR, NuSVR, SVR
from sklearn.ensemble      import BaggingRegressor
from sklearn.kernel_ridge  import KernelRidge

In [4]:
# Shell listing of the input folder; the recorded output shows the train and test CSVs loaded below.
!ls ../../csv_files/FS_combos/UNI/freq_log/

UNI_test_freq_log.csv  UNI_train_freq_log.csv


In [6]:
# Folder holding the UNI freq_log CSVs. The trailing slash matters: file names
# are appended directly to this string when the data is read.
path = "../../csv_files/FS_combos/UNI/freq_log/"

In [7]:
# Load the labelled (train) and unlabelled (test) tables; the CSV column named
# "Unnamed: 0" is used as the index of each frame.
df = pd.read_csv(f"{path}UNI_train_freq_log.csv", index_col="Unnamed: 0")
df_test = pd.read_csv(f"{path}UNI_test_freq_log.csv", index_col="Unnamed: 0")

### Select data for experiment

In [8]:
# Feature matrix for the train + validation split: everything except the label and the row id.
x = df.drop(["target", "id"], axis="columns")
# Regression target. NOTE(review): an earlier comment described this as 0 = No / 1 = Yes,
# but the regressors fitted below predict values around 7.4, so it looks continuous — confirm.
y = df.loc[:, "target"]

# Features of the new, unlabelled data (row id removed).
x_test = df_test.drop("id", axis="columns")

In [9]:
# Hold out 20% of the labelled rows for validation; random_state=0 keeps the split repeatable.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [10]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split.
# fit() returns the estimator itself, so chaining keeps `lr` bound to the fitted model.
lr = LinearRegression().fit(x_train, y_train)
lr

LinearRegression()

In [11]:
# Score the validation split with the linear-regression model; the bare name renders the array.
y_pred_regr = lr.predict(X=x_val)
y_pred_regr

array([7.3979304 , 7.33907832, 7.40082518, ..., 7.5194016 , 7.66858539,
       7.42838271])

In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
# Validation error for the predictions currently held in y_pred_regr.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the ground
# truth goes first. squared=False returns the square root, i.e. this is an RMSE
# even though the variable is called `mse` (name kept in case later cells read it).
# NOTE(review): newer scikit-learn releases deprecate `squared` in favour of
# root_mean_squared_error — confirm against the installed version.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8761753157129161

In [14]:
# Fit an ARD (automatic relevance determination) linear regression on the training split.
# ARDRegression is already imported in the regressor import cell near the top of the
# notebook, so it is not re-imported here.
ar = ARDRegression()
ar.fit(x_train, y_train)

ARDRegression()

In [15]:
# Validation-split predictions from the ARD model (overwrites the previous y_pred_regr).
y_pred_regr = ar.predict(X=x_val)
y_pred_regr

array([7.39929752, 7.33525545, 7.39718741, ..., 7.52171667, 7.6678293 ,
       7.42773625])

In [16]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Arguments follow the documented (y_true, y_pred) order; the variable keeps the
# name `mse` in case later cells read it.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8761941037638772

In [17]:
# Bayesian ridge regression with default priors, fitted on the training split.
# BayesianRidge is already imported in the regressor import cell near the top of
# the notebook, so it is not re-imported here.
br = BayesianRidge()
br.fit(x_train, y_train)

BayesianRidge()

In [18]:
# Validation-split predictions from the Bayesian ridge model (overwrites y_pred_regr).
y_pred_regr = br.predict(X=x_val)
y_pred_regr

array([7.39819384, 7.3392898 , 7.40103551, ..., 7.51916283, 7.66823714,
       7.42857027])

In [19]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Ground truth is passed first, matching mean_squared_error(y_true, y_pred).
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8761757359341672

In [20]:
# Elastic net with scikit-learn's default settings.
# NOTE(review): the recorded validation predictions for this model are one repeated
# value, which suggests the default penalty shrinks every coefficient to zero —
# consider tuning alpha and/or scaling the features.
en = ElasticNet()

In [21]:
# Train the elastic net on the training split; the returned estimator is the cell output.
en.fit(X=x_train, y=y_train)

ElasticNet()

In [22]:
# Validation-split predictions from the elastic net (overwrites y_pred_regr).
y_pred_regr = en.predict(X=x_val)
y_pred_regr

array([7.45485702, 7.45485702, 7.45485702, ..., 7.45485702, 7.45485702,
       7.45485702])

In [23]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Ground truth is passed first, matching mean_squared_error(y_true, y_pred).
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8854278096232179

In [24]:
# Ridge regression with scikit-learn's default settings; it is fitted in the next cell.
r = Ridge()

In [25]:
# Train the ridge model on the training split; the returned estimator is the cell output.
r.fit(X=x_train, y=y_train)

Ridge()

In [26]:
# Validation-split predictions from the ridge model (overwrites y_pred_regr).
y_pred_regr = r.predict(X=x_val)
y_pred_regr

array([7.39818387, 7.33928179, 7.40102755, ..., 7.51917186, 7.66825033,
       7.42856317])

In [27]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Ground truth is passed first, matching mean_squared_error(y_true, y_pred).
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8761757184428889

In [28]:
# Lasso with scikit-learn's default settings, fitted on the training split.
# NOTE(review): the recorded validation predictions for this model are one repeated
# value, which suggests the default penalty zeroes every coefficient — consider tuning alpha.
# The single-letter name `l` is kept because the next cell calls `l.predict`.
l = Lasso().fit(x_train, y_train)
l

Lasso()

In [29]:
# Validation-split predictions from the lasso model (overwrites y_pred_regr).
y_pred_regr = l.predict(X=x_val)
y_pred_regr

array([7.45485702, 7.45485702, 7.45485702, ..., 7.45485702, 7.45485702,
       7.45485702])

In [30]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Ground truth is passed first, matching mean_squared_error(y_true, y_pred).
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8854278096232179

In [31]:
# Tweedie GLM with scikit-learn's default settings, fitted on the training split.
# fit() returns the estimator itself, so `tr` is the fitted model and its repr is shown.
tr = TweedieRegressor().fit(x_train, y_train)
tr

TweedieRegressor()

In [32]:
# Validation-split predictions from the Tweedie regressor (overwrites y_pred_regr).
y_pred_regr = tr.predict(X=x_val)
y_pred_regr

array([7.45586744, 7.45486961, 7.45556523, ..., 7.453452  , 7.4556972 ,
       7.45558017])

In [33]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Ground truth is passed first, matching mean_squared_error(y_true, y_pred).
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8853460811318038

In [34]:
# Linear support-vector regression fitted on the training split.
# LinearSVR's solver uses pseudo-random numbers, so it is seeded here (same seed as the
# train/validation split) to make re-runs repeatable. Outputs recorded from an earlier
# unseeded run may differ slightly from a fresh run.
lsvr = LinearSVR(random_state=0)
lsvr.fit(x_train, y_train)

LinearSVR()

In [35]:
# Validation-split predictions from the linear SVR (overwrites y_pred_regr).
y_pred_regr = lsvr.predict(X=x_val)
y_pred_regr

array([7.31936445, 7.32494746, 7.32179528, ..., 7.59806553, 7.74238577,
       7.50790144])

In [36]:
# RMSE (squared=False) on the validation split for the predictions in y_pred_regr.
# Ground truth is passed first, matching mean_squared_error(y_true, y_pred).
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.880019468811793