In [1]:
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time
from IPython.display import clear_output

from sklearn import pipeline      # Pipeline, make_pipeline
from sklearn import preprocessing # StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import inspection      # permutation_importance
import scikitplot as skplt

In [2]:
########################################################### CLASSIFIERS

#### MULT
from sklearn.linear_model   import LogisticRegression
from sklearn.linear_model   import RidgeClassifier
from sklearn.svm            import SVC
from sklearn.svm            import NuSVC
from sklearn.svm            import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors      import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes    import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble       import StackingClassifier

#### TREE
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier, plot_tree
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
#from ngboost               import NGBClassifier
#from rgf.sklearn           import RGFClassifier, FastRGFClassifier

########################################################### REGRESSORS
from sklearn.linear_model  import ElasticNet, Ridge, Lasso, BayesianRidge, ARDRegression, TweedieRegressor
from sklearn.svm           import LinearSVR, NuSVR, SVR
from sklearn.ensemble      import BaggingRegressor
from sklearn.kernel_ridge  import KernelRidge

In [5]:
# Shell escape: list the CSV files present in the PCA / binary_log data directory.
!ls ../../csv_files/FS_combos/PCA/binary_log/

test_pca_binary_log.csv  train_pca_binary_log.csv


In [6]:
# Relative directory containing the train/test CSVs read by the next cell.
path = "../../csv_files/FS_combos/PCA/binary_log/"

In [7]:
# Build the two file locations first, then load each table.
# The column named "Unnamed: 0" in the CSVs is used as the DataFrame index.
train_csv = path + "train_pca_binary_log.csv"
test_csv = path + "test_pca_binary_log.csv"

df = pd.read_csv(train_csv, index_col="Unnamed: 0")
df_test = pd.read_csv(test_csv, index_col="Unnamed: 0")

In [8]:
# Preview the test table (pandas abbreviates the middle rows in the rendered output).
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,id
0,0.510118,0.421942,0.626516,-0.167114,1.176350,-0.794320,0.132950,-0.534262,-0.258720,-0.071362,0
1,0.548365,0.835342,1.019503,0.509426,0.285459,0.664287,-0.235486,-0.102751,-0.156176,-0.103416,5
2,0.577384,0.790668,0.773640,0.367115,0.764792,0.617939,-0.167900,-0.708410,-0.196079,-0.105474,15
3,0.650529,0.224976,-0.661692,-0.181756,-0.387711,-0.700917,-0.256491,-0.832766,1.219691,0.055642,16
4,-0.946428,0.050972,0.174878,-0.983611,-0.055924,-0.311808,0.738920,0.212554,-0.145774,-0.078747,17
...,...,...,...,...,...,...,...,...,...,...,...
199995,0.759385,-0.153987,0.218206,1.503147,-0.086686,-0.097091,-0.195936,-0.071761,-0.109201,-0.079440,499987
199996,1.234107,-0.338382,0.203815,0.260633,0.195947,-0.542918,-0.062555,-0.648751,-0.174280,-0.071621,499990
199997,1.100958,0.079177,0.321608,0.813627,-0.269592,0.742756,0.751775,0.184986,0.005856,-0.106070,499991
199998,0.076422,1.365356,-0.229600,-0.017435,0.244578,0.661179,-0.334912,-0.739477,-0.200611,-0.098465,499994


### Select data for experiment

In [9]:
# Features for the train + validation pool: everything except the label and the row identifier.
non_feature_cols = ["target", "id"]
x = df.drop(columns=non_feature_cols)

# Label column.
# NOTE(review): an earlier comment here read "0 = No, 1 = Yes", but the regressors fitted
# below predict values around 7.4, so the target looks continuous - confirm its meaning.
y = df["target"]

# New (unlabelled) data: only the identifier column is removed.
x_test = df_test.drop(columns=["id"])

In [10]:
# Keep 20% of the rows aside for validation; the fixed seed makes the split repeatable.
split_parts = model_selection.train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_val, y_train, y_val = split_parts

In [11]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(x_train, y_train)
lr

LinearRegression()

In [12]:
# Linear-regression predictions for the validation split.
y_pred_regr = lr.predict(X=x_val)
y_pred_regr

array([7.37268774, 7.24942305, 7.43222316, ..., 7.67820865, 7.38118011,
       7.51018125])

In [13]:
from sklearn.metrics import mean_squared_error

In [14]:
# Validation error of the linear-regression predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8708189855548024

In [15]:
# ARD (automatic relevance determination) regression with default hyperparameters,
# fitted on the training split.
# ARDRegression is already imported in the regressors import cell near the top of the
# notebook, so it is not re-imported here.
ar = ARDRegression()
ar.fit(x_train, y_train)

ARDRegression()

In [16]:
# ARD-regression predictions for the validation split.
y_pred_regr = ar.predict(X=x_val)
y_pred_regr

array([7.37363771, 7.24907359, 7.43294448, ..., 7.67863166, 7.38206343,
       7.51002349])

In [17]:
# Validation error of the ARD-regression predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8708189132223685

In [18]:
# Bayesian ridge regression with default hyperparameters, fitted on the training split.
# BayesianRidge is already imported in the regressors import cell near the top of the
# notebook, so it is not re-imported here.
br = BayesianRidge()
br.fit(x_train, y_train)

BayesianRidge()

In [19]:
# Bayesian-ridge predictions for the validation split.
y_pred_regr = br.predict(X=x_val)
y_pred_regr

array([7.37274297, 7.2495696 , 7.43225456, ..., 7.67770172, 7.38168882,
       7.51011731])

In [20]:
# Validation error of the Bayesian-ridge predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.870817785375026

In [21]:
# Elastic-net regressor created with no arguments, i.e. library-default hyperparameters.
en = ElasticNet()

In [22]:
# Fit the elastic net on the training split (the fitted estimator is echoed as the cell output).
en.fit(X=x_train, y=y_train)

ElasticNet()

In [23]:
# Elastic-net predictions for the validation split.
# NOTE(review): the saved output shows the same value at every displayed position, which
# suggests the default regularisation shrank all coefficients to zero - confirm and
# consider tuning alpha.
y_pred_regr = en.predict(X=x_val)
y_pred_regr

array([7.45485702, 7.45485702, 7.45485702, ..., 7.45485702, 7.45485702,
       7.45485702])

In [24]:
# Validation error of the elastic-net predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8854278096232179

In [25]:
# Ridge regressor created with no arguments, i.e. library-default hyperparameters.
r = Ridge()

In [26]:
# Fit the ridge model on the training split (the fitted estimator is echoed as the cell output).
r.fit(X=x_train, y=y_train)

Ridge()

In [27]:
# Ridge predictions for the validation split.
y_pred_regr = r.predict(X=x_val)
y_pred_regr

array([7.37268837, 7.24942471, 7.43222352, ..., 7.67820288, 7.3811859 ,
       7.51018053])

In [28]:
# Validation error of the ridge predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8708189715410855

In [29]:
# Lasso with default hyperparameters, built and fitted in one chained expression.
# fit() returns the estimator, so the trailing bare name echoes it as the cell output.
l = Lasso().fit(x_train, y_train)
l

Lasso()

In [30]:
# Lasso predictions for the validation split.
# NOTE(review): the saved output shows the same value at every displayed position, which
# suggests the default regularisation shrank all coefficients to zero - confirm and
# consider tuning alpha.
y_pred_regr = l.predict(X=x_val)
y_pred_regr

array([7.45485702, 7.45485702, 7.45485702, ..., 7.45485702, 7.45485702,
       7.45485702])

In [31]:
# Validation error of the lasso predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8854278096232179

In [32]:
# Tweedie GLM with default hyperparameters, built and fitted in one chained expression.
# fit() returns the estimator, so the trailing bare name echoes it as the cell output.
tr = TweedieRegressor().fit(x_train, y_train)
tr

TweedieRegressor()

In [33]:
# Tweedie-regressor predictions for the validation split.
y_pred_regr = tr.predict(X=x_val)
y_pred_regr

array([7.42615486, 7.38904547, 7.45142229, ..., 7.50060675, 7.48284386,
       7.4673085 ])

In [34]:
# Validation error of the Tweedie-regressor predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8781903853900167

In [35]:
# Linear support-vector regressor, otherwise default hyperparameters.
# The underlying solver shuffles data pseudo-randomly, so the seed is pinned (same value as
# the train/validation split) to make Restart & Run All reproduce the same fit.
# Scores may differ slightly from previously saved, unseeded outputs.
lsvr = LinearSVR(random_state=0)
lsvr.fit(x_train, y_train)

LinearSVR()

In [36]:
# Linear-SVR predictions for the validation split.
y_pred_regr = lsvr.predict(X=x_val)
y_pred_regr

array([7.3666991 , 7.1842622 , 7.43330551, ..., 7.74237122, 7.37316795,
       7.60834581])

In [37]:
# Validation error of the linear-SVR predictions.
# squared=False makes scikit-learn return the ROOT mean squared error, so `mse` holds RMSE
# (the name is kept unchanged for compatibility with the rest of the notebook).
# Arguments are passed by keyword so ground truth and predictions land in their documented roles.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred_regr, squared=False)
mse

0.8762247525527861