In [1]:
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time
from IPython.display import clear_output

from sklearn import pipeline      # Pipeline, make_pipeline
from sklearn import preprocessing # StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import inspection      # permutation_importance
import scikitplot as skplt

In [2]:
########################################################### CLASSIFIERS

#### MULT
from sklearn.linear_model   import LogisticRegression
from sklearn.linear_model   import RidgeClassifier
from sklearn.svm            import SVC
from sklearn.svm            import NuSVC
from sklearn.svm            import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors      import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes    import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble       import StackingClassifier

#### TREE
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier, plot_tree
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
#from ngboost               import NGBClassifier
#from rgf.sklearn           import RGFClassifier, FastRGFClassifier

########################################################### REGRESSORS
from sklearn.linear_model  import ElasticNet, Ridge, Lasso, BayesianRidge, ARDRegression, TweedieRegressor
from sklearn.svm           import LinearSVR, NuSVR, SVR
from sklearn.ensemble      import BaggingRegressor
from sklearn.kernel_ridge  import KernelRidge

In [4]:
# Shell escape: list the input directory to check which CSV files are available.
!ls ../../csv_files/FS_combos/UNI/freq_num/

UNI_test_freq_num.csv  UNI_train_freq_num.csv


In [5]:
# Directory (relative to the notebook) holding the train/test CSVs; keeps its
# trailing slash because file names are appended to it by string concatenation.
path = "../../csv_files/FS_combos/UNI/freq_num/"

In [6]:
# Read the train and test tables; the CSV column "Unnamed: 0" is used as the
# row index of each frame.
df = pd.read_csv(f"{path}UNI_train_freq_num.csv", index_col="Unnamed: 0")
df_test = pd.read_csv(f"{path}UNI_test_freq_num.csv", index_col="Unnamed: 0")

### Select data for experiment

In [7]:
# Target column of the training frame.
# NOTE(review): an older note described it as binary (0 = No, 1 = Yes), yet every
# model fitted below is a regressor and predicts values around 7.4 -- the target
# appears to be continuous; confirm.
y = df["target"]

# Feature matrix: everything except the target and the 'id' column. This pool is
# later split into training and validation parts.
x = df.drop(["target", "id"], axis=1)

# Features of the separate test file; only its 'id' column is removed.
x_test = df_test.drop("id", axis=1)

In [8]:
# Hold out 20% of the rows as a validation set; random_state=0 fixes the shuffle.
x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y, test_size=0.2, random_state=0)

In [9]:
# Baseline: ordinary least-squares linear regression with default settings.
# NOTE(review): this import lives in an analysis cell rather than in the import
# cells at the top of the notebook.
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
# fit() is the last expression so that the fitted estimator's repr is displayed.
lr.fit(x_train, y_train)

LinearRegression()

In [10]:
# Validation-set predictions of the linear regression. `y_pred_regr` is reused
# by every model in this notebook, so score it before the next predict cell runs.
y_pred_regr = lr.predict(x_val)
y_pred_regr

array([7.38269783, 7.35437693, 7.35129868, ..., 7.76869206, 7.54688471,
       7.39350518])

In [11]:
from sklearn.metrics import mean_squared_error

In [12]:
# Validation error of the linear regression. With squared=False the metric is the
# *root* mean squared error, so `mse` holds an RMSE (name kept for compatibility
# with the rest of the notebook).
# Arguments are passed in the documented (y_true, y_pred) order; the value is the
# same either way for this symmetric metric.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8737656748827035

In [13]:
# Automatic Relevance Determination regression with default settings, fitted on
# the training split. ARDRegression is already imported with the other regressors
# in the import cell at the top, so it is not re-imported here.
ar = ARDRegression()
# fit() is the last expression so that the fitted estimator's repr is displayed.
ar.fit(x_train, y_train)

ARDRegression()

In [14]:
# Validation-set predictions of the ARD regression; overwrites the previous
# model's `y_pred_regr`.
y_pred_regr = ar.predict(x_val)
y_pred_regr

array([7.38152091, 7.35627844, 7.35268825, ..., 7.76862773, 7.54576691,
       7.39302713])

In [15]:
# Validation RMSE of the ARD regression (squared=False returns the root of the
# MSE, so `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.873762527579866

In [16]:
# Bayesian ridge regression with default settings, fitted on the training split.
# BayesianRidge is already imported with the other regressors in the import cell
# at the top, so the duplicate in-cell import is dropped.
br = BayesianRidge()
# fit() is the last expression so that the fitted estimator's repr is displayed.
br.fit(x_train, y_train)

BayesianRidge()

In [17]:
# Validation-set predictions of the Bayesian ridge model; overwrites the previous
# model's `y_pred_regr`.
y_pred_regr = br.predict(x_val)
y_pred_regr

array([7.38299485, 7.35457299, 7.35156157, ..., 7.76855013, 7.54656845,
       7.39372887])

In [18]:
# Validation RMSE of the Bayesian ridge model (squared=False returns the root of
# the MSE, so `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8737661024312509

In [19]:
# Elastic-net regressor; no arguments are passed, so library defaults apply.
en = ElasticNet()

In [20]:
# Fit the elastic net on the training split (the cell displays the estimator repr).
en.fit(x_train, y_train)

ElasticNet()

In [21]:
# Validation-set predictions of the elastic net; overwrites `y_pred_regr`.
# NOTE(review): the recorded output shows the same value (7.45485702) for every
# displayed row, which suggests the penalty shrank all coefficients to zero and
# only the intercept remains -- check en.coef_ and consider a smaller alpha or
# scaled features.
y_pred_regr = en.predict(x_val)
y_pred_regr

array([7.45485702, 7.45485702, 7.45485702, ..., 7.45485702, 7.45485702,
       7.45485702])

In [22]:
# Validation RMSE of the elastic net (squared=False returns the root of the MSE,
# so `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8854278096232179

In [23]:
# Ridge regressor; no arguments are passed, so library defaults apply.
r = Ridge()

In [24]:
# Fit the ridge model on the training split (the cell displays the estimator repr).
r.fit(x_train, y_train)

Ridge()

In [25]:
# Validation-set predictions of the ridge model; overwrites the previous model's
# `y_pred_regr`.
y_pred_regr = r.predict(x_val)
y_pred_regr

array([7.38292813, 7.35452894, 7.35150252, ..., 7.76858203, 7.54663949,
       7.39367862])

In [26]:
# Validation RMSE of the ridge model (squared=False returns the root of the MSE,
# so `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8737659969411106

In [27]:
# Lasso regressor with library defaults, fitted on the training split.
# NOTE(review): the single-letter name `l` is easy to misread as `1`; it is kept
# because the following predict cell refers to it.
l = Lasso()
l.fit(x_train, y_train)

Lasso()

In [28]:
# Validation-set predictions of the lasso; overwrites `y_pred_regr`.
# NOTE(review): the recorded output is the same constant (7.45485702) for every
# displayed row, identical to the elastic-net output -- this suggests all
# coefficients were shrunk to zero; check l.coef_ and consider a smaller alpha
# or scaled features.
y_pred_regr = l.predict(x_val)
y_pred_regr

array([7.45485702, 7.45485702, 7.45485702, ..., 7.45485702, 7.45485702,
       7.45485702])

In [29]:
# Validation RMSE of the lasso (squared=False returns the root of the MSE, so
# `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8854278096232179

In [30]:
# Tweedie generalized linear regressor with library defaults, fitted on the
# training split; fit() is last so the estimator repr is displayed.
tr = TweedieRegressor()
tr.fit(x_train, y_train)

TweedieRegressor()

In [31]:
# Validation-set predictions of the Tweedie regressor; overwrites `y_pred_regr`.
# NOTE(review): the recorded predictions all lie within about 0.01 of 7.46, so
# the model is close to predicting a constant -- presumably the default
# regularisation is too strong for these features; confirm.
y_pred_regr = tr.predict(x_val)
y_pred_regr

array([7.45571576, 7.45370136, 7.45519953, ..., 7.46752268, 7.45510297,
       7.45486487])

In [32]:
# Validation RMSE of the Tweedie regressor (squared=False returns the root of the
# MSE, so `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8848960843361352

In [33]:
# Linear support-vector regressor fitted on the training split.
# random_state pins the solver's internal random number generator so that a
# re-run reproduces the same fit; 0 matches the seed used for the
# train/validation split.
lsvr = LinearSVR(random_state=0)
# fit() is the last expression so that the fitted estimator's repr is displayed.
lsvr.fit(x_train, y_train)

LinearSVR()

In [34]:
# Validation-set predictions of the linear SVR; overwrites the previous model's
# `y_pred_regr`.
y_pred_regr = lsvr.predict(x_val)
y_pred_regr

array([7.31344979, 7.36575079, 7.25941325, ..., 7.92650178, 7.55929004,
       7.458503  ])

In [35]:
# Validation RMSE of the linear SVR (squared=False returns the root of the MSE,
# so `mse` is an RMSE despite its name).
# Arguments follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_val, y_pred_regr, squared=False)
mse

0.8777918112517679