In [108]:
import os
import pandas as pd
import datetime
import time
import numpy as np
import json # Export dictionary of tuned parameters.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV # For hyperparameter tuning.



# Evaluation metrics import
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score, 
    confusion_matrix
)

# Models import
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Visualisations import
import seaborn as sns
import matplotlib.pyplot as plt

# TIPS:
FEATURE SELECTION:

Feature selection methods, LASSO, and other lp penalties, LASSO IS GREAT because it sets to 0. Others I haven't tested.

FORGET ABOUT AIC, BIC, subset selection, we can't fit those models it takes too long.

Play around Lasso lambda to get more or less features.

BE GREEDY! lASSO on subsets of the data. keep more variables as well. Low time approaches.

https://stats.stackexchange.com/questions/27300/using-principal-component-analysis-pca-for-feature-selection (idk if this says something useful might be interesting to check.) And read the p>>n chapter of book for other methods. 

<hr>
CLASSIFICATION:

DICHOTOMY: CLASSIFICATION AND FEATURE SELECTION TO BE TREATED LIKE ORTHOGONAL ELEMENTS.

Get variables from Lasso non zero coefficients and then fit a model, go big, svm, rf, whatever, don't try to classify using a Lasso. 

<hr>
WORKFLOW:

If we don't hyper parameter tune there is no reason we can't use CV for an estimate of the test error. We have few datapoints, every point is worth saving. ONLY IF WE DON'T HYPER PARAM TUNE WE CAN DO THIS, but baseline classification algorithms are already very good!.

CLASSES VERY UNBALANCED, WE NEED TO TAKE THIS INTO ACCOUNT:
REWEIGHTING things might be good for this. Some algorithms have options to reweight like rf or whatever.



In [109]:
#This is for data, which is most important split.
RANDOM_STATE = 42

In [110]:
def load_data(file_path):
    return pd.read_csv(file_path, compression='gzip')


In [111]:
file_path = "Datasets/data2.csv.gz" 

# Load the dataset
data = load_data(file_path)

In [112]:
data

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
796,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [211]:
data["label"].value_counts()
# DATA IS VERY UNBALANCED, SOME ESTIMATORS HAVE OPTIONS TO REWEIGHT THE DATA!!

label
-1    722
 1     78
Name: count, dtype: int64

In [113]:
def train_split(data,label_col,random_state,test_size=0.2):
    """
    Needs to be like this now as we can't preprocess the train data as we will run cross validation on it, therefore we need to preprocess
    on each split.

    """
    X = data.drop(columns=[label_col]).values
    y = LabelEncoder().fit_transform(data[label_col].values)
    
        # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train,X_test,y_train, y_test 

In [208]:
# IDEALLY THIS IS NOT NECESSARY VIA Cross validation if we don't hyperparameter tune.
X_TRAIN_G,X_TEST_G,y_TRAIN_G,y_TEST_G = train_split(data,label_col="label",random_state=RANDOM_STATE,test_size=0.2)

In [195]:

X_TRAIN_G.shape

(640, 100000)

In [196]:
# The variables are 1 0 exist no exist.
np.unique(X_TRAIN_G)


array([0, 1])

In [197]:
X_TRAIN_G[:,1:100].shape

(640, 99)

In [198]:
#[:,1:100000]

In [199]:
tic = time.time()
lasso = Lasso(alpha=0.01)

lasso.fit(X_TRAIN_G,y_TRAIN_G)
toc = time.time()

print(toc-tic)
# Try this it is super fast.

0.7985219955444336


In [200]:
x_subset = X_TRAIN_G[:,lasso.coef_!=0]

In [201]:
# Try a random forest.
rf = RandomForestClassifier()
rf.fit(X_TEST_G[:,lasso.coef_!=0],y_TEST_G)
y_pred = rf.predict(X_TEST_G[:,lasso.coef_!=0])

In [202]:
rf

In [203]:
# svc = SVC()
# #Fit using only the variables lasso didn't set to 0.
# svc.fit(X_TEST_G[:,lasso.coef_!=0],y_TEST_G)
# y_pred = svc.predict(X_TEST_G[:,lasso.coef_!=0])


In [204]:

balanced_accuracy = balanced_accuracy_score(y_TEST_G, y_pred)


In [212]:
# This gets worse the bigger the test, meaning we train with less data. Try pure CV and reweighting methods.
balanced_accuracy
# 0.9 balanced accuracy as baseline if I am not mistaken?

np.float64(0.9027777777777778)

In [206]:
print(balanced_accuracy)

0.9027777777777778


In [207]:
np.unique(lasso.coef_)

array([0.        , 0.0058284 , 0.04093832, 0.04689534, 0.11708568,
       0.19455857, 0.27462667])