In [48]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, power_transform, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

In [12]:
X_train = pd.read_csv('0_X_train.csv', index_col='Id')
X_valid = pd.read_csv('1_X_valid.csv', index_col='Id')
X_test  = pd.read_csv('2_X_test.csv', index_col='Id')

y_train = pd.read_csv('0_y_train.csv', index_col='Id')
y_valid = pd.read_csv('1_y_valid.csv', index_col='Id')
y_test  = pd.read_csv('2_y_test.csv', index_col='Id')

num_vars = ['age', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']

In [27]:
transformers = [
    ('power_transform', PowerTransformer(standardize = True), num_vars)  # Power transformation and Standardization
]

std_num = ColumnTransformer(transformers=transformers, remainder='passthrough')

In [28]:
pipe_LDA = Pipeline([
    ('std_num', std_num),
    ("lda", LinearDiscriminantAnalysis())])

In [29]:
# Fit pipeline
pipe_LDA.fit(X_train, y_train) # Fit it to the training data 

# Predict on training data
y_train_pred = pipe_LDA.predict(X_train) # Predict the class labels for the training data
acc = accuracy_score(y_train, y_train_pred) # Calculate the accuracy of the predictions 
print("LDA --- Accuracy on training data:", np.round(acc, 3))

# Predict on valid data
y_valid_pred = pipe_LDA.predict(X_valid) # Predict the class labels for the valid data
acc = accuracy_score(y_valid, y_valid_pred) # Calculate the accuracy of the predictions 
print("LDA --- Accuracy on valid data:", np.round(acc, 3))

# Predict on test data
y_test_pred = pipe_LDA.predict(X_test) # Predict the class labels for the test data
acc = accuracy_score(y_test, y_test_pred) # Calculate the accuracy of the predictions 
print("LDA --- Accuracy on test data:", np.round(acc, 3))

LDA --- Accuracy on training data: 0.819
LDA --- Accuracy on valid data: 0.798
LDA --- Accuracy on test data: 0.821


  y = column_or_1d(y, warn=True)


In [49]:
# Define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate model
scores = cross_val_score(pipe_LDA, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# Summarize result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# https://machinelearningmastery.com/linear-discriminant-analysis-with-python/
# Running the example evaluates the Linear Discriminant Analysis algorithm on the synthetic dataset and reports the average accuracy across the three repeats of 10-fold cross-validation.

Mean Accuracy: 0.816 (0.012)


In [85]:
# Tune LDA Hyperparameters
LDA = LinearDiscriminantAnalysis()

lda_param_grid = {"solver" : ['svd', 'lsqr', 'eigen'],
              "tol" : [0.0001,0.0002,0.0003]}

gsLDA = GridSearchCV(LDA, param_grid = lda_param_grid, cv=cv,
                     scoring="accuracy", n_jobs= -1, verbose = 1)

gsLDA.fit(X_train,y_train)
LDA_best = gsLDA.best_estimator_

# Best score
gsLDA.best_score_


Fitting 30 folds for each of 9 candidates, totalling 270 fits


84 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\emann\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\emann\anaconda3\lib\site-packages\sklearn\discriminant_analysis.py", line 599, in fit
    self._solve_eigen(
  File "c:\Users\emann\anaconda3\lib\site-packages\sklearn\discriminant_analysis.py", line 440, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "c:\Users\emann\anaconda3\lib\site-packages\scipy\linalg\_decomp.py", line 594, in eigh
    raise LinAlgError('The leading minor of

0.7989698566291807

In [16]:
# Define pipeline
pipe_QDA = Pipeline(
    [("power", PowerTransformer()), 
     ("qda", QuadraticDiscriminantAnalysis())])

In [17]:
# Fit pipeline
pipe_QDA.fit(X_train, y_train) # Fit it to the training data 

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('power', PowerTransformer()),
                ('qda', QuadraticDiscriminantAnalysis())])

In [18]:
# Add noise before fitting again
def add_noise(X, scale):
    """
    DataFrame double -> ndarray
    produce DataFrame adding Gaussian with standard deviation = `scale` to each column of `X`.
    """
    # Transform to ndarray
    X_arr = X.to_numpy()

    # Get shape
    n, d = X_arr.shape

    # Add Gaussian noise
    X_arr = X_arr + np.random.normal(scale = scale, size = (n, d))

    # Back to DataFrame
    df = pd.DataFrame(X_arr, columns = ["X" + str(i) for i in range(1, d + 1)])

    # Return df
    return df

X_train_noise = add_noise(X_train, scale = 0.1) # Add random noise to the input and controls the amount of the noise added
pipe_QDA.fit(X_train_noise, y_train) # Fit it to the training data 

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('power', PowerTransformer()),
                ('qda', QuadraticDiscriminantAnalysis())])

_____________________________________________________________________________________________________________________________________________________________________

https://www.datasklr.com/select-classification-methods/linear-and-quadratic-discriminant-analysis

In [93]:
import sklearn
from sklearn.preprocessing import PolynomialFeatures

# Create interaction terms (interaction of each regressor pair + polynomial)
#Interaction terms need to be created in both the test and train datasets
interaction2 = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False, order='C') #second degree
interaction3 = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False, order='C') #third degree

# Training
X_train_2 = pd.DataFrame(interaction2.fit_transform(X_train), columns=interaction2.get_feature_names(input_features=X_train.columns))
X_train_3 = pd.DataFrame(interaction3.fit_transform(X_train), columns=interaction3.get_feature_names(input_features=X_train.columns))
X_train_2.head()

# Validation
X_valid_2 = pd.DataFrame(interaction2.fit_transform(X_valid), columns=interaction2.get_feature_names(input_features=X_valid.columns))
X_valid_3 = pd.DataFrame(interaction3.fit_transform(X_valid), columns=interaction3.get_feature_names(input_features=X_valid.columns))
X_valid_2.head()

# Test
X_test_2 = pd.DataFrame(interaction2.fit_transform(X_test), columns=interaction2.get_feature_names(input_features=X_test.columns))
X_test_3 = pd.DataFrame(interaction3.fit_transform(X_test), columns=interaction3.get_feature_names(input_features=X_test.columns))
X_test_2.head()




Unnamed: 0,age,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,...,job_freelance,job_housekeeper,job_industrial_worker,job_manager,job_retired,job_salesman,job_student,job_teacher,job_technology,job_unemployed
0,1.405573,-1.759765,-0.273806,-1.154049,0.726880,-1.068475,0.335738,-0.565174,-0.565178,-0.409805,...,-0.198702,6.006407,-0.485944,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792
1,0.845879,1.258243,-0.273806,-1.293811,0.726880,0.916961,-1.081376,-0.565174,-0.565178,-0.409805,...,-0.198702,-0.166489,-0.485944,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,2.282566,-0.177792
2,0.397833,1.258243,-0.273806,0.194014,-0.042649,0.778539,-1.081376,-0.565174,-0.565178,-0.409805,...,-0.198702,6.006407,-0.485944,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792
3,-0.907460,-0.318701,-0.273806,1.210360,-0.446910,-0.326755,1.352297,-0.565174,-0.565178,-0.409805,...,-0.198702,-0.166489,2.057852,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792
4,0.776010,-0.318701,-0.273806,-1.293811,-0.042649,-2.338993,0.882775,-0.565174,-0.565178,-0.409805,...,-0.198702,-0.166489,-0.485944,-0.545999,-0.270424,-0.285418,-0.173447,2.714005,-0.438103,-0.177792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6261,0.231171,-1.759765,3.652215,-0.393528,-0.446910,-2.178299,-1.081376,-0.565174,-0.565178,-0.409805,...,-0.198702,-0.166489,2.057852,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792
6262,-0.038550,-1.759765,-0.273806,-0.274347,-0.446910,2.473490,1.352297,-0.565174,-0.565178,2.440183,...,-0.198702,-0.166489,2.057852,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792
6263,0.555253,-1.759765,-0.273806,0.309596,-0.042649,-0.639832,0.882775,-0.565174,-0.565178,-0.409805,...,-0.198702,-0.166489,2.057852,-0.545999,-0.270424,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792
6264,1.461172,-0.318701,-0.273806,0.309596,0.726880,-0.963067,1.560307,-0.565174,-0.565178,2.440183,...,-0.198702,-0.166489,-0.485944,-0.545999,3.697901,-0.285418,-0.173447,-0.368459,-0.438103,-0.177792


In [94]:
#############################
## Normalize all datasets 
#############################

power = PowerTransformer()


# Standardize the training sets: 1st, 2nd and 3rd order polynomials
X_train=pd.DataFrame(power.fit_transform(X_train), columns=X_train.columns)
X_train_2=pd.DataFrame(power.fit_transform(X_train_2), columns=X_train_2.columns)
X_train_3=pd.DataFrame(power.fit_transform(X_train_3), columns=X_train_3.columns)

# Standardize the validation sets: 1st, 2nd and 3rd order polynomials
X_valid=pd.DataFrame(power.fit_transform(X_valid), columns=X_valid.columns)
X_valid_2=pd.DataFrame(power.fit_transform(X_valid_2), columns=X_valid_2.columns)
X_valid_3=pd.DataFrame(power.fit_transform(X_valid_3), columns=X_valid_3.columns)

# Standardize the test sets: 1st, 2nd and 3rd order polynomials
X_test=pd.DataFrame(power.fit_transform(X_test), columns=X_test.columns)
X_test_2=pd.DataFrame(power.fit_transform(X_test_2), columns=X_test_2.columns)
X_test_3=pd.DataFrame(power.fit_transform(X_test_3), columns=X_test_3.columns)


  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


In [105]:
################################
## Deal with multicollinearity
################################

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

#1st order polynomial ######################
x_temp_train1 = sm.add_constant(X_train)
vif_train1 = pd.DataFrame()
vif_train1["VIF Factor"] = [variance_inflation_factor(x_temp_train1.values, i) for i in range(x_temp_train1.values.shape[1])]
vif_train1["features"] = x_temp_train1.columns
pd.set_option('display.max_rows', 300)
print(vif_train1.round(1))

# Identify all variables wit VIF less then 5 and keep
vif_train1_a=vif_train1[vif_train1["VIF Factor"]<5.0]  # print(vif2.round(1))

feat_list=vif_train1_a["features"].tolist()  #save desired features to list
feat_list.remove(feat_list[0])
print(feat_list)

X_train=X_train[feat_list] #keep features on feature list only, drop all other features for train
X_valid=X_valid[feat_list] #keep features on feature list only, drop all other features for valid

    VIF Factor      features
0          1.0         const
1          1.1           age
2          1.1     education
3          1.0        device
4          1.0           day
5          1.0         month
6          1.0    time_spent
7          1.0  banner_views
8          1.0            X1
9          1.0            X2
10         1.1            X3
11         1.1            X4


In [103]:
#2nd order polynomial ####################
x_temp_train2 = sm.add_constant(X_train_2)
vif_train2 = pd.DataFrame()
vif_train2["VIF Factor"] = [variance_inflation_factor(x_temp_train2.values, i) for i in range(x_temp_train2.values.shape[1])]
vif_train2["features"] = x_temp_train2.columns
pd.set_option('display.max_rows', 300)
#print(vif_train1.round(1))

vif_train2_a=vif_train2[vif_train2["VIF Factor"]<5.0]
#print(vif2.round(1))

feat_list2=vif_train2_a["features"].tolist()
feat_list2.remove(feat_list2[0])
print(feat_list2)

X_train_2=X_train_2[feat_list2] #keep features on feature list only, drop all other features for train
X_valid_2=X_valid_2[feat_list2] #keep features on feature list only, drop all other features for valid
X_valid_2

  vif = 1. / (1. - r_squared_i)


['day', 'month', 'time_spent', 'banner_views', 'X4', 'age^2', 'age education', 'age device', 'age day', 'age month', 'age time_spent', 'age banner_views', 'age X1', 'age X2', 'age X3', 'age X4', 'age job_entrepreneur', 'age job_housekeeper', 'age job_unemployed', 'education device', 'education day', 'education month', 'education time_spent', 'education banner_views', 'education X1', 'education X2', 'education X3', 'education X4', 'education job_entrepreneur', 'education job_freelance', 'education job_housekeeper', 'education job_salesman', 'education job_student', 'education job_teacher', 'education job_unemployed', 'device day', 'device month', 'device time_spent', 'device banner_views', 'device X1', 'device X2', 'device X3', 'device X4', 'day month', 'day time_spent', 'day banner_views', 'day X1', 'day X2', 'day X3', 'day X4', 'month^2', 'month time_spent', 'month banner_views', 'month X1', 'month X2', 'month X3', 'month X4', 'time_spent^2', 'time_spent banner_views', 'time_spent X1'

Unnamed: 0,day,month,time_spent,banner_views,X4,age^2,age education,age device,age day,age month,...,X1 X4,X2 X3,X2 X4,X3 X4,X4^2,X4 job_entrepreneur,X4 job_freelance,X4 job_housekeeper,X4 job_student,X4 job_unemployed
0,1.284243,0.718975,-1.004293,1.572205,0.924993,6.800116e-16,1.657017,-0.266407,1.649631,1.164174,...,2.682420,-0.082138,-0.102636,-0.974720,0.819998,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
1,1.380305,-0.460421,-0.864125,1.424980,-0.586173,6.800116e-16,-0.164368,-0.266407,1.042586,-0.379500,...,-0.372962,-0.082138,-0.102636,0.864696,-0.607027,-0.179674,5.251140,-0.188325,-0.196646,-0.17521
2,-0.459908,1.814169,0.796361,0.271268,2.088788,6.800116e-16,-1.880754,-0.266407,0.537626,2.776608,...,-0.372962,-0.082138,-0.102636,-0.974720,2.323232,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
3,-0.212258,-0.460421,-0.700354,0.271268,-0.244794,6.800116e-16,-1.880754,-0.266407,-0.266227,-0.467486,...,-0.372962,-0.082138,-0.102636,0.914515,-0.326669,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
4,1.569285,0.337028,-1.391977,1.946463,-0.631736,6.800116e-16,-0.611975,-0.266407,0.424628,-0.512616,...,-0.372962,-0.082138,-0.102636,0.858466,-0.642970,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338,1.475304,-2.282042,-0.387055,-1.037391,0.396987,6.800116e-16,-0.132164,-0.266407,1.161026,-2.330341,...,2.682066,-0.082138,-0.102636,1.028076,0.261388,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
1339,-0.588697,-1.319180,-0.069579,-1.037391,1.251139,6.800116e-16,0.521628,-0.266407,0.103492,-0.440780,...,2.682578,-0.082138,-0.102636,1.257447,1.207462,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
1340,1.187066,0.718975,-0.677663,1.678829,-0.935608,6.800116e-16,1.275583,-0.266407,1.228224,0.810170,...,-0.372962,-0.082138,-0.102636,-0.974720,-0.874559,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521
1341,0.577133,-0.055479,-2.492769,0.871798,0.890611,6.800116e-16,1.076677,-0.266407,0.573011,0.066817,...,-0.372962,-0.082138,-0.102636,1.143923,0.781181,-0.179674,-0.190435,-0.188325,-0.196646,-0.17521


In [104]:
#3rd order polynomial ####################
x_temp_train3 = sm.add_constant(X_train_3)
vif_train3 = pd.DataFrame()
vif_train3["VIF Factor"] = [variance_inflation_factor(x_temp_train3.values, i) for i in range(x_temp_train3.values.shape[1])]
vif_train3["features"] = x_temp_train3.columns
pd.set_option('display.max_rows', 300)
#print(vif_train3.round(1))

vif_train3_a=vif_train3[vif_train3["VIF Factor"]<5.0]
#print(vif3.round(1))

feat_list3=vif_train3_a["features"].tolist()
feat_list3.remove(feat_list3[0])
print(feat_list3)

X_train_3=X_train_3[feat_list3] #keep features on feature list only, drop all other features for train
X_valid_3=X_valid_3[feat_list3]   #keep features on feature list only, drop all other features for valid

In [None]:
# Default LDA model without any tuning - base metric
LDA_model_default = LinearDiscriminantAnalysis()
LDA_model_default.fit(X_train, y_train)
y_pred_LDA_default =LDA_model_default.predict(X_test)