## Steps to follow

1. Get the train/test split
2. Fit the default model; also fit using cross-validation
3. Evaluate the model
4. Change parameters and hyperparameters
5. Evaluate all models
6. Compare all models
7. Find out feature importance

### Load the libraries

In [243]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,cross_validate
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import log_loss,classification_report,confusion_matrix, roc_curve, roc_auc_score

import matplotlib.pyplot as plt

# over sampling for imbalanced classes
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline

### Load the data

In [244]:
# Open question: should normalized data be used for all models?
# Using PCA components instead of the original features is probably not a good idea.
# For logistic regression it is better to use normalized data.

df = pd.read_csv("../data/train_norm.csv")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,target
0,0.402093,-0.210106,-0.307165,-0.279443,-0.161867,-0.119331,-0.188045,-0.293664,-0.291038,-0.243606,...,0.2461,-0.42087,-0.249802,-0.413584,-0.299712,-0.176699,-0.129516,-0.386938,-0.104963,1
1,-0.253508,-0.210106,-0.307165,-0.279443,-0.161867,-0.119331,-0.188045,0.149647,-0.291038,-0.243606,...,-0.280099,-0.42087,-0.249802,-0.413584,-0.299712,-0.176699,-0.129516,-0.386938,-0.104963,1
2,-0.253508,-0.210106,-0.307165,-0.279443,-0.161867,-0.119331,-0.188045,0.149647,-0.291038,-0.243606,...,-0.280099,-0.42087,-0.249802,-0.413584,-0.299712,-0.176699,-0.129516,-0.386938,-0.104963,1
3,0.402093,-0.210106,-0.307165,0.07924,13.50871,4.524667,4.665884,-0.293664,-0.291038,0.679472,...,-0.280099,-0.047949,1.019683,-0.413584,-0.299712,-0.176699,-0.129516,-0.386938,-0.104963,1
4,-0.253508,-0.210106,-0.307165,-0.279443,-0.161867,-0.119331,-0.188045,-0.293664,-0.291038,-0.243606,...,0.2461,-0.42087,-0.249802,-0.413584,-0.299712,0.040798,-0.129516,-0.386938,-0.104963,1


### Split the data into train test

In [245]:
# The classes are imbalanced, so we use a stratified split:
# the folds are made by preserving the percentage of samples for each class.
# Note that the imbalance will still be there when we train the model using this split.

# Since we have a large amount of data, the test set can be just 10% of it
# (test_size=0.1).
sss = StratifiedShuffleSplit(n_splits=1,test_size=0.1, random_state=42)
print(sss)

StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.1,
            train_size=None)


In [246]:
# Separate the label column from the feature columns.
y = df.target
X = df.drop("target",axis=1)

# Positional row indices of the split; after the loop they hold the indices
# of the last split yielded by `sss`.
train_index, test_index = [], []
for train_index, test_index in sss.split(X,y):
    print("TRAIN:", train_index, "TEST:", test_index)

# Materialise the split; the indices are positional, hence .iloc.
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]


TRAIN: [57972 30244  9427 ..., 60232 28576 27516] TEST: [59081 21681 51999 ...,  1777   269 53901]


In [247]:
# Sanity-check the split: print the shape of each feature matrix and label vector.
print("Shapes of data sets")
print("X_train: ", X_train.shape, "y_train: ", y_train.shape)
# The label used to read "X_train: " although the value printed is X_test.shape.
print("X_test: ", X_test.shape,"y_test: ", y_test.shape)

Shapes of data sets
X_train:  (55690, 93) y_train:  (55690,)
X_train:  (6188, 93) y_test:  (6188,)


### Model evaluation function

In [248]:
def evaluate_model(m):
    """Print train/test metrics for a fitted classifier.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and prints, in this order: the train and test ``score``, the
    train and test log loss, the test confusion matrix and the test
    classification report.

    Parameters
    ----------
    m : fitted classifier
        Must provide ``score``, ``predict``, ``predict_proba`` and ``classes_``.
    """
    # Predict once and reuse for both the confusion matrix and the report.
    test_pred = m.predict(X_test)

    # The built-in round() works whether the metric comes back as a NumPy
    # scalar or as a plain Python float (which has no .round() method).
    print("Train score: ", round(m.score(X_train, y_train), 5))
    print("Test score: ", round(m.score(X_test, y_test), 5))

    # predict_proba columns follow m.classes_; passing the labels explicitly
    # keeps log_loss working even if a class is missing from the evaluated split.
    train_loss = log_loss(y_train, m.predict_proba(X_train), labels=m.classes_)
    test_loss = log_loss(y_test, m.predict_proba(X_test), labels=m.classes_)
    print("Log loss train: ", round(train_loss, 5))
    print("Log loss test: ", round(test_loss, 5))

    print("\nConfusion Matrix: \n", confusion_matrix(y_test, test_pred, labels=m.classes_))
    print("\nClassification Report: \n", classification_report(y_test, test_pred))
    

### Feature importance function

In [249]:
def feature_imp(m, top_n=5):
    """Print the features with the largest absolute coefficients of a linear model.

    For each row of ``m.coef_`` the ``top_n`` coefficients with the largest
    magnitude (after rounding to 5 decimals) are selected. The union of the
    selected column positions is printed, sorted and 1-based.

    Parameters
    ----------
    m : fitted estimator
        Must expose ``coef_`` (1-D, or 2-D with one row per class).
    top_n : int, default 5
        Number of coefficients kept per row; clamped to the number of columns.
    """
    coefs = np.asarray(m.coef_, dtype=float)
    if coefs.ndim == 1:
        # Treat a flat coefficient vector as a single row.
        coefs = coefs.reshape(1, -1)
    coefs = np.absolute(np.round(coefs, 5))

    # The number of rows comes from coef_ itself instead of a hard-coded 9,
    # so models with any number of coefficient rows are handled.
    top_n = max(0, min(int(top_n), coefs.shape[1]))

    # Stable descending sort: ties keep their original column order, matching
    # sorted(..., reverse=True) on (index, value) pairs.
    ranked = np.argsort(-coefs, axis=1, kind="mergesort")[:, :top_n]

    # +1 turns 0-based column positions into 1-based feature numbers.
    print("Most important features: \n", np.unique(ranked) + 1)


### Fit on default model

In [13]:
# Baseline classifier: every hyperparameter is left at the library default.
lr = LogisticRegression()

In [14]:
# Train the baseline on the stratified training split.
lr.fit(X=X_train,y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [218]:
# Scores, log loss, confusion matrix and classification report for the baseline.
evaluate_model(lr)

Train score:  0.75716
Test score:  0.75533
Log loss train:  0.66281
Log loss test:  0.67151

Confusion Matrix: 
 [[  56   27    0    0    0   21    2   41   46]
 [   0 1438  138    5    8    6   10    5    2]
 [   0  565  217    3    0    1   10    3    1]
 [   0  168   29   45    4   19    4    0    0]
 [   0   15    2    0  257    0    0    0    0]
 [   3   34    1    3    0 1301   23   29   20]
 [   2   55   20    1    0   30  159   14    3]
 [  10   16    2    0    0   27   10  772    9]
 [   5   26    0    1    0   15    1   19  429]]

Classification Report: 
              precision    recall  f1-score   support

          1       0.74      0.29      0.42       193
          2       0.61      0.89      0.73      1612
          3       0.53      0.27      0.36       800
          4       0.78      0.17      0.28       269
          5       0.96      0.94      0.95       274
          6       0.92      0.92      0.92      1414
          7       0.73      0.56      0.63       284
   

In [201]:
# Features with the largest absolute coefficients in the baseline model.
feature_imp(lr)

Most important features: 
 [  9.  11.  14.  15.  26.  27.  34.  39.  40.  42.  43.  45.  47.  58.  60.
  69.  73.  75.  76.  78.  83.  84.  86.  90.]


### Use cross validation

In [59]:
# LogisticRegressionCV: logistic regression with built-in cross-validation,
# here using 5 folds (cv=5).
lrcv = LogisticRegressionCV(cv=5)

In [60]:
# Fit the cross-validated model on the same training split as the baseline.
lrcv.fit(X=X_train,y=y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [184]:
# Same evaluation as for the baseline, so the two models can be compared.
evaluate_model(lrcv)

Train score:  0.75725
Test score:  0.75727
Log loss:  0.6707

Confusion Matrix: 
 [[  57   25    0    0    0   21    3   41   46]
 [   0 1442  137    4    6    6   10    5    2]
 [   0  561  221    3    0    1   10    3    1]
 [   0  168   30   44    4   19    4    0    0]
 [   0   13    2    0  259    0    0    0    0]
 [   3   35    1    3    1 1302   23   27   19]
 [   2   53   20    1    1   30  160   14    3]
 [  11   17    1    0    0   26    9  773    9]
 [   5   27    0    1    0   15    1   19  428]]

Classification Report: 
              precision    recall  f1-score   support

          1       0.73      0.30      0.42       193
          2       0.62      0.89      0.73      1612
          3       0.54      0.28      0.36       800
          4       0.79      0.16      0.27       269
          5       0.96      0.95      0.95       274
          6       0.92      0.92      0.92      1414
          7       0.73      0.56      0.63       284
          8       0.88      0.91  

In [202]:
# Features with the largest absolute coefficients in the cross-validated model.
feature_imp(lrcv)

Most important features: 
 [  9.  11.  14.  15.  26.  27.  36.  39.  40.  42.  43.  45.  47.  58.  59.
  60.  68.  69.  73.  75.  76.  83.  84.  86.  90.]


### Use cross validation library

In [216]:
# 3-fold cross-validation of the baseline, scored by negative log loss.
# return_train_score=True is required for the 'train_score' entry to appear
# in the result on current scikit-learn (it is off by default).
cross_validate(estimator=lr, X=X_train, y=y_train, cv=3,
               scoring="neg_log_loss", return_train_score=True)

{'fit_time': array([ 15.76905894,  15.40101194,  14.45040584]),
 'score_time': array([ 0.01995587,  0.0199542 ,  0.01813412]),
 'test_score': array([-0.68030046, -0.67570131, -0.6716733 ]),
 'train_score': array([-0.65810914, -0.66169568, -0.66303738])}

### Naive random over sampling

In [250]:
# Random over-sampler; the seed is fixed so the resampling is repeatable.
rs = RandomOverSampler(random_state=42)

In [254]:
# Over-sample the training split only; the test split is left untouched.
# fit_resample replaces fit_sample, which was deprecated and later removed
# from imbalanced-learn.
X_os, y_os = rs.fit_resample(X_train,y_train)

In [255]:
# Check the size of the over-sampled training set.
X_os.shape, y_os.shape

((130590, 93), (130590,))

In [256]:
# NOTE: this rebinds `lr`, so the baseline model fitted earlier is no longer
# reachable under that name.
lr = LogisticRegression()

In [257]:
# Train on the over-sampled data.
lr.fit(X=X_os,y=y_os)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [258]:
# NOTE: evaluate_model reads the global X_train/y_train, so the "train" metrics
# below are computed on the original training split, not on X_os/y_os.
evaluate_model(lr)

Train score:  0.71501
Test score:  0.71025
Log loss train:  0.78172
Log loss test:  0.79707

Confusion Matrix: 
 [[ 140    2    2    2    2    3    7   14   21]
 [  23  909  329  291   17    3   32    3    5]
 [   2  205  371  175    2    1   41    0    3]
 [   0   38   28  181    5    7   10    0    0]
 [   1    3    2    0  268    0    0    0    0]
 [  58    7    4   22    2 1215   46   21   39]
 [  19    7   16   17    0    3  215    4    3]
 [  81    3    4    0    0   12   19  714   13]
 [  79    5    1    5    0   10    3   11  382]]

Classification Report: 
              precision    recall  f1-score   support

          1       0.35      0.73      0.47       193
          2       0.77      0.56      0.65      1612
          3       0.49      0.46      0.48       800
          4       0.26      0.67      0.38       269
          5       0.91      0.98      0.94       274
          6       0.97      0.86      0.91      1414
          7       0.58      0.76      0.65       284
   

In [259]:
# Features with the largest absolute coefficients in the model trained on over-sampled data.
feature_imp(lr)

Most important features: 
 [ 11.  14.  15.  19.  26.  34.  39.  43.  45.  47.  58.  59.  60.  69.  73.
  75.  76.  78.  83.  84.  86.  90.  91.  92.]


The naive random over-sampling above gave worse results than training on the original data.