# Classification. Linear models and KNN

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     cross_val_score, cross_validate,
                                     train_test_split)
from sklearn.metrics import plot_confusion_matrix, accuracy_score, recall_score, make_scorer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [27]:
# Code for calculating Normalized gini coefficient
# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
def gini(actual, pred, cmpcol = 0, sortcol = 1):  
    assert(len(actual) == len(pred))  
    epsilon = 1e-7
    values = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)  
    values = values[np.lexsort((values[:, 2], -1 * values[:, 1]))]
    total = values[:, 0].sum()
    gini_sum = (values[:, 0].cumsum().sum() + epsilon) / (total + epsilon)  
    gini_sum -= (len(actual) + 1) / 2  
    return gini_sum / len(actual)  
  
def gini_normalized(a, p):  
    '''Function to calculate the normalized gini coefficient'''
    return gini(a, p) / gini(a, a)

In [53]:
gini_score = make_scorer(gini_normalized, greater_is_better=True, needs_proba=True)

In [None]:
PATH = "./data/"

In [111]:
train_data = pd.read_csv(os.path.join(PATH, 'porto', 'train.csv')).set_index('id')
test_data = pd.read_csv(os.path.join(PATH, 'porto', 'test.csv')).set_index('id')

dataset_original = train_data.append(test_data)

In [118]:
#rebuild from here to save your time

ntrain = 595212
dataset = dataset_original
train_data = dataset[:ntrain]
test_data = dataset[ntrain:]

In [119]:
print('Missing Values in %:')
(train_data == -1).sum().sort_values(ascending = False)[:5]/train_data.shape[0]

Missing Values in %:


ps_car_03_cat    0.690898
ps_car_05_cat    0.447825
ps_reg_03        0.181065
ps_car_14        0.071605
ps_car_07_cat    0.019302
dtype: float64

In [120]:
# drop top3 missing values-columns
dataset = dataset.drop(columns = ['ps_car_03_cat', 'ps_car_05_cat', 'ps_reg_03'])

for column in dataset:
    dataset[column]=dataset[column].replace(-1,dataset[column].median())
    
train_data = dataset[:ntrain]
test_data = dataset[ntrain:]

In [121]:
train_data.groupby('target').size()

target
0.0    573518
1.0     21694
dtype: int64

In [117]:
#let the number of 0-class samples be comparable to the numbers of 1-class samples.
# NOT EFFECTIVE

train_data = train_data[train_data['target'] == 1].append(
                  train_data[train_data['target'] == 0].sample(n=30000, random_state=1))

dataset = train_data.append(test_data)
ntrain = train_data.shape[0]

In [65]:
train_data.groupby('target').size()

target
0.0    30000
1.0    21694
dtype: int64

In [128]:
test_indexes = dataset[ntrain:].index

In [101]:
columns = dataset.drop(columns=['target']).columns

In [102]:
numeric_features = [column for column in columns if not (column.endswith("cat") or column.endswith("bin"))]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_features = [column for column in columns if (column.endswith("cat") or column.endswith("bin"))]
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [131]:
dataset = preprocessor.fit_transform(dataset)

In [132]:
X = dataset[:ntrain]
y = train_data['target']

test_X = dataset[ntrain:]

In [37]:
#lets find out, whether some of the features gives high gini by RandForest

res = []
c = 0
for i in range(X.shape[1]):
    xtrain = X[:,i]
    v = 0
    
    for j in range(2):
        X_train,X_test,y_train,y_test = train_test_split(xtrain,y,test_size=0.3,stratify = y)
        rfr = RandomForestRegressor(n_estimators=100,random_state=1342, n_jobs=8)
        rfr.fit(X=X_train,y=y_train)        
        pred = rfr.predict(X_test )
        v+= gini_normalized(y_test, pred)
    res.append((v,i))

    c+=1
    if c % 10 ==0:
        print(c)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230


In [38]:
res.sort(reverse=True)
best = []
for i in range(100):
    best.append(res[i])
new_columns = [i[1] for i in best]

In [41]:
#best.sort(reverse=True)

#print([i[0]  for i in best])
#print([i[1] for i in best])

#[8, 74, 57, 9, 56, 31, 107, 5, 103, 102]
#[7, 9, 5, 1, 39, 10, 38, 4, 41, 40, 55, 54, 72, 73, 57]

[7, 9, 5, 1, 39, 10, 38, 4, 41, 40, 55, 54, 72, 73, 57, 3, 74, 67, 56, 31, 0, 71, 6, 104, 217, 105, 106, 69, 42, 43, 107, 95, 30, 29, 37, 177, 83, 76, 102, 101, 134, 66, 204, 25, 99, 103, 78, 211, 85, 82, 58, 32, 65, 93, 68, 226, 35, 199, 84, 75, 45, 19, 2, 212, 44, 94, 201, 100, 59, 26, 183, 51, 91, 152, 151, 210, 8, 216, 11, 205, 213, 17, 161, 96, 89, 156, 124, 49, 92, 70, 202, 158, 113, 143, 115, 179, 178, 145, 141, 157]


In [133]:
X_new = np.take(X.toarray(), new_columns, axis = 1)

In [156]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [143]:
tuned_parameters = {'n_estimators': np.arange(100, 1000, 10), 
                     'criterion': ['gini', 'entropy'],
                     'max_depth': np.arange(2, 50, 2),
                     'max_features': np.arange(2, 200, 5)}

In [None]:
rfc = RandomForestClassifier(n_jobs=-1)

rs = RandomizedSearchCV(
        rfc, tuned_parameters, n_iter=30, scoring=gini_score, n_jobs=-1, cv=3, random_state=42, verbose=10)
rs.fit(X_train, y_train)

In [None]:
#print(rs.best_params_)
#print(rs.best_score_)

In [157]:
# lets try Classifier
rfc = RandomForestClassifier(
    n_estimators = 900, max_depth = 15, criterion = 'entropy', max_features = 'sqrt', n_jobs=4)
rfc.fit(X=X_train,y=y_train)

pred_proba = rfc.predict_proba(X_test)[:, 1]
pred_bin = rfc.predict(X_test)

print('Accuracy for RFC:',accuracy_score(y_test,pred_bin))
print('Gini for RFC:',gini_normalized(y_test, pred_proba))

Accuracy for RFC: 0.9635200824354293
Gini for RFC: 0.2726819279075956


In [158]:
# PREPARING DATA FOR FINAL ESTIMATION
#X_test = np.take(dataset[ntrain:].toarray(), new_columns, axis = 1)
X_test = dataset[ntrain:].toarray()
pred_proba = rfc.predict_proba(X_test)[:, 1]

In [160]:
# SUBMITTION
submission = pd.DataFrame({
        "id": test_indexes,
        "target": pred_proba
    })
submission.to_csv(os.path.join(PATH, 'porto', 'submission2.csv'), index=False)

In [161]:
pd.read_csv(os.path.join(PATH, 'porto', 'submission2.csv')).set_index('id')

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0.028446
1,0.027333
2,0.029578
3,0.019194
4,0.035664
...,...
1488022,0.074033
1488023,0.043686
1488024,0.040679
1488025,0.025179
