In [1]:
import numpy as np
import pandas as pd           # for reading file
import pandas_profiling as pp # statistical visualise
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [3]:
train_df = pd.read_csv('data/train.csv', index_col='Unnamed: 0', sep='\t')
test_df = pd.read_csv('data/test.csv', index_col='Unnamed: 0', sep='\t')

In [3]:
# train_df.head(1)
# test_df.head(1)

In [4]:
print ("rows \t\t: ", train_df.shape[0])
print ("columns \t: ", train_df.shape[1])

rows 		:  30500
columns 	:  346


---
## 1. Опишите препроцессинг данных, инжиниринг фич и валидацию

#### Разбиваем выборку

In [5]:
X_train = train_df.drop(['0'], axis=1)
y_train = train_df['0']

X_validation = test_df.drop(['0'], axis=1)
y_validation = test_df['0']

In [6]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 345)
	 (30500,)
Test : 
	 (4166, 345)
	 (4166,)


#### Удаляем признаки у которых уровень корреляция больше 99

In [10]:
# Удаляем признаки с корреляцией 1
def    check_corr_data(train, test, type_corr):
    corr_matrix = train.corr(method=type_corr).abs()
    
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    to_drop = []
    for column in upper.columns:
        if any(upper[column] > 0.99):
            train.drop(column, axis=1, inplace=True)
            test.drop(column, axis=1, inplace=True)
            print (column)
            return True
    return False

def     clean_corr_data(train, test, type_corr):
    while check_corr_data(train, test, type_corr):
        ;

In [11]:
clean_corr_data(X_train, X_validation, 'pearson')

329
333
334


In [12]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 103)
	 (30500,)
Test : 
	 (4166, 103)
	 (4166,)


#### Кодируем категориальные признаки one_hot_encoding

In [13]:
def one_hot_encoding(train, test):
#     test_length = test.shape()[0]
    for i in train:
        if len(train[i].unique()) < 15 and len(train[i].unique()) > 1:
            train2 = pd.concat([train, pd.get_dummies(train[i], prefix=i)], axis=1)
            train.drop([i], axis=1, inplace=True)
            train = train2
            
            test2 = pd.concat([test, pd.get_dummies(test[i], prefix=i)], axis=1)
            test.drop([i], axis=1, inplace=True)
            test = test2
#     test = test[:test_length]
    return train, test

In [14]:
X_train, X_validation = one_hot_encoding(X_train, X_validation);

In [15]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 221)
	 (30500,)
Test : 
	 (4166, 217)
	 (4166,)


#### Опять чистим коррелирующие данные

In [16]:
clean_corr_data(X_train, X_validation, 'pearson')

120_0.0
127_0.0
212_1.0
312_0.0
312_0.39424
312_0.4324508
312_0.47097
312_0.5197096
312_0.5722851
312_0.588824
312_0.6608119
312_0.7575137
312_0.8421118000000001
312_0.9310675
312_1.0
313_0.0
313_0.016691400000000002
313_0.052069500000000005
313_0.0654514
313_0.1504556
313_0.2396119
313_0.277947
313_0.3472929
313_0.461724
313_0.6188458
313_1.0
314_0.0
314_0.1178385
314_0.1699219
314_0.202474
314_0.2057292
314_0.4466146
314_0.4889323
314_0.5833333
314_0.6126302
314_1.0
315_0.015435399999999997
315_0.033313800000000005
315_0.07876389999999997
315_0.3415823
315_0.4127147
315_0.4488797000000001
315_0.5901081
315_0.6529206
315_0.8525990999999999
315_0.9929259
315_1.0


In [17]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 174)
	 (30500,)
Test : 
	 (4166, 170)
	 (4166,)


---

In [18]:
X_train.drop(list(set(X_train.columns) ^ set(X_validation.columns)), axis=1, inplace=True)

In [19]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 170)
	 (30500,)
Test : 
	 (4166, 170)
	 (4166,)


In [20]:
# pp.ProfileReport(X_test)
# pp.ProfileReport(X_train)

#### разбиваем на train и test

In [24]:
from sklearn.model_selection import train_test_split
X_train_1, X_test, y_train_1, y_test = train_test_split(X_train, 
                                                   y_train,
                                                   test_size=0.10, 
                                                   random_state=42,
                                                   stratify=y_train)

---
## 2. Постройте логистическую регрессию. Укажите значение на валидации и публичном лидерборде

In [32]:
print ("Train : ")
print ("\t", X_train_1.shape)
print ("\t", y_train_1.shape)
print ("Test : ")
print ("\t", X_test.shape)
print ("\t", y_test.shape)

Train : 
	 (27450, 170)
	 (27450,)
Test : 
	 (3050, 170)
	 (3050,)


In [6]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(random_state=42, class_weight='balanced')

params = {
    'C' : [.01, .1, .5, 1, 1.5, 2],
    'class_weight' : [None, 'balanced'],
#     'intercept_scaling' : [np.arange(0, 20, 1)],
    'penalty' : ['l2', 'l1'],
    'tol' : [0.0001, 0.001, 0.002, 0.004]
}

best_params = {
    'C': [0.5], 
    'class_weight': ['balanced'],
    'penalty': ['l1'], 
    'tol': [0.0001]
}

In [7]:
log_reg = GridSearchCV(log, best_params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
log_reg.fit(X_train_1, y_train_1)

print (log_reg.best_params_)
print (log_reg.best_score_)

NameError: name 'GridSearchCV' is not defined

In [29]:
pred = log_reg.predict(X_test)

#### Rank local 

In [30]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred)

0.6343168650729838

---

In [32]:
X_test.shape, X_validation.shape

((4270, 170), (4166, 170))

In [34]:
solution = pd.DataFrame(log_reg.predict(X_validation))
solution.to_csv('solution.csv')

    Rank site  : 0.62911393    
    Rank local : 0.63725348

---
## 3. Постройте бэггинг на логистических регрессиях. Укажите значение на валидации и публичном лидерборде

In [None]:
# 0.51

---
## 4. Постройте случайный лес. Укажите значение на валидации и публичном лидерборде

In [20]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor

In [30]:
# max_depth_values = list(range(8, 24)) + [None]
# max_features_values = range(4, 8) + [None]
max_depth_values = [None]
max_features_values = [None]
forest_params = {'max_depth': max_depth_values,
                 'max_features': max_features_values}

In [34]:
forest = RandomForestClassifier(n_estimators=100, random_state=17)
gsv = GridSearchCV(forest, forest_params, cv=5, n_jobs = -1, verbose=1, scoring='roc_auc')
gsv.fit(X_train_1, y_train_1)
print (gsv.best_params_)
# print (gsv.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.7min finished


{'max_depth': None, 'max_features': None}


In [35]:
pred = gsv.predict(X_test)

#### Rank local 

In [36]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred)

0.5594702119269996

---

In [37]:
X_test.shape, X_validation.shape

((3050, 170), (4166, 170))

In [39]:
solution = pd.DataFrame(gsv.predict(X_validation))
solution.to_csv('solution.csv')

    Rank site  : 0.54844339
    Rank local : 0.55947021

---
## 5. Подберите лучший вариант простого ансамбля. Опишите его и укажите значение на валидации и публичном лидерборде

---
## 6. Сделайте стекинг. Опишите его и укажите значение на валидации и публичном лидерборде

---
## 7. Укажите ваш ник, значение на lb и валидации, опишите коротко решение