In [1]:
import math
import operator

import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import datetime

In [2]:
def one_hot_encode(df,col):
    one_hot = pd.get_dummies(df[col])
    one_hot.columns = [(col+'_'+str(s)) for s in one_hot.columns]
    df = df.drop(col,axis = 1)
    df = df.join(one_hot)
    return df

In [3]:

#proba для месяца
def get_test_proba(df, churn_month, path):
    last_month = lambda x: pd.datetime(x.year, x.month, 1)

    df_proba = df.groupby('id')[['date','first_prch']].agg(['max','min']).copy()

    # колонка с максимальным месяцем
    df_proba['date']['max'] = df_proba['date']['max'].apply(last_month)
    df_proba['date']['min'] = df_proba['date']['min'].apply(last_month)


    # откидываем людей, которые откинулись до искомого месяца
    df_proba = df_proba[df_proba['date']['max'] >= pd.datetime(2017, churn_month-1, 1)]
    df_proba = df_proba[df_proba['date']['min'] <= pd.datetime(2017, churn_month-1, 1)]

    df_proba['proba'] = df_proba['date']['max'].apply(lambda x: 1 if x == pd.datetime(2017, churn_month-1, 1) else 0)
    df_proba['age'] = df_proba['date']['max'] - df_proba['first_prch']['max']
    df_proba['age'] = df_proba['age'].apply(lambda x: int(x.days/30))

    #df_proba = df_proba.drop(['date']['max','min'],axis = 1)
    print(df_proba.proba.value_counts())
    #df_proba = df_proba.drop([['date']['max']])
    df_proba.to_csv(path)
    return df_proba[['proba','age']]




In [4]:
def parse_date(date_to_parse):
    date = date_to_parse.split('-')
    return datetime.datetime(int(date[0]),int(date[1]),int(date[2]))


In [47]:

def score(classifier, X, y):
    print(cross_val_score(classifier, X, y, scoring='roc_auc').mean())



In [5]:
import datetime

def prepare_mega_data_set(sorce='train_data',month=10,suffix='base'):
    #грузим датасет
    df = pd.read_csv(sorce+'.csv',parse_dates=['first_prch'])

    df.date = df.date.apply(parse_date)

    df = df.fillna(0)
    df = df.drop('Unnamed: 0',axis=1)
    df = df.drop('n_tr',axis=1)

    df_proba = get_test_proba(df,month,sorce+'_proba_'+str(month)+suffix+'.csv')
    df = df[df['date'] < datetime.datetime(2017,month,1)]
    
    data = df

    data['datetime'] = data['date']
    data['year_month'] = data['datetime'].apply(lambda a: a.year*100+a.month)

    data_grups = data.groupby(['id','date']).sum()
    data_grups.reset_index(inplace=True)

    data_grups['datetime'] = data_grups['date']
    data_grups['month'] = data_grups['datetime'].apply(lambda a: a.month)
    data_grups['year_month'] = data_grups['datetime'].apply(lambda a: a.year*100+a.month)

    data_grups['q_yes'] = data_grups['q'].apply(lambda a: 1 if a > 0 else 0)

    data_by_cols = []

    data_by_month_visit_count = data_grups.groupby(['id','year_month'])['datetime'].count()
    data_by_month_q_count = data_grups.groupby(['id','year_month'])['q_yes'].sum()

    data_by_cols.append(data_by_month_visit_count.to_frame())
    data_by_cols.append(data_by_month_q_count.to_frame())

    data_by_month_avg = data_grups.groupby(['id','year_month'])['v_l','sum_b','percent'].mean()
    data_by_cols.append(data_by_month_avg)

    cout_cols = ['code_azs','location','region','code1','type']
    for col in cout_cols:
        data_by_tmp = data_grups.groupby(['id','year_month','code_azs']).count()
        data_by_tmp.reset_index(inplace=True)
        data_by_tmp = data_grups.groupby(['id','year_month']).count()
        data_by_tmp = data_by_tmp['q']
        data_by_cols.append(data_by_tmp.to_frame())

    dataset = data_by_cols[0].copy()
    dataset = dataset.unstack()
    index = dataset.index.get_level_values(0).tolist()
    dataset = dataset.reset_index(drop=True)
    dataset.index = index

    for i in range(len(data_by_cols)-1):
        df = data_by_cols[i].copy()
        df = df.unstack()
        index = df.index.get_level_values(0).tolist()
        df = df.reset_index(drop=True)
        df.index = index
        suf = '_'+str(i)
        dataset = dataset.join(df,rsuffix=suf)

    proba = pd.read_csv(sorce+'_proba_'+str(month)+'_stack.csv')
    proba = proba[2:]
    proba.index = proba['Unnamed: 0']
    y = proba['proba']
    proba = proba.drop(['proba','Unnamed: 0','date','date.1','first_prch','first_prch.1'],axis=1)
    proba.head(5)

    mega_set = dataset.join(proba)

    mega_set = (mega_set - mega_set.mean()) / (mega_set.max() - mega_set.min())

    mega_set = mega_set.fillna(0)

    mega_set = mega_set.join(y)

    mega_set.to_csv(sorce+'_fr_'+str(month)+suffix+'.csv')
    print(mega_set.head())
    return mega_set


In [6]:
# prepare_mega_data_set(sorce='train_data_100000', month=10, suffix='wide')
train_data_10 = prepare_mega_data_set(sorce='train_data', month=10, suffix='wide')
# train_data_12 = prepare_mega_data_set(sorce='train_data', month=12, suffix='wide')
# # #
# test_data_10 = prepare_mega_data_set(sorce='test_data', month=10, suffix='wide')
# test_data_10 = prepare_mega_data_set(sorce='test_data', month=12, suffix='wide')

  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


0    58750
1      852
Name: proba, dtype: int64




              (datetime, 201601)  (datetime, 201602)  (datetime, 201603)  \
21-186G-1161            0.000000            0.000000           -0.062842   
21-186G-1172            0.000000            0.000000            0.000000   
21-186G-1190           -0.064999           -0.045227            0.000000   
21-186G-1193            0.015001           -0.009513           -0.025805   
21-186G-12              0.000000           -0.045227            0.011232   

              (datetime, 201604)  (datetime, 201605)  (datetime, 201606)  \
21-186G-1161           -0.075170            0.000000            0.000000   
21-186G-1172            0.000000            0.000000            0.000000   
21-186G-1190            0.000000            0.000000           -0.116307   
21-186G-1193           -0.075170           -0.117652           -0.116307   
21-186G-12             -0.038133            0.025205           -0.116307   

              (datetime, 201607)  (datetime, 201608)  (datetime, 201609)  \
21-186G-11

In [36]:
# def get_stack():
train_10 = pd.read_csv('train_data_fr_10wide.csv', index_col='Unnamed: 0')
train_12 = pd.read_csv('train_data_fr_12wide.csv', index_col='Unnamed: 0')
test_10 = pd.read_csv('test_data_fr_10wide.csv', index_col='Unnamed: 0')
test_12 = pd.read_csv('test_data_fr_12wide.csv', index_col='Unnamed: 0')
#     return (train_10, train_12, test_10, test_12)




In [128]:
import numpy as np

def base_rf(train_10, train_12, test_10, test_12, submition_name, rf_10, rf_12):
    train_cols = train_10.columns.tolist()

    churn_col = train_cols[len(train_cols) - 1]
    
    train_10 = train_10[np.isfinite(train_10['proba'])]
    train_12 = train_12[np.isfinite(train_12['proba'])]
    
    train_10 = train_10[~train_10.index.duplicated(keep='first')]
    train_12 = train_12[~train_12.index.duplicated(keep='first')]
    
#     idxes = np.intersect1d(train_10.index.unique(), train_12.index.unique())
#     train_12 = train_12[~np.setdiff1d(train_12.index.unique(), idxes)]
    
    test_10 = test_10[test_10[churn_col] == 1]

    X_train_10, y_train_10 = get_X_y(train_10)
    #X_train_10_balanced, y_train_10_balanced = sm.fit_sample(X_train_10, y_train_10)

    X_train_12, y_train_12 = get_X_y(train_12)
    #X_train_12_balanced, y_train_12_balanced = sm.fit_sample(X_train_12, y_train_12)

    X_test_10, y_test_10 = get_X_y(test_10)
    X_test_12, y_test_12 = get_X_y(test_12)

    
    rf_10.fit(X_train_10, y_train_10)
    rf_12.fit(X_train_12, y_train_12)
    
    score(rf_10,X_train_10, y_train_10)
    score(rf_12, X_train_12, y_train_12)

    predict_proba_10 = rf_10.predict_proba(X_test_10)[:, 1]
    predict_proba_12 = rf_12.predict_proba(X_test_12)[:, 1]

    preds_10 = pd.Series(data=predict_proba_10, index=y_test_10.index)
    preds_12 = pd.Series(data=predict_proba_12, index=y_test_12.index)


    preds = preds_10.append(preds_12)
    preds = preds[~preds.index.duplicated(keep='last')]
    print(preds.head())

    preds.to_csv(submition_name + '_wide.csv')

    return rf_10, rf_12, preds

In [56]:
def get_X_y(train_10):
    train_cols = train_10.columns.tolist()
    churn_col = train_cols[len(train_cols) - 1]

    y_train_10 = train_10[churn_col]
    X_train_10 = train_10.drop(churn_col, axis=1)
    print(y_train_10.value_counts())

    return (X_train_10, y_train_10)

In [66]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 7, shuffle = True)

# rf_10 = CalibratedClassifierCV(RandomForestClassifier(n_jobs=-1), method='isotonic', cv=skf)
# rf_12 = CalibratedClassifierCV(RandomForestClassifier(n_jobs=-1), method='isotonic', cv=skf)

rf_10 = RandomForestClassifier(n_jobs=-1,n_estimators=200)
rf_12 = RandomForestClassifier(n_jobs=-1,n_estimators=200)


In [67]:
rf_10, rf_12, rf_result = base_rf(train_10, train_12, test_10, test_12, 'submition_stack_rf_cluser',rf_10,rf_12)

0.0    58750
1.0      852
Name: proba, dtype: int64
0.0    49729
1.0     7177
Name: proba, dtype: int64
1.0    15815
Name: proba, dtype: int64
1.0    15307
Name: proba, dtype: int64
0.9486571574163728
0.8749280657742466
21-186G-1225    0.015
21-186G-1232    0.050
21-186G-1258    0.030
21-186G-1287    0.140
21-186G-1344    0.075
dtype: float64


In [62]:
rf_result.head()

21-186G-1225    0.010039
21-186G-1232    0.053786
21-186G-1258    0.023136
21-186G-1287    0.130010
21-186G-1344    0.024369
dtype: float64

In [69]:
    train_cols = train_10.columns.tolist()

    churn_col = train_cols[len(train_cols) - 1]
    
    train_10 = train_10[np.isfinite(train_10['proba'])]
    train_12 = train_12[np.isfinite(train_12['proba'])]

    test_10 = test_10[test_10[churn_col] == 1]

    X_train_10, y_train_10 = get_X_y(train_10)
    #X_train_10_balanced, y_train_10_balanced = sm.fit_sample(X_train_10, y_train_10)

    X_train_12, y_train_12 = get_X_y(train_12)
    #X_train_12_balanced, y_train_12_balanced = sm.fit_sample(X_train_12, y_train_12)

0.0    58750
1.0      852
Name: proba, dtype: int64
0.0    49729
1.0     7177
Name: proba, dtype: int64


In [84]:
# Get numerical feature importances
importances_10 = list(rf_10.feature_importances_)

# List of tuples with variable and importance
feature_importances_10 = [(feature, round(importance_10, 2)) for feature, importance_10 in zip(X_train_10.columns, importances_10)]

# Sort the feature importances by most important first
feature_importances_10 = sorted(feature_importances_10, key = lambda x: x[1], reverse = True)

feature_importances_10 = list(feature_importances_10[:50])
feature_importances_10 = [i[0] for i in feature_importances_10]
feature_importances_10

['age',
 "('v_l', 201709)",
 "('sum_b', 201709)",
 "('datetime', 201708)",
 "('datetime', 201709)",
 "('datetime_0', 201709)",
 "('v_l', 201608)",
 "('v_l', 201609)",
 "('v_l', 201610)",
 "('v_l', 201611)",
 "('v_l', 201612)",
 "('v_l', 201701)",
 "('v_l', 201702)",
 "('v_l', 201703)",
 "('v_l', 201704)",
 "('v_l', 201705)",
 "('v_l', 201706)",
 "('v_l', 201707)",
 "('v_l', 201708)",
 "('sum_b', 201605)",
 "('sum_b', 201607)",
 "('sum_b', 201608)",
 "('sum_b', 201609)",
 "('sum_b', 201610)",
 "('sum_b', 201611)",
 "('sum_b', 201612)",
 "('sum_b', 201701)",
 "('sum_b', 201702)",
 "('sum_b', 201703)",
 "('sum_b', 201704)",
 "('sum_b', 201705)",
 "('sum_b', 201706)",
 "('sum_b', 201707)",
 "('sum_b', 201708)",
 "('percent', 201605)",
 "('percent', 201606)",
 "('percent', 201607)",
 "('percent', 201701)",
 "('percent', 201702)",
 "('percent', 201703)",
 "('percent', 201705)",
 "('percent', 201707)",
 "('percent', 201708)",
 "('percent', 201709)",
 "('q', 201709)",
 "('q_4', 201709)",
 "('q

In [85]:
# Get numerical feature importances
importances_12 = list(rf_12.feature_importances_)

# List of tuples with variable and importance
feature_importances_12 = [(feature, round(importance_12, 2)) for feature, importance_12 in zip(X_train_12.columns, importances_12)]

# Sort the feature importances by most important first
feature_importances_12 = sorted(feature_importances_12, key = lambda x: x[1], reverse = True)

feature_importances_12 = list(feature_importances_12[:50])
feature_importances_12 = [i[0] for i in feature_importances_12]
feature_importances_12

['age',
 "('datetime', 201711)",
 "('datetime_0', 201711)",
 "('q', 201711)",
 "('q_5', 201711)",
 "('q_6', 201711)",
 "('datetime', 201709)",
 "('datetime', 201710)",
 "('datetime_0', 201709)",
 "('datetime_0', 201710)",
 "('v_l', 201610)",
 "('v_l', 201612)",
 "('v_l', 201701)",
 "('v_l', 201702)",
 "('v_l', 201703)",
 "('v_l', 201704)",
 "('v_l', 201705)",
 "('v_l', 201706)",
 "('v_l', 201707)",
 "('v_l', 201708)",
 "('v_l', 201709)",
 "('v_l', 201710)",
 "('v_l', 201711)",
 "('sum_b', 201610)",
 "('sum_b', 201611)",
 "('sum_b', 201612)",
 "('sum_b', 201701)",
 "('sum_b', 201702)",
 "('sum_b', 201703)",
 "('sum_b', 201704)",
 "('sum_b', 201705)",
 "('sum_b', 201706)",
 "('sum_b', 201707)",
 "('sum_b', 201708)",
 "('sum_b', 201709)",
 "('sum_b', 201710)",
 "('sum_b', 201711)",
 "('percent', 201711)",
 "('q', 201709)",
 "('q', 201710)",
 "('q_4', 201709)",
 "('q_4', 201710)",
 "('q_4', 201711)",
 "('q_5', 201709)",
 "('q_5', 201710)",
 "('q_6', 201709)",
 "('q_6', 201710)",
 "('dateti

In [129]:
rf_10 = CalibratedClassifierCV(RandomForestClassifier(n_jobs=-1), method='isotonic', cv=skf)
rf_12 = CalibratedClassifierCV(RandomForestClassifier(n_jobs=-1), method='isotonic', cv=skf)

train_10 = train_10[feature_importances_10+['proba']]
train_12 = train_12[feature_importances_12+['proba']]
test_10 = test_10[feature_importances_10+['proba']]
test_12 = test_12[feature_importances_12+['proba']]

rf_10, rf_12, rf_result = base_rf(train_10, train_12, test_10, test_12, 'submition_stack_rf_cluser',rf_10,rf_12)

0.0    58750
1.0      852
Name: proba, dtype: int64
0.0    49729
1.0     7177
Name: proba, dtype: int64
1.0    15815
Name: proba, dtype: int64
1.0    15307
Name: proba, dtype: int64
0.944179402806657
0.8827602774346787
21-186G-1142    0.536202
21-186G-1225    0.028298
21-186G-1227    0.294082
21-186G-1232    0.020100
21-186G-1258    0.020100
dtype: float64


In [103]:
rf_result

21-186G-1225    0.009688
21-186G-1232    0.008956
21-186G-1258    0.002937
21-186G-1287    0.114615
21-186G-1344    0.009060
21-186G-1368    0.002937
21-186G-1455    0.002937
21-186G-15      0.002937
21-186G-1524    0.002937
21-186G-1547    0.002937
21-186G-1574    0.061428
21-186G-1735    0.177681
21-186G-1764    0.021798
21-186G-1792    0.034340
21-186G-1815    0.015151
21-186G-1863    0.002937
21-186G-1881    0.013133
21-186G-1933    0.009688
21-186G-196     0.055401
21-186G-2034    0.102293
21-186G-2062    0.002937
21-186G-2132    0.020524
21-186G-2146    0.079917
21-186G-2227    0.002937
21-186G-2236    0.128295
21-186G-2237    0.066839
21-186G-233     0.002937
21-186G-2345    0.025724
21-186G-2369    0.002937
21-186G-2370    0.002937
                  ...   
21-8B2-7392     0.074696
21-8B2-7393     0.045193
21-8B2-74       0.084983
21-8B2-7405     0.043299
21-8B2-7409     0.084776
21-8B2-7429     0.302188
21-8B2-7435     0.020715
21-8B2-7455     0.020715
21-8B2-7485     0.051387


In [100]:
idx = rf_result.index.unique()
asd = rf_result.loc[idx]
asd
asd.to_csv('ololo.csv')


In [121]:
#     train_10 = train_10.loc(train_10.index.unique)
#     train_12 = train_12.loc(train_12.index.unique)

#     idxes = np.intersect1d(train_10.index.tolist(), train_12.index.tolist())
#     train_12 = train_12.loc(np.setdiff1d(train_12.index.tolist(), idxes))



Unnamed: 0,age,"('v_l', 201709)","('sum_b', 201709)","('datetime', 201708)","('datetime', 201709)","('datetime_0', 201709)","('v_l', 201608)","('v_l', 201609)","('v_l', 201610)","('v_l', 201611)",...,"('percent', 201707)","('percent', 201708)","('percent', 201709)","('q', 201709)","('q_4', 201709)","('q_5', 201708)","('q_5', 201709)","('q_6', 201709)","('datetime', 201601)",proba
21-186G-1161,0.000289,0.000000,0.000000,-0.105592,0.000000,0.000000,0.000000,-0.052696,0.042289,0.011781,...,0.000000,-0.009426,0.000000,0.000000,0.000000,-0.105592,0.000000,0.000000,0.000000,0.0
21-186G-1172,0.097063,-0.027584,-0.029807,0.027742,0.040685,0.040685,0.000000,0.000000,0.000000,0.000000,...,-0.012914,-0.009409,-0.010054,0.040685,0.040685,0.027742,0.040685,0.040685,0.000000,0.0
21-186G-1190,0.097063,-0.028284,-0.030440,-0.005592,-0.030744,-0.030744,0.000000,0.000000,-0.020089,-0.022587,...,-0.012918,-0.008239,0.036104,-0.030744,-0.030744,-0.005592,-0.030744,-0.030744,-0.064999,0.0
21-186G-1193,0.032547,0.000000,0.000000,0.000000,0.000000,0.000000,-0.040872,-0.012579,-0.022826,-0.020051,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015001,0.0
21-186G-12,-0.419066,-0.005108,-0.007894,0.194408,0.040685,0.040685,0.008418,0.023148,0.001591,0.002427,...,-0.012918,-0.009419,-0.010053,0.040685,0.040685,0.194408,0.040685,0.040685,0.000000,0.0
21-186G-1201,0.097063,0.007974,0.012833,0.027742,-0.030744,-0.030744,-0.010887,-0.023370,-0.016817,-0.010869,...,-0.012918,-0.004114,-0.010078,-0.030744,-0.030744,0.027742,-0.030744,-0.030744,0.000000,0.0
21-186G-1215,0.097063,0.000000,0.000000,-0.072258,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.012918,-0.009426,0.000000,0.000000,0.000000,-0.072258,0.000000,0.000000,0.000000,0.0
21-186G-122,0.097063,-0.017765,-0.013934,-0.072258,0.076399,0.076399,0.000000,0.000000,0.000000,0.000000,...,-0.012918,-0.009426,-0.010078,0.076399,0.076399,-0.072258,0.076399,0.076399,0.000000,0.0
21-186G-123,0.064805,0.044352,0.043942,0.061075,0.004971,0.004971,-0.032900,-0.035024,-0.021080,-0.013618,...,-0.012918,-0.009426,-0.010078,0.004971,0.004971,0.061075,0.004971,0.004971,-0.024999,0.0
21-186G-1235,0.097063,-0.004670,-0.004076,-0.038925,-0.066458,-0.066458,0.000000,0.017296,0.018199,-0.001929,...,-0.012918,-0.009426,-0.010078,-0.066458,-0.066458,-0.038925,-0.066458,-0.066458,0.000000,0.0


In [124]:
# idxes = np.intersect1d(train_10.index.unique(), train_12.index.unique())
# idxes
# np.setdiff1d(train_12.index.unique(), idxes)


array(['21-186G-1775', '21-186G-1780', '21-1870-3192', '21-1886-1707',
       '21-1886-4787', '21-188M-303', '21-188M-547', '21-19DP-1682',
       '21-19DP-3161', '21-19DY-791', '21-8A1-16899', '21-8A1-5378',
       '21-8A1-6274', '21-8AC-19177', '21-8AC-6189', '21-8AE-4429',
       '21-8AF-17222', '21-8AF-7819', '21-8AF-885', '21-8AF-8850',
       '21-8AG-18431', '21-8AG-2232', '21-8AG-2233', '21-8AH-11914',
       '21-8AH-15060', '21-8AH-8327', '21-8AJ-11454', '21-8AJ-1299',
       '21-8AJ-14157', '21-8AJ-7495', '21-8AJ-9081', '21-8AN-1944',
       '21-8AN-4454', '21-8AN-6425', '21-8AN-8534', '21-8B0-10719',
       '21-8B0-10843', '21-8B0-159', '21-8B0-2014', '21-8B0-2018',
       '21-8B0-2019', '21-8B0-202', '21-8B0-3040', '21-8B0-6801',
       '21-8B1-19617', '21-8B1-276', '21-8B1-5695', '21-8B1-5696',
       '21-8B1-5702', '21-8B1-5705', '21-8B1-5713', '21-8B1-5717',
       '21-8B1-572', '21-8B1-5720', '21-8B1-6290', '21-8B1-6755',
       '21-8B1-6973', '21-8B1-7079', '21-8B1-7107