In [1]:
import numpy as np
import pandas as pd
import pickle   # сохранение модели
import seaborn as sns

from datetime import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, roc_auc_score
from matplotlib import pyplot as plt

import xgboost as xgb
import lightgbm as lgb
import catboost as catb

import gc
import dask.dataframe as dd

import warnings
warnings.filterwarnings('ignore')

In [39]:
test = pd.read_csv('../cour_proj/data_test.csv', index_col=0)
train = pd.read_csv('../cour_proj/data_train.csv', index_col=0)

In [40]:
features_df = dd.read_csv('../cour_proj/features.csv', sep='\t').set_index('Unnamed: 0')

### перевод Dask dataframe в Pandas dataframe 
### и сразу сортирую по 'buy_time' (для последующего соединения с test, train)

feat_df = features_df.compute().sort_values('buy_time')

уменьшение объема памяти, который занимает датасет

In [41]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
 
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [42]:
reduce_mem_usage(feat_df)

Memory usage of dataframe is 8813.53 MB
Memory usage after optimization is: 4423.98 MB
Decreased by 49.8%


Unnamed: 0_level_0,id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4278794,1735316,1531083600,-96.799973,32.500889,-110.740784,-20.106798,-116.158249,-41.211788,-16.08618,375.603912,...,-730.373840,-605.770813,-25.996269,-22.630449,-254.747726,-13.832889,-0.694429,-0.175933,-0.45614,0.0
1845030,2206020,1531083600,-96.799973,-138.729111,-110.740784,-191.336792,-116.158249,-212.441788,-16.08618,-65.076096,...,978.626160,1341.229248,-25.996269,-37.630447,-168.747726,37.167110,-0.694429,50.824066,-0.45614,1.0
1415879,1205398,1531083600,-17.469971,-328.849121,-31.410786,-381.456787,-36.828247,-402.561798,-16.08618,-65.076096,...,-756.373840,-600.770813,-25.996269,-34.630447,1522.252319,-11.832889,-0.694429,1.824067,-0.45614,0.0
3728865,3588725,1531083600,-81.539970,749.460876,5.389214,1028.283203,-0.028246,1007.178223,-16.08618,-65.076096,...,8282.625977,151.229202,-21.996269,4.369552,-25.747725,-21.832888,2.305572,-11.175933,-0.45614,0.0
2862416,2595799,1531083600,72.690033,-48.859112,58.749214,-101.466797,53.331753,-122.571793,-16.08618,-65.076096,...,-410.373840,-611.770813,-19.996269,-37.630447,-113.747726,43.167110,-0.694429,56.824066,-0.45614,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2536723,4244058,1548018000,-82.549973,-27.569113,-96.490784,-80.176796,-101.908249,-101.281792,-7.83618,-56.826096,...,-964.373840,-602.770813,-25.996269,-37.630447,-298.747711,-22.832888,-0.694429,-12.175933,-0.45614,0.0
2536704,4237932,1548018000,-84.349968,12.880888,-98.290787,-39.726799,-103.708244,-60.831791,-16.08618,309.923889,...,-900.373840,-605.770813,-23.996269,-37.630447,-97.747726,-22.832888,-0.694429,-12.175933,-0.45614,0.0
660370,1390311,1548018000,-33.019970,-344.399109,-44.460785,-379.386810,-46.878246,-99.071793,-16.08618,-65.076096,...,-977.373840,-613.770813,-25.996269,-21.630449,-293.747711,-22.832888,-0.694429,-12.175933,-0.45614,0.0
660417,1398944,1548018000,-96.799973,-408.179108,-110.740784,-460.786804,-116.158249,-481.891785,-16.08618,-65.076096,...,-977.373840,-613.770813,-25.996269,-37.630447,-306.747711,-25.832888,-0.694429,-12.175933,-0.45614,0.0


In [43]:
reduce_mem_usage(train)

Memory usage of dataframe is 31.73 MB
Memory usage after optimization is: 19.04 MB
Decreased by 40.0%


Unnamed: 0,id,vas_id,buy_time,target
0,540968,8.0,1537131600,0.0
1,1454121,4.0,1531688400,0.0
2,2458816,1.0,1534107600,0.0
3,3535012,5.0,1535922000,0.0
4,1693214,1.0,1535922000,0.0
...,...,...,...,...
831648,3812226,2.0,1546203600,0.0
831649,2480469,2.0,1546203600,0.0
831650,158236,2.0,1546203600,0.0
831651,1825525,2.0,1546203600,0.0


In [44]:
reduce_mem_usage(test)

Memory usage of dataframe is 2.17 MB
Memory usage after optimization is: 1.36 MB
Decreased by 37.5%


Unnamed: 0,id,vas_id,buy_time
0,3130519,2.0,1548018000
1,2000860,4.0,1548018000
2,1099444,2.0,1546808400
3,1343255,5.0,1547413200
4,1277040,2.0,1546808400
...,...,...,...
71226,2502453,5.0,1548018000
71227,1693213,2.0,1548018000
71228,1891350,2.0,1548018000
71229,2437172,2.0,1548018000


соединение датасетов по столбцам: "id", "buy_time"

In [45]:
train_df = pd.merge_asof(train.sort_values('buy_time'), feat_df, on='buy_time', by='id', direction='nearest')
test_df = pd.merge_asof(test.sort_values('buy_time'), feat_df, on='buy_time', by='id', direction='nearest')

In [46]:
del feat_df

gc.collect()

190

In [47]:
train_df['buy_time'] = train_df['buy_time'].map(dt.fromtimestamp)
test_df['buy_time'] = test_df['buy_time'].map(dt.fromtimestamp)  

In [48]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 831653 entries, 0 to 831652
Columns: 257 entries, id to 252
dtypes: datetime64[ns](1), float32(255), int32(1)
memory usage: 824.9 MB


In [49]:
train_df['buy_time'] = train_df['buy_time'].map(dt.toordinal)
test_df['buy_time'] = test_df['buy_time'].map(dt.toordinal)

In [50]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 831653 entries, 0 to 831652
Columns: 257 entries, id to 252
dtypes: float32(255), int32(1), int64(1)
memory usage: 824.9 MB


In [51]:
train_df['target'].value_counts()

0.0    771467
1.0     60186
Name: target, dtype: int64

In [52]:
train_df.describe()

Unnamed: 0,id,vas_id,buy_time,target,0,1,2,3,4,5,...,243,244,245,246,247,248,249,250,251,252
count,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,...,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0
mean,2158304.0,2.686185,736974.891421,0.072369,0.301304,1.801172,0.237123,2.066911,0.262526,2.270044,...,0.548182,0.861794,0.376409,-0.060826,-1.191119,0.017,-0.032236,0.205423,0.006831,0.258348
std,1257907.0,1.952034,58.090889,0.259098,299.521179,491.218567,306.206848,555.532898,318.280334,611.955933,...,3188.381104,2229.624756,1362.344971,300.972656,1293.871948,265.417084,7.032948,204.421036,11.730679,2.095543
min,2.0,1.0,736884.0,0.0,-1151.890015,-1594.119141,-2651.84082,-1634.436768,-2657.258301,-1655.541748,...,-977.37384,-613.770813,-25.996269,-37.630447,-306.747711,-25.832888,-0.694429,-12.175933,-0.45614,0.0
25%,1067319.0,1.0,736919.0,0.0,-96.799973,-208.339111,-110.740784,-242.146805,-116.158249,-259.351776,...,-977.37384,-613.770813,-25.996269,-37.630447,-305.747711,-25.832888,-0.694429,-12.175933,-0.45614,0.0
50%,2144574.0,2.0,736968.0,0.0,-84.939972,-59.029114,-85.300789,-77.866798,-87.608246,-92.051788,...,-958.37384,-613.770813,-25.996269,-37.630447,-239.747726,-23.832888,-0.694429,-12.175933,-0.45614,0.0
75%,3244254.0,4.0,737038.0,0.0,11.850029,120.620888,27.439215,128.7332,29.161755,122.29821,...,-482.37384,-405.770782,-24.996269,-32.630447,-77.747726,-10.832889,-0.694429,-9.175933,-0.45614,1.0
max,4362694.0,9.0,737059.0,1.0,211730.71875,212606.125,211716.78125,213357.75,212095.265625,213720.546875,...,173834.625,145602.234375,980158.0,47086.371094,185850.25,138539.171875,2285.305664,138552.828125,9489.543945,1743.0


In [53]:
train_df['vas_id'].value_counts()

1.0    310175
2.0    249505
5.0     94085
4.0     85756
6.0     57878
7.0     15432
8.0     13350
9.0      5472
Name: vas_id, dtype: int64

In [54]:
table = pd.pivot_table(train_df, 
                       values='id', 
                       index=['vas_id'], 
                       columns=['target'], 
                       aggfunc=[len],
                       margins=True)
table

Unnamed: 0_level_0,len,len,len
target,0.0,1.0,All
vas_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1.0,304511,5664,310175
2.0,244708,4797,249505
4.0,63991,21765,85756
5.0,92393,1692,94085
6.0,33174,24704,57878
7.0,15219,213,15432
8.0,13003,347,13350
9.0,4468,1004,5472
All,771467,60186,831653


Услуга под номером 1 - чаще всего предлагаемая, но нитак охотно подключаемая абонентами (приблизительно 2%), как, например, услуга № 6 (процент подключения примерно 43%) или услуга № 4 (подключаемость примерно 25%). 

###### разделим данные на Train и test

In [55]:

X = train_df.drop(columns='target')
y = train_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=23)

### LGBMClassifier

In [56]:
def level_threshold(y, pred_prob):
    step_factor = 0.05
    threshold_value = 0.2
    roc_score = 0
    while threshold_value <= 0.8:
        temp_thresh = threshold_value
        predicted = (pred_prob [:,1] >=temp_thresh).astype('int')
        print('Threshold', temp_thresh,'--',roc_auc_score(y, predicted))
        if roc_score<roc_auc_score(y, predicted):
            roc_score = roc_auc_score(y, predicted)
            thrsh_score = threshold_value
        threshold_value = threshold_value + step_factor
    print('---Optimum Threshold ----',thrsh_score,'--ROC--',roc_score)
    
    return thrsh_score

In [57]:
%%time

model_lgbm = lgb.LGBMClassifier(num_iterations=7,
                                max_depth=9,
                                random_state=42).fit(X_train, y_train)

train_pred_prob = model_lgbm.predict_proba(X_train)
test_pred_prob = model_lgbm.predict_proba(X_test)

preds = np.where(model_lgbm.predict_proba(X_train)[:,1] > level_threshold(y_train, train_pred_prob), 1, 0)
preds_test = np.where(model_lgbm.predict_proba(X_test)[:,1] > level_threshold(y_test, test_pred_prob), 1, 0)

Threshold 0.2 -- 0.7978483821548399
Threshold 0.25 -- 0.7519751185985772
Threshold 0.3 -- 0.6489695729870707
Threshold 0.35 -- 0.6037584005495795
Threshold 0.39999999999999997 -- 0.5843437357849007
Threshold 0.44999999999999996 -- 0.5609196040047184
Threshold 0.49999999999999994 -- 0.548882738577327
Threshold 0.5499999999999999 -- 0.5126263195654553
Threshold 0.6 -- 0.5
Threshold 0.65 -- 0.5
Threshold 0.7000000000000001 -- 0.5
Threshold 0.7500000000000001 -- 0.5
---Optimum Threshold ---- 0.2 --ROC-- 0.7978483821548399
Threshold 0.2 -- 0.8004355477593244
Threshold 0.25 -- 0.7550296018688152
Threshold 0.3 -- 0.6501909884870015
Threshold 0.35 -- 0.6021517867562227
Threshold 0.39999999999999997 -- 0.5824929646934348
Threshold 0.44999999999999996 -- 0.5583385634156949
Threshold 0.49999999999999994 -- 0.5476572343247665
Threshold 0.5499999999999999 -- 0.5115772581814564
Threshold 0.6 -- 0.5
Threshold 0.65 -- 0.5
Threshold 0.7000000000000001 -- 0.5
Threshold 0.7500000000000001 -- 0.5
---Optim

In [58]:
f1_score_train = f1_score(y_train, preds, average='macro')
f1_score_test = f1_score(y_test, preds_test, average='macro')

print('f1_score for TRAIN:', f1_score_train)
print('f1_score for TEST:', f1_score_test)

f1_score for TRAIN: 0.7606644420848714
f1_score for TEST: 0.7611500196421568


In [59]:
results = pd.DataFrame({'model': ['LGBMClassifier'],
                       'f1_score_(average=macro)': [f1_score_test]})

In [60]:
target_names = ['0', '1']

print('-------TRAIN-----')
print(classification_report(y_train, preds, target_names=target_names))
print('---------TEST----------')
print(classification_report(y_test, preds_test, target_names=target_names))

-------TRAIN-----
              precision    recall  f1-score   support

           0       0.97      0.95      0.96    516936
           1       0.50      0.65      0.56     40271

    accuracy                           0.93    557207
   macro avg       0.73      0.80      0.76    557207
weighted avg       0.94      0.93      0.93    557207

---------TEST----------
              precision    recall  f1-score   support

           0       0.97      0.95      0.96    254531
           1       0.49      0.65      0.56     19915

    accuracy                           0.93    274446
   macro avg       0.73      0.80      0.76    274446
weighted avg       0.94      0.93      0.93    274446



### XGBClassifier

In [61]:
%%time

model_xgb = xgb.XGBClassifier(n_estimators=13, 
                              max_depth = 2,
                              random_state=10).fit(X_train, y_train)

train_pred_prob = model_xgb.predict_proba(X_train)
test_pred_prob = model_xgb.predict_proba(X_test)

preds_xgb = np.where(model_xgb.predict_proba(X_train)[:,1] > level_threshold(y_train, train_pred_prob), 1, 0)
preds_test_xgb = np.where(model_xgb.predict_proba(X_test)[:,1] > level_threshold(y_test, test_pred_prob), 1, 0)

Threshold 0.2 -- 0.8238632167173026
Threshold 0.25 -- 0.779373796800321
Threshold 0.3 -- 0.7355682080696155
Threshold 0.35 -- 0.5793706481414223
Threshold 0.39999999999999997 -- 0.5793706481414223
Threshold 0.44999999999999996 -- 0.5698949530536938
Threshold 0.49999999999999994 -- 0.5590996758659788
Threshold 0.5499999999999999 -- 0.5590996758659788
Threshold 0.6 -- 0.5590996758659788
Threshold 0.65 -- 0.5590996758659788
Threshold 0.7000000000000001 -- 0.5126263195654553
Threshold 0.7500000000000001 -- 0.5126263195654553
---Optimum Threshold ---- 0.2 --ROC-- 0.8238632167173026
Threshold 0.2 -- 0.8279177022162997
Threshold 0.25 -- 0.7837318828727654
Threshold 0.3 -- 0.7371542540599004
Threshold 0.35 -- 0.5772000520660462
Threshold 0.39999999999999997 -- 0.5772000520660462
Threshold 0.44999999999999996 -- 0.5674943314315853
Threshold 0.49999999999999994 -- 0.5563375089106721
Threshold 0.5499999999999999 -- 0.5563375089106721
Threshold 0.6 -- 0.5563375089106721
Threshold 0.65 -- 0.5563375

In [62]:
print('-------TRAIN-----')
print(classification_report(y_train, preds_xgb, target_names=target_names))
print('---------TEST----------')
print(classification_report(y_test, preds_test_xgb, target_names=target_names))

-------TRAIN-----
              precision    recall  f1-score   support

           0       0.98      0.93      0.95    516936
           1       0.45      0.72      0.55     40271

    accuracy                           0.92    557207
   macro avg       0.71      0.82      0.75    557207
weighted avg       0.94      0.92      0.93    557207

---------TEST----------
              precision    recall  f1-score   support

           0       0.98      0.93      0.95    254531
           1       0.45      0.72      0.56     19915

    accuracy                           0.92    274446
   macro avg       0.72      0.83      0.76    274446
weighted avg       0.94      0.92      0.93    274446



In [63]:
f1_score_train = f1_score(y_train, preds_xgb, average='macro')
f1_score_test = f1_score(y_test, preds_test_xgb, average='macro')

print('f1_score for TRAIN:', f1_score_train)
print('f1_score for TEST:', f1_score_test)

f1_score for TRAIN: 0.7541605267280536
f1_score for TEST: 0.7563028986197691


In [64]:
results = results.append({'model': 'XGBClassifier',
                       'f1_score_(average=macro)': round(f1_score_test, 5)}, ignore_index=True)

### CatBoostClassifier

In [65]:
%%time


model_catb = catb.CatBoostClassifier(n_estimators=55,
                                     max_depth=7,
                                     objective='CrossEntropy',
                                     bootstrap_type='Bernoulli',
                                     reg_lambda=10,
                                     min_data_in_leaf=11,
                                     silent=True, 
                                     random_state=2).fit(X_train, y_train)

tr_pred_prob_cat = model_catb.predict_proba(X_train)
te_pred_prob_cat = model_catb.predict_proba(X_test)


preds_catb = np.where(model_catb.predict_proba(X_train)[:,1] > level_threshold(y_train, tr_pred_prob_cat), 1, 0)
preds_test_catb = np.where(model_catb.predict_proba(X_test)[:,1] > level_threshold(y_test, te_pred_prob_cat), 1, 0)

Threshold 0.2 -- 0.8387073197452012
Threshold 0.25 -- 0.7828672911630894
Threshold 0.3 -- 0.7789469608285783
Threshold 0.35 -- 0.7789469608285783
Threshold 0.39999999999999997 -- 0.7486379357700673
Threshold 0.44999999999999996 -- 0.6253732834840834
Threshold 0.49999999999999994 -- 0.6098973011834099
Threshold 0.5499999999999999 -- 0.5890642701433892
Threshold 0.6 -- 0.5685768990648964
Threshold 0.65 -- 0.564134228079036
Threshold 0.7000000000000001 -- 0.5534534373380503
Threshold 0.7500000000000001 -- 0.5126263195654553
---Optimum Threshold ---- 0.2 --ROC-- 0.8387073197452012
Threshold 0.2 -- 0.8424033679769135
Threshold 0.25 -- 0.7882895472010844
Threshold 0.3 -- 0.7834126803020155
Threshold 0.35 -- 0.7834126803020155
Threshold 0.39999999999999997 -- 0.7522365673901059
Threshold 0.44999999999999996 -- 0.6266367225580579
Threshold 0.49999999999999994 -- 0.609396619987738
Threshold 0.5499999999999999 -- 0.5874161401742516
Threshold 0.6 -- 0.5657005874686114
Threshold 0.65 -- 0.56084118

In [66]:
print('-------TRAIN-----')
print(classification_report(y_train, preds_catb, target_names=target_names))
print('---------TEST----------')
print(classification_report(y_test, preds_test_catb, target_names=target_names))

-------TRAIN-----
              precision    recall  f1-score   support

           0       0.98      0.93      0.95    516936
           1       0.44      0.75      0.56     40271

    accuracy                           0.91    557207
   macro avg       0.71      0.84      0.75    557207
weighted avg       0.94      0.91      0.92    557207

---------TEST----------
              precision    recall  f1-score   support

           0       0.98      0.93      0.95    254531
           1       0.44      0.76      0.56     19915

    accuracy                           0.91    274446
   macro avg       0.71      0.84      0.76    274446
weighted avg       0.94      0.91      0.92    274446



In [67]:
f1_score_train = f1_score(y_train, preds_catb, average='macro')
f1_score_test = f1_score(y_test, preds_test_catb, average='macro')

print('f1_score for TRAIN:', f1_score_train)
print('f1_score for TEST:', f1_score_test)

f1_score for TRAIN: 0.7536915082934894
f1_score for TEST: 0.7554829427026839


In [68]:
results = results.append({'model': 'CatBoostClassifier',
                       'f1_score_(average=macro)': round(f1_score_test,5)}, ignore_index=True)

In [69]:
results

Unnamed: 0,model,f1_score_(average=macro)
0,LGBMClassifier,0.76115
1,XGBClassifier,0.7563
2,CatBoostClassifier,0.75548


Проводила сравнение 5-ти моделей: 
    Logistic Regression;
    RandomForestClassifier;
    LGBMClassifier;
    XGBClassifier;
    CatBoostClassifier.
    
Было принято решение работать с данным без разбиения на подвыборки (например по виду услуги, загружать определенный % / количество строк из файлов и пр.). 
Логистическа регрессия давала всего 0.60 на train и на test (да еще для неё надо данные стандартизировать) - долго и небольшой f1_score.
Random Forest Classifier - с ним вообще всё сложно. Слишком большие данные для такой модели.
Далее бустинговые модели: они показали лучшие результаты как по скорости, так и по уровню f1_score (поэтому таблица только по ним составлена). И еще одно преимущество: они не нуждаются в стандартизированных (нормализованных) данных и смогли довольно быстро обработать немаленький объем данных.
У бустинговых моделей f1_score примерно на одном уровне. Немного лучше данный показатель у LGBMClassifier (причем за меньшее количество итераций).
 При предложении услуг нас интересуют убеждаемые клиенты (без нашей коммуникации они ничего не купят) и клиенты, которых лучше не беспокоить (коммуникация с ними может привести к тому, что они отпишутся/отключат уже предоставляемые услуги). Коммуникация с "потерянными" и лояльными клиентами - это лишняя трата бюджета (и времени). 


In [70]:
# сохранение модели в формате pkl
with open('model.pkl', 'wb') as f:
    pickle.dump(model_lgbm, f)
    
## загрузка модели    
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [71]:
predictions = model.predict_proba(test_df)[:,1]

answers_test = pd.DataFrame({'buy_time':test.buy_time, 'id':test.id, 'vas_id':test.vas_id, 'target': predictions})

In [72]:
answers_test.to_csv('answers_test.csv', index=False)