In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
import numpy as np
import gc
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown
# for the rest of the kernel session.
# NOTE(review): this also hides pandas/scipy diagnostics raised by the
# cells below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Read the train/test tables and key both of them by customer.
train = pd.read_parquet('Data/train.parquet').set_index('customer_ID')
test = pd.read_parquet('Data/test.parquet').set_index('customer_ID')

labels = pd.read_pickle('Data/train_labels.pkl')

# Partition the feature columns: the names listed here are treated as
# categorical, every other column of `train` lands in `num_columns`.
columns = train.columns
cat_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
_categorical = set(cat_columns)
num_columns = [name for name in columns if name not in _categorical]

# Displayed as the cell result: shapes of the three loaded objects.
train.shape, labels.shape, test.shape

((5531451, 189), (458913, 1), (11363762, 189))

In [3]:
# Apply the same preparation to both frames, in place:
#   1. discard the categorical columns,
#   2. derive a float `time` column from the `S_2` dates,
#   3. discard the original `S_2` column.
for frame in (train, test):
    frame.drop(columns=cat_columns, inplace=True)

    # Datetime -> int64 epoch value, then scaled down by 10**18 so the
    # regression x-axis stays small.
    # NOTE(review): the int64 value is presumably nanoseconds (pandas'
    # default datetime64[ns]) — confirm if the unit of the slopes matters.
    frame['time'] = pd.to_datetime(frame['S_2']).astype(np.int64) / 10**18

    frame.drop(columns=['S_2'], inplace=True)

In [4]:
# 'S_2' has been replaced by the derived `time` column, so it must not be
# treated as a numeric feature. Filtering (instead of list.remove) keeps this
# cell idempotent: re-running it no longer raises ValueError once 'S_2' is gone.
num_columns = [c for c in num_columns if c != 'S_2']

In [5]:
def _group_slope(group, col):
    """Return the least-squares slope of `col` against `time` for one customer.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single customer; must contain `time` and `col`.
    col : str
        Name of the feature column to regress on `time`.

    Returns
    -------
    float
        First element of the `linregress` result (the slope).
    """
    # NOTE(review): linregress does not appear to skip missing values, so a
    # customer with any NaN in `col` presumably gets a NaN slope — confirm
    # that this is the intended treatment of gaps.
    return linregress(group['time'], group[col])[0]


# One output row per customer, in order of first appearance in each frame.
tr_slope = pd.DataFrame(index=train.index.unique())
te_slope = pd.DataFrame(index=test.index.unique())

# The grouping does not depend on the feature being processed, so build it
# once instead of once per column.
tr_groups = train.groupby('customer_ID')
te_groups = test.groupby('customer_ID')

for count, col in enumerate(num_columns):
    # Progress log: column names, with a line break after every 15th index.
    print(col, end=', ')
    if count > 0 and count % 15 == 0: print('')

    # Hand each group only the two columns the regression reads, so apply()
    # does not have to slice the full-width frame for every customer.
    tr_slope['slope_' + col] = tr_groups[['time', col]].apply(_group_slope, col)
    te_slope['slope_' + col] = te_groups[['time', col]].apply(_group_slope, col)

P_2, D_39, B_1, B_2, R_1, S_3, D_41, B_3, D_42, D_43, D_44, B_4, D_45, B_5, R_2, D_46, 
D_47, D_48, D_49, B_6, B_7, B_8, D_50, D_51, B_9, R_3, D_52, P_3, B_10, D_53, S_5, 
B_11, S_6, D_54, R_4, S_7, B_12, S_8, D_55, D_56, B_13, R_5, D_58, S_9, B_14, D_59, 
D_60, D_61, B_15, S_11, D_62, D_65, B_16, B_17, B_18, B_19, B_20, S_12, R_6, S_13, B_21, 
D_69, B_22, D_70, D_71, D_72, S_15, B_23, D_73, P_4, D_74, D_75, D_76, B_24, R_7, D_77, 
B_25, B_26, D_78, D_79, R_8, R_9, S_16, D_80, R_10, R_11, B_27, D_81, D_82, S_17, R_12, 
B_28, R_13, D_83, R_14, R_15, D_84, R_16, B_29, S_18, D_86, D_87, R_17, R_18, D_88, B_31, 
S_19, R_19, B_32, S_20, R_20, R_21, B_33, D_89, R_22, R_23, D_91, D_92, D_93, D_94, R_24, 
R_25, D_96, S_22, S_23, S_24, S_25, S_26, D_102, D_103, D_104, D_105, D_106, D_107, B_36, B_37, 
R_26, R_27, D_108, D_109, D_110, D_111, B_39, D_112, B_40, S_27, D_113, D_115, D_118, D_119, D_121, 
D_122, D_123, D_124, D_125, D_127, D_128, D_129, B_41, B_42, D_130, D_131, D_132, D_133, R_28, 

In [7]:
# Drop the row-level frames and force a collection pass so their memory can
# be reclaimed; the slope tables computed from them are kept.
del train
del test
gc.collect()

0

In [8]:
# Rebind `train` / `test` to the feature tables stored under Output/.
# NOTE(review): unpickling can execute arbitrary code — only load these
# files if they were produced by this project.
train, test = (
    pd.read_pickle(path)
    for path in ('Output/train.pkl', 'Output/test.pkl')
)

# Displayed as the cell result: shapes of both tables.
train.shape, test.shape

((458913, 1103), (924621, 1103))

In [9]:
# Append the slope columns to the feature tables side by side.
# NOTE(review): a column-wise concat aligns rows on the index, so this
# assumes the pickled tables are indexed by customer_ID like
# tr_slope / te_slope — confirm.
train, test = (
    pd.concat([features, slopes], axis=1)
    for features, slopes in ((train, tr_slope), (test, te_slope))
)

In [11]:
# Write the combined tables (existing features plus slopes) to parquet.
for frame, path in (
    (train, 'Data/train_all_slopes.parquet'),
    (test, 'Data/test_all_slopes.parquet'),
):
    frame.to_parquet(path)