In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import linregress
from sklearn.decomposition import PCA
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw training rows and key them by customer.
train = pd.read_parquet('Data/train.parquet').set_index('customer_ID')

# Split the column names into the fixed categorical list and everything else.
columns = train.columns
cat_columns = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
num_columns = [name for name in columns if name not in cat_columns]

train.shape

In [None]:
# `train` must stay alive here: the preprocessing cell right after this one
# mutates it in place, so deleting it made "Restart & Run All" fail with a
# NameError. Only trigger a garbage-collection pass.
gc.collect()

In [None]:
# The categorical features are not used below, so drop them up front.
train.drop(columns=cat_columns, inplace=True)

# Turn the 'S_2' date into a float time axis: the integer representation of the
# datetime (presumably nanoseconds since the epoch - confirm the stored unit)
# divided by 10**18.
train['time'] = pd.to_datetime(train['S_2']).astype(np.int64) / 10**18
train.drop(columns=['S_2'], inplace=True)

# 'S_2' is no longer a column, so it must not be listed as a numeric feature.
num_columns.remove('S_2')

In [None]:
# Keep only customers that have more than two rows (presumably so that the
# per-customer PCA / regression in process_fun has enough observations).
# Counting just the 'time' column avoids aggregating every column of the frame,
# and the comparison is vectorised instead of a per-element Python lambda.
indx_series = train.groupby(train.index)['time'].count() > 2
indx = indx_series[indx_series].index
# Selecting by the (sorted) group keys also orders `train` by customer.
train = train.loc[indx]
train.shape

In [None]:
def process_fun(x):
    """Return the time-slopes of the first two principal components of one group.

    Parameters
    ----------
    x : DataFrame
        Rows of a single group; must contain the columns listed in the
        notebook-level ``num_columns`` plus a numeric ``'time'`` column.

    Returns
    -------
    tuple
        ``(s1, s2)``: slope of a linear regression of principal component 0
        and of principal component 1 against ``x['time']``.

    Notes
    -----
    Increments the notebook-level ``counter`` on every call and prints it
    every 1000 calls as a progress indicator.
    """
    global counter
    counter += 1
    if counter % 1000 == 0:
        print(counter, end=', ')

    # Impute with the per-group column median; columns that are entirely
    # missing within the group have no median and fall back to 0.
    features = x[num_columns]
    imputed = features.fillna(features.median()).fillna(0)

    components = PCA(n_components=2).fit_transform(imputed)

    timeline = x['time']
    s1 = linregress(timeline, components[:, 0])[0]
    s2 = linregress(timeline, components[:, 1])[0]
    return s1, s2

In [None]:
# Reset the progress counter that process_fun increments, then compute the
# (slope0, slope1) pair for every customer.
counter = 0
tr_pca_slope = train.groupby(train.index).apply(process_fun)

In [None]:
# Split the per-customer (slope0, slope1) tuples into two flat arrays.
pca0_slope = np.array([pair[0] for pair in tr_pca_slope])
pca1_slope = np.array([pair[1] for pair in tr_pca_slope])

In [None]:
# Index the slopes with the groupby result's own index. The slope arrays are in
# the order of `tr_pca_slope` (sorted group keys); `train.index.unique()` is in
# row order and only matches that when `train` happens to be sorted by customer.
tr_df = pd.DataFrame(
    {'pca0_slope': pca0_slope, 'pca1_slope': pca1_slope},
    index=tr_pca_slope.index,
)
tr_df

In [None]:
# Standardise both slope columns; `scaler` keeps the fitted train statistics.
scaler = StandardScaler().fit(tr_df)
tr_pca_slope_scaled = scaler.transform(tr_df)
tr_pca_slope_scaled_df = pd.DataFrame(
    data=tr_pca_slope_scaled,
    index=tr_df.index,
    columns=['pca_s1', 'pca_s2'],
)
tr_pca_slope_scaled_df

In [None]:
# Append the scaled PCA-slope columns to the previously saved feature table
# (aligned on the index) and write the result to a new parquet file.
train = pd.concat(
    [pd.read_parquet('Data/train_all_slopes_corr.parquet'), tr_pca_slope_scaled_df],
    axis=1,
)
train.to_parquet('Data/train_all_slopes_corr_pcaslope.parquet')
train.shape

In [None]:
# Display the current `train` frame for a visual check.
train

In [None]:
# Release the large train-side objects before the test data is loaded.
del train, tr_pca_slope, tr_pca_slope_scaled, tr_pca_slope_scaled_df
gc.collect()

### test

In [None]:
# Load the test rows, key them by customer and apply the same preprocessing as
# for train: drop the categorical columns and replace the 'S_2' date by a float
# 'time' column (integer representation of the datetime divided by 10**18).
test = pd.read_parquet('Data/test.parquet')
test.set_index('customer_ID', inplace=True)
test.drop(cat_columns, axis=1, inplace=True)
test['time'] = pd.to_datetime(test['S_2']).astype(np.int64) / 10**18
test.drop(['S_2'], axis=1, inplace=True)

# 'S_2' has normally already been removed from num_columns by the train
# preprocessing cell; list.remove raises ValueError for a missing element, so
# only remove it when it is still present.
if 'S_2' in num_columns:
    num_columns.remove('S_2')

In [None]:
# Keep only customers that have more than two rows (presumably so that the
# per-customer PCA / regression in process_fun has enough observations).
# Counting just the 'time' column avoids aggregating every column of the frame,
# and the comparison is vectorised instead of a per-element Python lambda.
indx_series = test.groupby(test.index)['time'].count() > 2
indx = indx_series[indx_series].index
# Selecting by the (sorted) group keys also orders `test` by customer.
test = test.loc[indx]
test.shape

In [None]:
# Reset the progress counter that process_fun increments, then compute the
# (slope0, slope1) pair for every test customer.
counter = 0
te_pca_slope = test.groupby(test.index).apply(process_fun)

In [None]:
# Split the per-customer (slope0, slope1) tuples into two flat arrays.
pca0_slope = np.array([pair[0] for pair in te_pca_slope])
pca1_slope = np.array([pair[1] for pair in te_pca_slope])

In [None]:
# Index the slopes with the groupby result's own index. The slope arrays are in
# the order of `te_pca_slope` (sorted group keys); `test.index.unique()` is in
# row order and only matches that when `test` happens to be sorted by customer.
te_df = pd.DataFrame(
    {'pca0_slope': pca0_slope, 'pca1_slope': pca1_slope},
    index=te_pca_slope.index,
)

In [None]:
# Reuse the `scaler` fitted on the training slopes instead of fitting a new one
# on test: refitting would standardise test with its own mean/variance, so the
# same raw slope would map to different feature values in train and test.
te_pca_slope_scaled = scaler.transform(te_df)
te_pca_slope_scaled_df = pd.DataFrame(te_pca_slope_scaled, index=te_df.index, columns=['pca_s1', 'pca_s2'])
te_pca_slope_scaled_df

In [None]:
# Append the scaled PCA-slope columns to the previously saved test feature
# table (aligned on the index) and write the result to a new parquet file.
test = pd.concat(
    [pd.read_parquet('Data/test_all_slopes_corr.parquet'), te_pca_slope_scaled_df],
    axis=1,
)
test.to_parquet('Data/test_all_slopes_corr_pcaslope.parquet')
test.shape