### Importing Libs & Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr
from scipy.cluster import hierarchy as hc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns

In [None]:
# Load the train and test frames from the 'TrainAgg' and 'TestAgg' pickle files.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
Train = pd.read_pickle('TrainAgg')
Test = pd.read_pickle('TestAgg')

### Splitting X & Y

In [None]:
# Features are every column except 'target'; the label is the 'target' column.
# .copy() gives X its own data: later cells assign to X's columns and drop
# columns in place, which on a bare slice of Train triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
X = Train.loc[:, Train.columns != 'target'].copy()
Y = Train.loc[:, 'target']

### Fixing issues with X

This step strips the time component from the date columns and converts the `TimeSpent` feature to an integer number of days.

In [None]:
# Parse the first-active date and keep only the calendar date (no time part).
X['first_active_month'] = pd.to_datetime(X['first_active_month']).dt.date
X['first_active_month'].describe()

# Same treatment for the latest purchase date.
X['LatestPurDate'] = pd.to_datetime(X['LatestPurDate']).dt.date
X['LatestPurDate'].describe()

# Remove the card_id column from the feature frame.
X.drop(columns='card_id', inplace=True)

# Replace the timedelta values in TimeSpent with their day counts.
X['TimeSpent'] = X['TimeSpent'].dt.days
X['TimeSpent']

In [None]:
# Encode each date column as integer category codes.
# NOTE: both columns are dropped from X again further down, before the model is fit.
X['first_active_month'] = X['first_active_month'].astype('category').cat.codes
X['LatestPurDate'] = X['LatestPurDate'].astype('category').cat.codes

In [None]:
# Cast the three feature_* columns to the pandas 'category' dtype.
for feature_col in ('feature_1', 'feature_2', 'feature_3'):
    X[feature_col] = X[feature_col].astype('category')
X.dtypes

# Remove the two date columns from the feature frame.
X.drop(columns=['LatestPurDate', 'first_active_month'], inplace=True)

### Applying Random Forest

In [None]:
# Random forest with 100 trees, max_features=0.5, all CPU cores (n_jobs=-1)
# and out-of-bag scoring enabled.
RF = RandomForestRegressor(
    n_estimators=100,
    max_features=0.5,
    oob_score=True,
    n_jobs=-1,
    verbose=True,
)

# fit() returns the estimator itself, so RFmodel and RF refer to the same object.
RFmodel = RF.fit(X, Y)

# Out-of-bag score of the fitted forest.
RF.oob_score_

#### Feature Importance

In [None]:
# Feature importances labelled by column name, largest first.
FI = pd.Series(RF.feature_importances_, index=X.columns).sort_values(ascending=False)
FI

### Preparing test set

In [None]:
# Work on a copy: a plain `Xtest = Test` only aliases the frame, so the in-place
# drop below would also strip card_id and the date columns from Test and make
# this cell fail with a KeyError when it is run a second time.
Xtest = Test.copy()

# Remove the same identifier and date columns that were removed from X.
Xtest.drop(columns=['first_active_month', 'card_id', 'LatestPurDate'], inplace=True)

# Replace the timedelta values in TimeSpent with their day counts.
Xtest['TimeSpent'] = Xtest['TimeSpent'].dt.days

# Cast the three feature_* columns to the pandas 'category' dtype.
for feature_col in ('feature_1', 'feature_2', 'feature_3'):
    Xtest[feature_col] = Xtest[feature_col].astype('category')

# Missing TimeSpent values, and missing or infinite CLV values, are set to 0.
Xtest['TimeSpent'] = Xtest['TimeSpent'].replace(to_replace=np.nan, value=0)
Xtest['CLV'] = Xtest['CLV'].replace(to_replace=[np.nan, np.inf, -np.inf], value=0)

### Predicting on Xtest

In [None]:
# Predict on the test features and wrap the result in a Series.
Ypred = pd.Series(RF.predict(Xtest))

# Write the predictions to disk, then preview the first rows.
Ypred.to_csv('Ypred.csv')
Ypred.head()

### Exporting X & Xtest for future use

In [None]:
# Persist the prepared training features, training target and test features.
X.to_pickle('XtrainLatest')
Y.to_pickle('YtrainLatest')
Xtest.to_pickle('XtestLatest')

### Checking correlation

In [None]:
# Spearman rank correlation between the columns of X, rounded to 4 decimals.
corr = spearmanr(X).correlation.round(4)

# Turn correlation into a distance (1 - corr), in the condensed form that
# linkage() expects, and cluster with average linkage.
corr_condensed = hc.distance.squareform(1 - corr)
z = hc.linkage(corr_condensed, method='average')

# Draw the hierarchy with column names as leaf labels.
fig = plt.figure(figsize=(16, 10))
dendrogram = hc.dendrogram(z, orientation='left', labels=X.columns, leaf_font_size=16)
plt.show()

### Removing collinearity

In [None]:
# Cat3TotB is removed as part of the collinearity clean-up.
X.drop(columns=['Cat3TotB'], inplace=True)

# Keep only the four selected features. The selection is taken from X because
# no `Xtrain` frame is defined anywhere in this notebook, so referencing it
# raises NameError on a top-to-bottom run.
# NOTE(review): assumes the intended source frame is X (the frame exported as
# 'XtrainLatest'); confirm if a separate train split was meant.
Xtrain2 = X.loc[:, ['AvgPurAmt', 'CLV', 'TimeSpent', 'purchase_amount']]

### RF again after removing collinearity & unimportant features

In [None]:
# Second forest on the reduced feature set: 250 trees, max_features='log2',
# all CPU cores and out-of-bag scoring enabled.
RF = RandomForestRegressor(
    n_estimators=250,
    max_features='log2',
    oob_score=True,
    n_jobs=-1,
    verbose=True,
)

# Fit against Y: no `Ytrain` is defined anywhere in this notebook, so
# referencing it raises NameError on a top-to-bottom run.
# NOTE(review): assumes Y (the 'target' column split off earlier) is the
# intended label vector; confirm if a separate train split was meant.
RFmodel = RF.fit(Xtrain2, Y)

# Out-of-bag score of the refitted forest.
RF.oob_score_

# Predictions on the same rows the forest was trained on. This rebinds Ypred,
# replacing the test-set predictions stored under that name earlier.
Ypred = RF.predict(Xtrain2)
Ypred.shape

# In-sample error: computed on the training rows, so it is not a
# held-out estimate of model quality.
MSE = mean_squared_error(Y, Ypred)
RMSE = sqrt(MSE)
RMSE

#### Feature Importance

In [None]:
# Feature importances of the reduced model, labelled by column name, largest first.
FI = pd.Series(RF.feature_importances_, index=Xtrain2.columns).sort_values(ascending=False)
FI