# Machine Learning Challenge - Round -1

# Imports

In [53]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

Path for the data

In [55]:
PATH = "data/yesbank/"

In [56]:
df_raw = pd.read_csv(f'{PATH}ml2_train.csv', low_memory=False, 
                     parse_dates=["Established_Date"],dayfirst = True)

df_test = pd.read_csv(f'{PATH}ml2_test.csv', low_memory=False,
                    parse_dates = ["Established_Date"],dayfirst = True)

it's important to look at your data, to make sure you understand the format, how it's stored, what type of values it holds, etc.

In [57]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000):
        display(df)

In [58]:
display_all(df_raw.tail().T)

Unnamed: 0,3785,3786,3787,3788,3789
Serial_Number,3786,3787,3788,3789,3790
Main_Office,0,0,0,0,0
Branch_Number,7981,7982,7984,7988,7989
Established_Date,2015-03-11 00:00:00,2016-02-02 00:00:00,2016-03-15 00:00:00,2016-01-01 00:00:00,2016-01-01 00:00:00
Acquired_Date,,,,,
City,Compton,Las Vegas,Irvine,New Orleans,Buffalo
County,Los Angeles,Clark,Orange,Orleans,Erie
State,CA,NV,CA,LA,NY
Deposits_2010,,,,,
Deposits_2011,,,,,


In [59]:
display_all(df_raw.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
Serial_Number,3790,,,,,,1895.5,1094.22,1.0,948.25,1895.5,2842.75,3790.0
Main_Office,3790,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Branch_Number,3790,,,,,,5390.81,1549.11,2866.0,4062.25,5256.5,6858.25,7989.0
Established_Date,3790,2027.0,1919-01-01 00:00:00,129.0,1825-01-01 00:00:00,2016-10-05 00:00:00,,,,,,,
Acquired_Date,2297,5.0,09/25/2008,1528.0,,,,,,,,,
City,3790,1489.0,Chicago,99.0,,,,,,,,,
County,3790,299.0,Los Angeles,298.0,,,,,,,,,
State,3790,25.0,CA,1003.0,,,,,,,,,
Deposits_2010,3050,,,,,,112213.0,2797590.0,0.0,18932.0,35628.0,66072.8,153577000.0
Deposits_2011,3212,,,,,,125514.0,3447380.0,0.0,20132.8,37183.0,66947.0,194388000.0


# Initial Processing

This dataset contains a mix of continuous and categorical variables.
The below method extracts particular date fields from a complete datetime for the purpose of constructing categoricals.

In [60]:
add_datepart(df_raw,'Established_Date')
df_raw.Established_Year.head()
add_datepart(df_test,'Established_Date')

we call the function train_cats to convert strings to pandas categories.

In [61]:
train_cats(df_raw)
train_cats(df_test)

The below step is to replace the text to numericals

In [62]:
df_raw.City.cat.categories
df_raw.County.cat.categories
df_raw.State.cat.categories
df_test.City.cat.categories
df_test.County.cat.categories
df_test.State.cat.categories

Index(['AZ', 'CT', 'FL', 'IL', 'KY', 'LA', 'MI', 'NJ', 'NY', 'OH', 'TX', 'UT',
       'WI', 'WV'],
      dtype='object')

output : Index(['AZ', 'CT', 'FL', 'IL', 'KY', 'LA', 'MI', 'NJ', 'NY', 'OH', 'TX', 'UT',
       'WI', 'WV'],
      dtype='object')

In [63]:
df_raw.City = df_raw.City.cat.codes
df_raw.County = df_raw.County.cat.codes
df_raw.State = df_raw.State.cat.codes
df_test.City = df_test.City.cat.codes
df_test.County = df_test.County.cat.codes
df_test.State = df_test.State.cat.codes

Since we have lots of missing values, which we can't pass directly to a random forest.

In [64]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))
display_all(df_test.isnull().sum().sort_index()/len(df_test))

Acquired_Date                   0.393931
Branch_Number                   0.000000
City                            0.000000
County                          0.000000
Deposits_2010                   0.195251
Deposits_2011                   0.152507
Deposits_2012                   0.086807
Deposits_2013                   0.046174
Deposits_2014                   0.014776
Deposits_2015                   0.005013
Deposits_2016                   0.000000
Established_Day                 0.000000
Established_Dayofweek           0.000000
Established_Dayofyear           0.000000
Established_Elapsed             0.000000
Established_Is_month_end        0.000000
Established_Is_month_start      0.000000
Established_Is_quarter_end      0.000000
Established_Is_quarter_start    0.000000
Established_Is_year_end         0.000000
Established_Is_year_start       0.000000
Established_Month               0.000000
Established_Week                0.000000
Established_Year                0.000000
Main_Office     

Acquired_Date                   0.075169
Branch_Number                   0.000000
City                            0.000000
County                          0.000000
Deposits_2010                   0.000000
Deposits_2011                   0.000000
Deposits_2012                   0.000000
Deposits_2013                   0.000000
Deposits_2014                   0.000000
Deposits_2015                   0.000000
Established_Day                 0.000000
Established_Dayofweek           0.000000
Established_Dayofyear           0.000000
Established_Elapsed             0.000000
Established_Is_month_end        0.000000
Established_Is_month_start      0.000000
Established_Is_quarter_end      0.000000
Established_Is_quarter_start    0.000000
Established_Is_year_end         0.000000
Established_Is_year_start       0.000000
Established_Month               0.000000
Established_Week                0.000000
Established_Year                0.000000
Main_Office                     0.000000
Serial_Number   

# Preprocessing

Since we have lots of training data compared to test data available, the training data has been splitted to required number in order to overcome overfitting.

In [39]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 2167  # same as Kaggle's test set size
n_trn = len(df_raw)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
raw_train.shape
df_raw = raw_train

We'll replace categories with their numeric codes, handle missing continuous values, and split the dependent variable into a separate variable.

In [66]:
df, y, nas = proc_df(df_raw, 'Deposits_2016')
df1, y1, nas1 = proc_df(df_test)
df

Unnamed: 0,Serial_Number,Main_Office,Branch_Number,Acquired_Date,City,County,State,Deposits_2010,Deposits_2011,Deposits_2012,...,Established_Is_quarter_start,Established_Is_year_end,Established_Is_year_start,Established_Elapsed,Deposits_2010_na,Deposits_2011_na,Deposits_2012_na,Deposits_2013_na,Deposits_2014_na,Deposits_2015_na
0,1,0,2866,5,1383,279,24,21386.0,23981.0,24825.0,...,False,False,False,-1880323200,False,False,False,False,False,False
1,2,0,2867,5,451,277,24,55454.0,56564.0,64732.0,...,False,False,False,-1888444800,False,False,False,False,False,False
2,3,0,2868,5,137,279,24,90882.0,104300.0,125038.0,...,False,False,False,-1974758400,False,False,False,False,False,False
3,4,0,2869,5,1005,279,24,45674.0,49288.0,53251.0,...,False,False,False,-1058227200,False,False,False,False,False,False
4,5,0,2870,5,1401,279,24,64181.0,72217.0,69920.0,...,False,False,False,-211334400,False,False,False,False,False,False
5,6,0,2871,5,1401,279,24,62558.0,67728.0,78847.0,...,False,False,False,-29289600,False,False,False,False,False,False
6,7,0,2872,5,137,279,24,78437.0,87150.0,102144.0,...,False,False,False,-222220800,False,False,False,False,False,False
7,8,0,2873,5,859,279,24,84622.0,96048.0,103946.0,...,True,False,False,-307756800,False,False,False,False,False,False
8,9,0,2874,5,936,279,24,48467.0,48696.0,54702.0,...,False,False,False,-1196121600,False,False,False,False,False,False
9,10,0,2875,5,159,279,24,49281.0,48689.0,48674.0,...,False,False,False,-721785600,False,False,False,False,False,False


Now, the data is ready to fit the model using random forest regressor

In [67]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)



0.9206251464882599

Output: 0.9206251464882599

The output is the coefficient of R^2

This shows that the train data given is efficient for the prediction process of the test data.

## Prediction of Test data 

In [17]:
deposit = m.predict(df1)
deposit

array([1.65417e+08, 3.58472e+05, 2.86499e+05, ..., 1.06214e+05, 8.45572e+04, 2.75120e+04])

## Clustering

In [18]:
A=[];
B=[];
C=[];
S1=[];
S2=[];
S3=[];
clu_a =[];
for i in range(len(deposit)):
    if (deposit[i]>=20000 and deposit[i]<=90000):
        A.append(deposit[i])
        S1.append(i+1)
    if (deposit[i] < 20000):
        B.append(deposit[i])
        S2.append(i+1)
    if (deposit[i] >90000):
        C.append(deposit[i])
        S3.append(i+1)    
deposit = A + B + C
Serialnum = S1 + S2 + S3



## Creation of csv file

In [20]:
import csv

with open('submissionlast.csv','w',newline = '') as f:
    thewriter = csv.writer(f)
    thewriter.writerow(['Serial Number','2016 Deposits'])
    for i in range(len(deposit)):
        thewriter.writerow([Serialnum[i],deposit[i]])
