# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Data Analysis

In [2]:
df_train = pd.read_csv('problem_2_train.csv')

In [3]:
len(df_train.mobmake.unique())

38

In [4]:
df_train.head()

Unnamed: 0,srno,mobmake,hubid,hubpincode,pickuppartnerid,deliverypartnerid,insuranceCompanyId,custpincodecategory,claimtype,custpincode,memcreateddate,mobilepurchasedate,claimintdate,servicecntrid,pickupStartdate,tat_in_days
0,2851218,AN,7,500004,233,233,131,A,ADLD,500001,2017-04-27,2016-12-02,2018-02-08,4403,2018-02-13,5
1,2838330,AJ,10,226010,233,233,131,A,ADLD,226022,2017-03-06,2017-03-05,2018-02-04,4444,2018-02-08,12
2,2835781,AJ,2,110018,233,233,228,A,ADLD,110018,2018-02-01,2018-01-27,2018-02-03,4388,2018-02-12,9
3,2838589,AM,2,110018,233,233,228,B,ADLD,124001,2018-01-17,2018-01-31,2018-02-04,4092,2018-02-14,9
4,2855214,AN,21,110015,233,233,131,A,ADLD,201301,2018-01-23,2018-01-20,2018-02-09,4580,2018-02-14,8


In [5]:
df_train.columns

Index(['srno', 'mobmake', 'hubid', 'hubpincode', 'pickuppartnerid',
       'deliverypartnerid', 'insuranceCompanyId', 'custpincodecategory',
       'claimtype', 'custpincode', 'memcreateddate', 'mobilepurchasedate',
       'claimintdate', 'servicecntrid', 'pickupStartdate', 'tat_in_days'],
      dtype='object')

# Converting string dates to datetime object

In [6]:
df_train.mobilepurchasedate = pd.to_datetime(df_train.mobilepurchasedate)
df_train.claimintdate = pd.to_datetime(df_train.claimintdate)
df_train.memcreateddate = pd.to_datetime(df_train.memcreateddate)
df_train.pickupStartdate = pd.to_datetime(df_train.pickupStartdate)

# Converting categorical data to numeric values

In [7]:
df_train['mobmake'] = pd.factorize(df_train.mobmake)[0]
df_train['custpincodecategory'] = pd.factorize(df_train.custpincodecategory)[0]
df_train['claimtype'] = pd.factorize(df_train.claimtype)[0]

In [8]:
pd.DataFrame(df_train.mobmake.value_counts()).index

Int64Index([ 0,  1,  3,  4,  5,  7,  2, 19,  9,  6,  8, 14, 15, 17, 10, 12, 22,
            11, 16, 13, 18, 24, 26, 28, 31, 21, 20, 25, 23, 32, 35, 36, 37, 30,
            34, 27, 29, 33],
           dtype='int64')

In [9]:
len(df_train.columns)

16

In [10]:
X = df_train.drop(['memcreateddate', 'mobilepurchasedate',
       'claimintdate', 'pickupStartdate', 'tat_in_days'], axis = 1)

In [11]:
X.shape

(6250, 11)

In [12]:
y = df_train['tat_in_days']

# Machine Learning

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 1)

# Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
y_pred = linreg.predict(X_test)

In [17]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

4.75625478848308


# Gradient Boost

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

In [19]:
gbreg = GradientBoostingRegressor(n_estimators=500)
gbreg.fit(X_train, y_train)
y_pred = gbreg.predict(X_test)

In [20]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

4.432140554352705


# Ridge

In [21]:
from sklearn.linear_model import Ridge

In [22]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train) 
y_pred = ridge.predict(X_test)

In [23]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

4.753812226632618


# Lasso

In [24]:
from sklearn.linear_model import Lasso

In [25]:
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train) 
y_pred = lasso.predict(X_test)

In [26]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

4.935588184223248


# KNN

In [27]:
from sklearn.neighbors import KNeighborsRegressor

In [28]:
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train) 
y_pred = neigh.predict(X_test)

In [29]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

5.565968594768807


In [30]:
models = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge', 'Lasso','Gradient Boost','KNN Regressor'],
    'Training RMSE': [ np.sqrt(metrics.mean_squared_error(y_train, linreg.predict(X_train))), np.sqrt(metrics.mean_squared_error(y_train, ridge.predict(X_train))), np.sqrt(metrics.mean_squared_error(y_train, lasso.predict(X_train))), np.sqrt(metrics.mean_squared_error(y_train, gbreg.predict(X_train))), np.sqrt(metrics.mean_squared_error(y_train, neigh.predict(X_train)))],
    'Test RMSE': [ np.sqrt(metrics.mean_squared_error(y_test, linreg.predict(X_test))), np.sqrt(metrics.mean_squared_error(y_test, ridge.predict(X_test))), np.sqrt(metrics.mean_squared_error(y_test, lasso.predict(X_test))), np.sqrt(metrics.mean_squared_error(y_test, gbreg.predict(X_test))), np.sqrt(metrics.mean_squared_error(y_test, neigh.predict(X_test)))]
    })
models.sort_values(by='Test RMSE')

Unnamed: 0,Model,Training RMSE,Test RMSE
3,Gradient Boost,3.263186,4.432141
1,Ridge,4.471066,4.753812
0,Linear Regression,4.469691,4.756255
2,Lasso,4.634433,4.935588
4,KNN Regressor,3.048248,5.565969


# Loading the test data

In [31]:
df_test = pd.read_csv('problem_2_test.csv')

In [32]:
df_test.head()

Unnamed: 0,srno,mobmake,hubid,hubpincode,pickuppartnerid,deliverypartnerid,insuranceCompanyId,custpincodecategory,claimtype,custpincode,memcreateddate,mobilepurchasedate,claimintdate,servicecntrid,pickupStartdate
0,2829088,AN,3,560011,233,233,131,A,ADLD,560093,2017-07-30,2017-07-27,2018-02-01,4467,2018-02-06
1,2829866,AJ,2,110018,233,233,228,A,ADLD,110034,2017-07-17,2017-01-25,2018-02-01,4388,2018-02-08
2,2894489,AJ,3,560011,233,233,228,C,ADLD,590014,2018-01-12,2018-01-11,2018-02-21,4460,2018-02-28
3,2876708,AJ,2,110018,233,233,228,B,ADLD,250110,2017-08-07,2017-05-21,2018-02-16,4388,2018-02-26
4,2890283,AN,22,400104,233,233,131,C,ADLD,496001,2017-11-10,2017-09-22,2018-02-20,4581,2018-02-23


In [33]:
df_test['mobmake'] = pd.factorize(df_test.mobmake)[0]
df_test['custpincodecategory'] = pd.factorize(df_test.custpincodecategory)[0]
df_test['claimtype'] = pd.factorize(df_test.claimtype)[0]

In [34]:
X_test = df_test.drop(['memcreateddate', 'mobilepurchasedate',
       'claimintdate', 'pickupStartdate'], axis = 1)

In [35]:
len(X_test.columns)

11

In [36]:
gbreg = GradientBoostingRegressor(n_estimators=500)
gbreg.fit(X, y)
y_pred = pd.DataFrame(gbreg.predict(X_test))

In [37]:
y_pred.to_csv('test_data_answers.csv', index=False)

In [38]:
df_test['Dataset'] = y_pred
df_test.to_csv('test_data with answers.csv')