# Linear regression model training

In [None]:
#using train_hire_stats to train model
#step 1.read file
#step 2.feature expansion
#step 3.get dummies -> encoding categorical data
#step 4.train model and compute rmse

# model training: train_hire_stats

In [None]:
#step 1: read file
import pandas as pd
from pandas import read_csv;

In [None]:
# Load the training set from train_hire_stats.csv into the DataFrame `ths`.
ths = read_csv('train_hire_stats.csv')

In [None]:
# Preview the first five rows of the raw training data.
# Per the original notes: 3 features (Zone_ID, Date, Hour_slot, all treated
# as categorical) plus 1 label.
ths.head()

In [None]:
# Convert the Date column from object dtype to datetime so that the
# .dt accessor can be used on it afterwards.
ths['Date'] = pd.to_datetime(ths['Date'])

In [None]:
# Inspect the column dtypes (e.g. to confirm the Date conversion took effect).
ths.dtypes

In [None]:
#step 2: feature expansion
# Derive two calendar features from the Date column:
#   m  -> month number (1-12)
#   wd -> day of the week (Monday=0 ... Sunday=6)
m = ths['Date'].dt.month
wd = ths['Date'].dt.weekday

In [None]:
# Add the derived features as new columns at positions 2 and 3, then preview.
ths.insert(loc=2, column="month", value=m)
ths.insert(loc=3, column="weekday", value=wd)
ths.head()

In [None]:
# Date has been decomposed into month/weekday, so the raw column is dropped.
# Remaining features (all categorical): Zone_ID, month, weekday, Hour_slot.
# Label: Hire_count.
ths = ths.drop(columns=['Date'])
ths.head()

In [None]:
#step 3: get dummies
# Hire_count is predicted with a regression model, so the four categorical
# features are one-hot encoded. All other get_dummies options stay at their
# defaults (prefix taken from the column name, '_' separator, no NaN
# indicator column, dense output, no level dropped).
ths = pd.get_dummies(
    ths,
    columns=['Zone_ID', 'month', 'weekday', 'Hour_slot'],
)

In [None]:
# Preview the one-hot encoded training frame.
ths.head(5)

In [None]:
# Split the encoded frame into the label vector y and the feature matrix X.
# The features are selected by dropping the label by name instead of using a
# fixed positional slice (previously iloc[:, 1:69]), so the split stays
# correct whatever number of dummy columns get_dummies produced and wherever
# the label column sits.
y = ths['Hire_count']
X = ths.drop(columns=['Hire_count'])

In [None]:
#step 4.train model and compute rmse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

In [None]:
# Hold out 20% of the rows for evaluation; random_state=0 keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a linear regression model (scikit-learn defaults) on the training split.
linreg = LinearRegression()
linreg.fit(X_train,y_train)

In [None]:
# Predict Hire_count for the held-out split (X_test).
y_pred = linreg.predict(X_test)

In [None]:
# Root-mean-squared error of the predictions on the held-out split.
mse = metrics.mean_squared_error(y_test, y_pred)
print(np.sqrt(mse))

# predict data: test_hire_stats

In [None]:
#step 1: read file
#step 2: feature expansion
#step 3: get dummies
#step 4: add columns to fit in model
#step 5: using model to predict

In [None]:
#step 1: read file
# Load the data to be predicted from test_hire_stats.csv into `test`.
test = read_csv('test_hire_stats.csv')

In [None]:
# Preview the first five rows of the raw test data.
test.head(5)

In [None]:
# Convert the Date column to datetime, mirroring the training preprocessing.
test['Date'] = pd.to_datetime(test['Date'])

In [None]:
#step 2: feature expansion
# Same derived calendar features as for the training data:
#   m  -> month number (1-12)
#   wd -> day of the week (Monday=0 ... Sunday=6)
# NOTE: this rebinds wd and m, which previously held the training values.
m = test['Date'].dt.month
wd = test['Date'].dt.weekday

In [None]:
# Add the derived features as new columns at positions 2 and 3, then preview.
test.insert(loc=2, column="month", value=m)
test.insert(loc=3, column="weekday", value=wd)
test.head()

In [None]:
# List the distinct months present in the test data; any month that appears in
# the training data but not here yields no dummy column when test is encoded.
test['month'].unique()

In [None]:
# Date is now represented by month/weekday and Test_ID is only a row
# identifier, so neither is used as a model feature.
test = test.drop(columns=['Date', 'Test_ID'])
test.head()

In [None]:
#step 3: get dummies
# One-hot encode the same four categorical features as in training; every
# other get_dummies option stays at its default.
test = pd.get_dummies(
    test,
    columns=['Zone_ID', 'month', 'weekday', 'Hour_slot'],
)

In [None]:
#step 4: add columns to fit in model
# The training data contains Zone_ID and month categories that do not occur in
# the test data, so the encoded test frame lacks the corresponding dummy
# columns. Reindexing against the encoded training frame `ths` adds every
# missing column filled with 0 and arranges all columns in the training
# order, without hard-coding column names or insert positions.
# Columns that exist only in the test frame are dropped, since the model was
# never trained on them.
test = test.reindex(columns=ths.columns, fill_value=0)

In [None]:
# Lift the column display limit so wide frames are shown without truncation,
# then preview the first ten rows of the test frame.
pd.set_option('display.max_columns', None)
test.head(10)

In [None]:
# Check the (rows, columns) shape of the test frame.
test.shape

In [None]:
# Feature matrix for the test set: every column except Hire_count (that
# column is replaced by the model's predictions afterwards). Selecting by
# label avoids a hard-coded positional slice that would silently truncate or
# mis-select features if the column count or order changed.
# NOTE: this rebinds X, which previously held the training features.
X = test.drop(columns=['Hire_count'])

In [None]:
#step 5: using model to predict
# NOTE: this rebinds y_pred, which previously held the validation predictions.
y_pred = linreg.predict(X)

In [None]:
# Display the raw predictions; a linear model is unbounded, so some values
# may be negative at this point.
y_pred

In [None]:
# Remove the existing Hire_count column so the predictions can take its place.
test = test.drop(columns=['Hire_count'])

In [None]:
# Store the predictions as the first column, under the label's name.
test.insert(0, 'Hire_count', y_pred)

In [None]:
# Preview the test frame with the predicted Hire_count in place.
test.head(10)

In [None]:
# A hire count cannot be negative, but a linear model may predict values
# below zero, so clamp the predictions at 0. Only the Hire_count column is
# touched: the remaining columns are 0/1 indicators that can never be
# negative, so masking the whole frame was unnecessary work.
test['Hire_count'] = test['Hire_count'].clip(lower=0)

In [None]:
# Preview the frame again after negative predictions were replaced by 0.
test.head(10)

In [None]:
# Reload the original test file (Date and Test_ID still present) as the
# frame that will carry the final output.
final = read_csv("test_hire_stats.csv")

In [None]:
# Preview the freshly loaded output frame before the predictions are added.
final.head()

In [None]:
#fill prediction result into test dataset
# The assignment aligns on the row index; `final` and `test` both come from
# test_hire_stats.csv and no rows were removed from `test`, so rows match 1:1.
final['Hire_count']= test['Hire_count']

In [None]:
# Preview the output frame with the predicted Hire_count filled in.
final.head()

In [None]:
#final.to_csv("remake_ver.csv",index=False)