# M6 - WEEK 6 | PROJECT: Kaggle Competition - Predict Traffic Congestion

>"Our task here is to predict traffic congestion based on aggregate measures of  stopping distance and waiting times at intersections in 4 major US cities."

## Goal

 - process the data
 - run the model
 - submit the predictions

### Process data

We start by importing all useful modules:

In [1]:
import pandas as pd # import pandas
import numpy as np # import numpy

# encoding and splitting
from sklearn import preprocessing # preprocessing for Label and OneHotEncode
from sklearn.model_selection import train_test_split #for train test split

# models
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import Lasso
#from sklearn.linear_model import RidgeCV
#from sklearn.linear_model import ElasticNet
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor


## other necessary models
#from sklearn.linear_model import SGDRegressor
#from sklearn.ensemble import AdaBoostRegressor
#from sklearn.ensemble import GradientBoostingRegressor
#from xgboost.sklearn import XGBRegressor

## import Gridsearch for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# used metrics
from sklearn.metrics import mean_squared_error # import mean squared error as we are suppose to use rmse

#silence warning
import warnings
warnings.filterwarnings("ignore") 

We will start by importing our datasets:

In [2]:
# Load the competition files from the local data/ directory.
train = pd.read_csv('data/train.csv') # labelled training set
test = pd.read_csv('data/test.csv') # test set to predict for the submission
subdf  = pd.read_csv('data/sample_submission.csv') # submission template, filled in later

Then we define a function to process data:

In [3]:
def dataproc(df):
    """Build the model feature matrix from a raw competition DataFrame.

    The numeric features are kept as they are and the categorical features
    are one-hot encoded. The dummy columns are named ``Cat_0``, ``Cat_1``, ...
    in the order: categorical column (as listed in ``cat_feat_list``), then
    sorted category value within that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Latitude', 'Longitude', 'Hour', 'Weekend',
        'Month', 'EntryHeading', 'ExitHeading' and 'City'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The numeric columns followed by the ``Cat_*`` dummy columns (float
        0.0/1.0), carrying the same index as ``df``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.

    Notes
    -----
    The encoding is derived from the categories present in ``df`` alone, so
    two frames processed separately (e.g. train and test) only get matching
    ``Cat_*`` columns when they contain the same set of categories.
    """
    # 'IntersectionId', 'EntryStreetName', 'ExitStreetName', 'Path' are left
    # out: latitude and longitude give the location, the headings give the
    # direction.
    num_feat_list = ['Latitude', 'Longitude', 'Hour', 'Weekend', 'Month']
    cat_feat_list = ['EntryHeading', 'ExitHeading', 'City']

    numeric = df[num_feat_list]

    # One-hot encode in a single step. `columns=` forces the encoding of the
    # three categorical features whatever dtype pandas inferred for them.
    dummies = pd.get_dummies(df[cat_feat_list], columns=cat_feat_list, dtype=float)
    dummies.columns = ["Cat_" + str(i) for i in range(dummies.shape[1])]

    # `dummies` keeps the index of `df`, so the column-wise concat lines the
    # rows up even when `df` does not have a default 0..n-1 index.
    return pd.concat([numeric, dummies], axis=1)

In [4]:
X = dataproc(train) # processed feature matrix of the training set

# names of all ten target columns in the training data
targ_list = ['TotalTimeStopped_p20', 'TotalTimeStopped_p40', 'TotalTimeStopped_p50', 'TotalTimeStopped_p60', 'TotalTimeStopped_p80', 'DistanceToFirstStop_p20', 'DistanceToFirstStop_p40', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p60', 'DistanceToFirstStop_p80']

# alternative: train on all ten targets, including the ones that are not submitted
#y = train[targ_list]
# only the six targets selected here are used to fit the models (p20, p50 and p80 of each measure)
y = train[["TotalTimeStopped_p20", "TotalTimeStopped_p50", "TotalTimeStopped_p80", "DistanceToFirstStop_p20", "DistanceToFirstStop_p50", "DistanceToFirstStop_p80"]]
stest = dataproc(test) # processed feature matrix of the test set used for the submission

In [5]:
# The raw frames are no longer needed once X, y and stest exist; drop the
# names so the memory they hold can be released.
del train
del test

To train and evaluate our model, let's split the training dataset into a training subset and a hold-out subset:

In [18]:
# Hold-out split with a fixed seed: 500000 rows to fit on, 5000 rows to evaluate on.
# A smaller configuration tried earlier: train_size=10000, test_size=500
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=500000, test_size=5000)

We list all the candidate models we want to compare:

# The imports of these regressors are commented out in the import cell at
# the top of the notebook, so they are imported here to keep this cell
# runnable on its own.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, RidgeCV, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)

# Candidate regressors, all with their default hyperparameters, keyed by the
# label used as a column name in the benchmark table.
models = {
    'knn': KNeighborsRegressor(),
    'linear regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge': RidgeCV(),
    'elasticNet': ElasticNet(),
    'random forest': RandomForestRegressor(),
    'decision tree': DecisionTreeRegressor(),
    'extra-trees': ExtraTreesRegressor(),
    'sdg regressor': SGDRegressor(),
    'ada boost regressor': AdaBoostRegressor(),
    'gradient boosting regressor': GradientBoostingRegressor(),
}

# xgboost is a separate package; benchmark it only when it is installed
# instead of failing the whole cell.
try:
    from xgboost.sklearn import XGBRegressor
except ImportError:
    print("xgboost is not installed: skipping 'extreme boosting regressor'")
else:
    models['extreme boosting regressor'] = XGBRegressor(objective='reg:squarederror')

Let's benchmark them: each model is fitted on every target separately and scored with the RMSE on the hold-out subset:

# RMSE table: one row per target, one column per candidate model.
rmse_matrix = pd.DataFrame(index=y.columns, columns=models.keys())

for col in y.columns:
    for key, model in models.items():
        # Every model is fitted on a single target at a time and scored on
        # the hold-out rows; the prediction is computed once and reused.
        model.fit(X_train, y_train[col])
        pred = model.predict(X_test)
        rmse_matrix.at[col, key] = np.sqrt(mean_squared_error(y_test[col], pred))

rmse_matrix

# Average RMSE over the targets, printed in the order of `models`.
for key in models:
    print(rmse_matrix[key].mean())

In [None]:
# Randomized hyperparameter search for an ExtraTreesRegressor fitted on all
# selected targets at once, followed by a refit and a hold-out evaluation.
from sklearn.model_selection import RandomizedSearchCV

model = ExtraTreesRegressor()
# 9 values of n_estimators (400..2000 in steps of 200) x 2 values of
# max_features (5 and 6) = 18 candidate combinations.
param_grid={'n_estimators': range(400,2001,200), 'max_features': range(5,7)}                            
# NOTE(review): n_iter=100 is larger than the 18 combinations available above;
# presumably the search then evaluates every combination once - confirm against
# the scikit-learn documentation.
g = RandomizedSearchCV(estimator=model, param_distributions = param_grid, scoring='neg_mean_squared_error', cv=3, n_iter = 100)

fit_grid = g.fit(X_train, y_train)

# the score is a negated mean squared error (see `scoring` above)
print("Best: %f using %s" % (fit_grid.best_score_, fit_grid.best_params_))

# fit a fresh estimator with the best parameters found on the whole training subset
model = ExtraTreesRegressor(**fit_grid.best_params_)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)
predictions = y_pred
# evaluate predictions on the hold-out subset
# despite its name, `mse` holds the square root of the mean squared error (RMSE)
mse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error: %.2f" % (mse))

In [11]:
# Display the per-candidate results collected by the search (the notebook
# renders the value of the last expression of a cell).
fit_grid.cv_results_

{'mean_fit_time': array([ 7.37859948, 10.27217237, 13.62608743, 15.348207  , 20.70955904,
        23.30803593, 24.71918726, 27.26777538, 31.57728386,  7.54235554,
         9.54821761, 12.93862764, 15.89907424, 18.94697682, 22.24446352,
        25.52884634, 28.82547569, 31.75870236]),
 'std_fit_time': array([0.24015818, 0.2815709 , 0.47481275, 0.27010222, 0.29178179,
        0.30401208, 0.81898882, 0.22267587, 0.08078393, 1.19410693,
        0.24039935, 0.19536236, 0.12448242, 0.29116588, 0.05597833,
        0.25454878, 0.51465075, 0.34825985]),
 'mean_score_time': array([0.35075315, 0.49085514, 0.69047173, 0.75797296, 1.03090914,
        1.10536949, 1.29155024, 1.3682162 , 1.55668481, 0.33510335,
        0.46741478, 0.61802189, 0.76196257, 0.9617552 , 1.06450407,
        1.25065748, 1.3457338 , 1.46541254]),
 'std_score_time': array([0.0155976 , 0.05834535, 0.1057241 , 0.01785973, 0.02558856,
        0.02200082, 0.10514374, 0.025956  , 0.04191156, 0.02355852,
        0.00893104, 0.0181

# Exhaustive search over `max_features` for an ExtraTreesRegressor fitted on
# all selected targets at once, followed by a hold-out evaluation of the best
# configuration.
model = ExtraTreesRegressor()
param_grid = {
    #'n_estimators': range(50,126,25),
    # The grid starts at 1: max_features=0 is not a valid number of features.
    # NOTE(review): the largest value tried is 22, which assumes the processed
    # feature matrix has at least 22 columns - confirm with X_train.shape[1].
    'max_features': range(1, 25, 3),
    #'min_samples_leaf': range(20,50,5),
    #'min_samples_split': range(15,36,5),
}
# return_train_score=True is required for 'mean_train_score' to be present in
# cv_results_, which the report loop below reads.
g = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

fit_grid = g.fit(X_train, y_train)

# the score is a negated mean squared error (see `scoring` above)
print("Best: %f using %s" % (fit_grid.best_score_, fit_grid.best_params_))

for test_mean, train_mean, param in zip(
        fit_grid.cv_results_['mean_test_score'],
        fit_grid.cv_results_['mean_train_score'],
        fit_grid.cv_results_['params']):
    print("Train: %f // Test : %f with: %r" % (train_mean, test_mean, param))

# `model` above was only a template handed to the search and was never fitted
# itself. Use the estimator the search refitted with the best parameters on
# the whole training subset (GridSearchCV's default refit behaviour).
model = fit_grid.best_estimator_

y_pred = model.predict(X_test)
predictions = y_pred
# evaluate predictions on the hold-out subset
# despite its name, `mse` holds the square root of the mean squared error (RMSE)
mse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error: %.2f" % (mse))

We now use our model to predict the targets for the submission dataset:

# Predict the targets for the processed submission features.
y_pred = model.predict(stest)

Now let's submit:

def subm(y_pred):
    """Write the predictions to ``meosub.csv`` in the submission layout.

    The six submitted targets of every observation are laid out one after the
    other (row by row) and stored in the ``Target`` column of the global
    ``subdf`` frame, which is then written to ``meosub.csv`` without its index.

    Parameters
    ----------
    y_pred : array-like, 2-D
        One row per test observation. Either six columns, in the order of
        ``sub_targets`` below (what a model fitted on the six-column ``y``
        returns), or one column per entry of the global ``targ_list``.

    Raises
    ------
    ValueError
        If ``y_pred`` has neither of the two supported numbers of columns, or
        if the number of values does not match the length of ``subdf``.
    """
    # Submitted targets, in the order expected within each observation.
    sub_targets = [
        "TotalTimeStopped_p20", "TotalTimeStopped_p50", "TotalTimeStopped_p80",
        "DistanceToFirstStop_p20", "DistanceToFirstStop_p50", "DistanceToFirstStop_p80",
    ]

    tpre = pd.DataFrame(y_pred)
    n_cols = tpre.shape[1]
    if n_cols == len(sub_targets):
        # The model was trained on the submitted targets only: labelling these
        # columns with the ten names of `targ_list` would fail.
        tpre.columns = sub_targets
    elif n_cols == len(targ_list):
        tpre.columns = targ_list
    else:
        raise ValueError(
            "y_pred has %d columns, expected %d or %d"
            % (n_cols, len(sub_targets), len(targ_list))
        )

    # Row-major flattening: all six targets of the first observation, then
    # all six of the second, and so on.
    subdf['Target'] = tpre[sub_targets].to_numpy().ravel()
    subdf.to_csv('meosub.csv', index=False)

# Reload a fresh copy of the submission template, then fill it with the
# predictions and write the submission file.
subdf  = pd.read_csv('data/sample_submission.csv') # load submission file
subm(y_pred)

# Print the multiples of 200 from 200 up to and including 1800.
for i in range(200, 2000, 200):
    print(i)