# M6 - WEEK 6 | PROJECT: Kaggle Competition - Predict Traffic Congestion

>"Our task here is to predict traffic congestion based on aggregate measures of  stopping distance and waiting times at intersections in 4 major US cities."

## Goal

 -   process data
 -  run model
 - submit predictions

### Process data

We start by importing all useful modules:

In [1]:
import pandas as pd # import pandas
import numpy as np # import numpy

# encoding and splitting
from sklearn import preprocessing # preprocessing for Label and OneHotEncode
from sklearn.model_selection import train_test_split #for train test split

# models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor


## other necessary models
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor

## import Gridsearch for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# used metrics
from sklearn.metrics import mean_squared_error # import mean squared error as we are suppose to use rmse

#silence warning
import warnings
warnings.filterwarnings("ignore") 

We will start by importing our datasets:

In [2]:
train = pd.read_csv('data/train.csv') # load train
test = pd.read_csv('data/test.csv') #load test
subdf  = pd.read_csv('data/sample_submission.csv') # load submission file

Then we define a function to process data:

In [3]:
def dataproc(df):
    _df = df
    
    # 'IntersectionId', 'EntryStreetName', 'ExitStreetName', 'Path',   --- removed as latitutde and longitude gives location and heading gives direction
    feat_list = ['Latitude', 'Longitude', 'EntryHeading', 'ExitHeading', 'Hour', 'Weekend', 'Month', 'City']
    num_feat_list = ['Latitude', 'Longitude', 'Hour', 'Weekend', 'Month']
    cat_feat_list = ['EntryHeading', 'ExitHeading', 'City']
    
    
    X = _df[feat_list]
        
    #select cat
    X1 = X.select_dtypes(include=[object])

    #LabelEncode
    # instantiate
    le = preprocessing.LabelEncoder()

    # fit and transform
    X2 = X1.apply(le.fit_transform)

    #OneHotEncode
    # instantiate
    enc = preprocessing.OneHotEncoder()

    # fit and transform
    enc.fit(X2)
    X3 = enc.transform(X2).toarray()

    X3p = pd.DataFrame(X3, columns = ["Cat_"+str(int(i)) for i in range(X3.shape[1])])

    Xx = pd.concat([X, X3p], axis=1) # merge new features with old
    X = Xx.drop(columns=cat_feat_list, axis=1) # drop unecessary features
    
    return X

In [13]:
X = dataproc(train) # train processed dataset

# list of targets
targ_list = ['TotalTimeStopped_p20', 'TotalTimeStopped_p40', 'TotalTimeStopped_p50', 'TotalTimeStopped_p60', 'TotalTimeStopped_p80', 'DistanceToFirstStop_p20', 'DistanceToFirstStop_p40', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p60', 'DistanceToFirstStop_p80']

#y = train[targ_list] #target dataset will all targets, non submitted included
y = train[["TotalTimeStopped_p20", "TotalTimeStopped_p50", "TotalTimeStopped_p80", "DistanceToFirstStop_p20", "DistanceToFirstStop_p50", "DistanceToFirstStop_p80"]]
stest = dataproc(test) # processed test submission data

To train our model let's split our train dataset

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) #, train_size=100000, test_size=5000

We list all potentially eligible models:

In [28]:
models = {
    'knn': KNeighborsRegressor(),
    'linear regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge':  RidgeCV(),
    'elasticNet': ElasticNet(),
    'random forest': RandomForestRegressor(),
    'decision tree':DecisionTreeRegressor(),
    'extra-trees': ExtraTreesRegressor(),
    'sdg regressor': SGDRegressor(),
    'ada boost regressor': AdaBoostRegressor(),
    'gradient boosting regressor': GradientBoostingRegressor(),
    'extreme boosting regressor': XGBRegressor(**{'objective':'reg:squarederror'})
}

Let's benchmark and see

In [29]:
rmse_matrix = pd.DataFrame(index=y.columns, columns=models.keys())

for col in y.columns:
        
    y_pred = dict()
    mse = dict()
    
    for key, model in models.items():     
        model.fit(X_train, y_train[col])                    
        y_pred[key] = model.predict(X_test)   
        rmse_matrix.at[col,key]= np.sqrt(mean_squared_error(y_test[col], model.predict(X_test)))

rmse_matrix

Unnamed: 0,knn,linear regression,lasso,ridge,elasticNet,random forest,decision tree,extra-trees,sdg regressor,ada boost regressor,gradient boosting regressor,extreme boosting regressor
TotalTimeStopped_p20,7.45042,7.10062,7.11888,7.10061,7.11745,6.09291,7.75702,6.83125,129191.0,9.08077,6.93449,6.93495
TotalTimeStopped_p50,15.538,15.3267,15.4608,15.3267,15.4485,10.3369,13.0754,11.6222,163863.0,17.3515,14.6608,14.6594
TotalTimeStopped_p80,27.5124,27.4911,27.7419,27.4911,27.7688,17.6893,22.5599,19.6159,14908.2,35.8628,26.1302,26.1413
DistanceToFirstStop_p20,30.2544,28.5857,28.6499,28.5857,28.6493,26.4791,32.8297,28.097,35350.3,42.2383,28.0022,28.0016
DistanceToFirstStop_p50,74.066,70.2786,70.5311,70.2783,70.571,52.16,66.133,55.5498,242792.0,85.9852,67.9601,67.8476
DistanceToFirstStop_p80,157.804,150.745,151.149,150.744,151.347,88.5512,109.501,95.5399,82639.9,223.667,140.253,140.279


In [32]:
for i in models.keys():
    print(rmse_matrix[i].mean())

52.10425165270863
49.92131525904475
50.10858346222445
49.9211043168476
50.150252925505164
33.55156064951603
41.975954189421245
36.20933219050844
111457.5486129246
69.03088428580021
47.32338819447214
47.310564516058456


In [39]:
from sklearn.model_selection import RandomizedSearchCV

model = ExtraTreesRegressor()
param_grid={'n_estimators': range(200,2001,200), 'max_features': range(5,9)}                            
g = RandomizedSearchCV(estimator=model, param_distributions = param_grid, scoring='neg_mean_squared_error', cv=3, n_iter = 100)

fit_grid = g.fit(X_train, y_train)

print("Best: %f using %s" % (fit_grid.best_score_, fit_grid.best_params_))

for test_mean, train_mean, param in zip(
        fit_grid.cv_results_['mean_test_score'],
        fit_grid.cv_results_['mean_train_score'],
        fit_grid.cv_results_['params']):
    print("Train: %f // Test : %f with: %r" % (train_mean, test_mean, param))
    
#model = ExtraTreesRegressor(**fit_grid.best_params_)

#model.fit(train, y_train)

#df_sub = pd.DataFrame({'ID': id_test, 'y': model.predict(test)})
#df_sub.to_csv('mercedes-submission.csv', index=False)

y_pred = model.predict(X_test)
predictions = y_pred
# evaluate predictions
mse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error: %.2f" % (mse))

KeyboardInterrupt: 

In [33]:
model = ExtraTreesRegressor()
param_grid={
        #'n_estimators': range(50,126,25),
        'max_features': range(0,25,3),
        #'min_samples_leaf': range(20,50,5),
        #'min_samples_split': range(15,36,5),
    }                            
g = GridSearchCV(estimator=model, param_grid = param_grid, scoring='neg_mean_squared_error', cv=5)

fit_grid = g.fit(X_train, y_train)

print("Best: %f using %s" % (fit_grid.best_score_, fit_grid.best_params_))

for test_mean, train_mean, param in zip(
        fit_grid.cv_results_['mean_test_score'],
        fit_grid.cv_results_['mean_train_score'],
        fit_grid.cv_results_['params']):
    print("Train: %f // Test : %f with: %r" % (train_mean, test_mean, param))
    
#model = ExtraTreesRegressor(**fit_grid.best_params_)

#model.fit(train, y_train)

#df_sub = pd.DataFrame({'ID': id_test, 'y': model.predict(test)})
#df_sub.to_csv('mercedes-submission.csv', index=False)

y_pred = model.predict(X_test)
predictions = y_pred
# evaluate predictions
mse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error: %.2f" % (mse))

ValueError: max_features must be in (0, n_features]

I will use our model to predict based on submission:

In [28]:
y_pred = model.predict(stest)

Now let's submit:

In [33]:
def subm(y_pred):
    tpre = pd.DataFrame(y_pred, columns=targ_list)
    dpred = pd.DataFrame({"0": tpre["TotalTimeStopped_p20"], "1": tpre["TotalTimeStopped_p50"], "2": tpre["TotalTimeStopped_p80"], "3": tpre["DistanceToFirstStop_p20"], "4": tpre["DistanceToFirstStop_p50"], "5": tpre["DistanceToFirstStop_p80"]})
    subdf['Target'] = dpred.stack().values
    subdf.to_csv('meosub.csv', index=False)

In [34]:
subdf  = pd.read_csv('data/sample_submission.csv') # load submission file
subm(y_pred)

In [37]:
for i in(range(200,2000,200)):
    print(i)

200
400
600
800
1000
1200
1400
1600
1800
