# Traffic Volume Dataset - Sourced from: https://www.kaggle.com/rohith203/traffic-volume-dataset

## Importing of Packages needed and Data to be used

In [1]:
# Import packages
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn import metrics
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw training data and preview a random handful of rows
train_df = pd.read_csv(filepath_or_buffer='traffic-volume-train.csv')
train_df.sample(n=5)

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
1250,11/17/12 6:00,0,99,65,2,210,6,6,266.27,0.0,0.0,1,Clear,sky is clear,1381
5101,4/13/13 21:00,0,202,23,3,290,9,9,272.86,0.0,0.0,90,Snow,heavy snow,2999
24569,6/30/16 11:00,0,271,64,1,232,9,9,294.07,0.0,0.0,90,Thunderstorm,thunderstorm,4175
26670,9/16/16 19:00,0,144,46,2,247,2,2,294.75,0.0,0.0,40,Thunderstorm,proximity thunderstorm,3206
29689,1/2/17 14:00,0,154,90,4,270,7,7,272.59,0.0,0.0,90,Snow,light snow,3933


## Preparation of Data

In [3]:
# Preparing data (training): parse the raw date_time strings into pandas datetimes
train_df['date_time'] = pd.to_datetime(train_df['date_time'])

In [4]:
# Break the timestamp into separate calendar/clock columns (added in this order)
for part in ('year', 'month', 'day', 'hour'):
    train_df[part] = getattr(train_df['date_time'].dt, part)

In [5]:
# Drop the columns that cannot be obtained from the available Singapore API calls,
# so the resulting model can be applied easily in a Singapore context
columns_unavailable_in_sg = [
    'date_time', 'visibility_in_miles', 'rain_p_h', 'snow_p_h',
    'weather_type', 'dew_point', 'weather_description', 'clouds_all',
]
train_sg = train_df.drop(columns=columns_unavailable_in_sg)


In [6]:
# Temperature unit conversion helper
def kelvin_to_celsius(temp_kelvin):
    """Convert a temperature from Kelvin to degrees Celsius.

    Parameters
    ----------
    temp_kelvin
        Temperature in Kelvin. Anything that supports subtracting a float
        works: a plain number, or an array-like such as a pandas Series
        (converted element-wise).

    Returns
    -------
    The input shifted down by 273.15, in the same container type.
    """
    return temp_kelvin - 273.15

In [7]:
# Add a Celsius column computed from the existing temperature column
train_sg["temperature_C"] = train_sg["temperature"].pipe(kelvin_to_celsius)

In [8]:
# Remove the original temperature column now that temperature_C has been added, then preview
train_sg = train_sg.drop(columns=['temperature'])
train_sg.head()

Unnamed: 0,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,traffic_volume,year,month,day,hour,temperature_C
0,0,121,89,2,329,5545,2012,10,2,9,15.13
1,0,178,67,3,330,4516,2012,10,2,10,16.21
2,0,113,66,3,329,4767,2012,10,2,11,16.43
3,0,20,66,3,329,5026,2012,10,2,12,16.98
4,0,281,65,3,329,4918,2012,10,2,13,17.99


In [9]:
# Reorder the columns so that the target (traffic_volume) comes last.
# NOTE(review): 'hour' is not in this list, so the selection also removes it
# from train_sg -- confirm that dropping it is intended.
column_order = [
    'is_holiday', 'air_pollution_index', 'humidity', 'wind_speed',
    'wind_direction', 'year', 'month', 'day', 'temperature_C',
    'traffic_volume',
]
train_sg = train_sg[column_order]
train_sg.head()

Unnamed: 0,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,year,month,day,temperature_C,traffic_volume
0,0,121,89,2,329,2012,10,2,15.13,5545
1,0,178,67,3,330,2012,10,2,16.21,4516
2,0,113,66,3,329,2012,10,2,16.43,4767
3,0,20,66,3,329,2012,10,2,16.98,5026
4,0,281,65,3,329,2012,10,2,17.99,4918


# Baseline Model - Decision Tree

In [10]:
# Feature matrix: every column except the prediction target.
# Selecting by name instead of by position (previously the first 9 columns)
# keeps this correct if feature columns are added, removed or reordered.
X = train_sg.drop(columns=['traffic_volume'])
X.sample(5)

Unnamed: 0,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,year,month,day,temperature_C
18720,0,272,79,5,298,2015,10,1,9.1
29072,0,30,100,2,355,2016,12,12,-15.82
18461,0,134,73,4,339,2015,9,21,26.65
3399,0,118,85,7,300,2013,2,6,-12.13
30926,0,72,47,1,310,2017,2,10,4.93


In [11]:
# Prediction target
y = train_sg.loc[:, "traffic_volume"]

In [12]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [13]:
# Baseline model: a decision tree classifier (Gini impurity) with a fixed seed.
# NOTE(review): the target is the raw traffic_volume value, so every distinct
# volume is presumably treated as its own class -- a regressor with a regression
# metric may be a better fit; confirm the intended problem framing.
clf = DecisionTreeClassifier(random_state=0, criterion='gini')

# Fit on the training split (the fitted estimator is the cell's displayed value)
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=0)

In [14]:
# Predict on the held-out rows and tabulate the predictions next to the true values
predictions = clf.predict(X_test)
results_df = y_test.to_frame(name='Actual').assign(Predicted=predictions)
results_df.sample(10)

Unnamed: 0,Actual,Predicted
18849,1596,1500
19627,3541,299
2330,3071,2537
22789,5521,6807
23488,4608,2672
6673,3270,4479
18297,3025,4746
8734,5021,6172
16709,695,624
25515,4947,5348


In [15]:
# Accuracy: share of test rows whose predicted value equals the actual value exactly
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.09945679012345679


## Training other models

In [16]:
def train_predict(classifier, X_train, y_train, X_test, y_test):
    """Fit a classifier on the training split and score it on the test split.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict``. If it is an ``SVC``
        instance it is replaced by a freshly configured linear-kernel ``SVC``
        (see below) before training; the object passed in is left unfitted.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for scoring.

    Returns
    -------
    dict
        ``{'accuracy': <accuracy_score of the predictions on the test set>}``.
    """
    # Use a type check rather than `==`: comparing against a brand-new SVC()
    # is an identity comparison and is therefore always False, which meant
    # the configuration below was never applied.
    # NOTE(review): a linear-kernel SVC can be slow to train on large or
    # unscaled data -- confirm the runtime is acceptable.
    if isinstance(classifier, SVC):
        classifier = SVC(C=1, kernel='linear', decision_function_shape='ovo', gamma='auto')

    # Train, then predict on the held-out rows
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Only the test-set accuracy is reported
    return {'accuracy': accuracy_score(y_test, y_pred)}

In [None]:
# Candidate models for this run.
# GradientBoostingClassifier(random_state=1) and LogisticRegression() are
# currently left out of the comparison.
modelA = GaussianNB()
modelD = SVC()

# Train and score each candidate, collecting the scores under the model's class name
results = {}
for model in (modelA, modelD):
    model_name = type(model).__name__
    print(f'Training {model_name}')
    results[model_name] = {}  # the entry exists even if training raises below
    results[model_name] = train_predict(model, X_train, y_train, X_test, y_test)

Training GaussianNB
Training SVC
