In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as cs
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
trafficVolum = pd.read_csv("traffic_flow_data.csv",index_col=0)
#trafficVolum.describe()
target_data = trafficVolum.loc[:,"Segment23_(t+1)"] 
len(trafficVolum.columns)
#world cup attributes
w_features=trafficVolum.iloc[:,np.arange(449)].copy()
#world cup goal result
w_target=trafficVolum.iloc[:,449].copy()


In [3]:
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames in this wise manner yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

def feature_process(features):

    w_features_num = features;

    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(list(w_features_num))),
            ('imputer', Imputer(strategy="median")),
            ('std_scaler', StandardScaler(with_mean = False)),
        ])

    full_pipeline = num_pipeline
    return full_pipeline

In [4]:
full_pipeline = feature_process(w_features)

feature_prepared = pd.DataFrame(data=full_pipeline.fit_transform(w_features),index=np.arange(1,len(w_features) + 1))

train_data = feature_prepared[:int(0.8 * len(feature_prepared))]
test_data = feature_prepared[int(0.8 * len(feature_prepared)):]
train_results = w_target[:int(0.8 * len(w_target))]
test_results = w_target[int(0.8 * len(w_target)):]

In [17]:
# Decision Trees For Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


params = {'max_leaf_nodes': list(range(95,100)), 'min_samples_split': [2,3,4,5]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
print grid_search_cv.best_estimator_

T_predict = grid_search_cv.predict(test_data)
W_predict = grid_search_cv.predict(train_data)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=95,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


In [18]:
print(' ')
# The coefficients
#print('Coefficients and Intercept are: ', model.coef_,"   ",model.intercept_,' respectively')
# The mean squared error
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f"
      % mean_squared_error(test_results, T_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score for testing data: %.2f' % r2_score(test_results, T_predict))
print('******************************************************* ')
print("Mean squared error for training data: %.2f"
      % mean_squared_error(train_results, W_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score for training data: %.2f' % r2_score(train_results, W_predict))

 
_________________###################____________________
Mean squared error for testing data: 2018.72
Variance score for testing data: 0.94
******************************************************* 
Mean squared error for training data: 2813.66
Variance score for training data: 0.93


In [13]:
# Linear Ridge Regression
from sklearn.linear_model import Ridge

model = Ridge(alpha=1.0)
model.fit(train_data, train_results)
T_predict = model.predict(test_data)
W_predict = model.predict(train_data)
print model

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [14]:
print(' ')
# The coefficients
#print('Coefficients and Intercept are: ', model.coef_,"   ",model.intercept_,' respectively')
# The mean squared error
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f"
      % mean_squared_error(test_results, T_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score for testing data: %.2f' % r2_score(test_results, T_predict))
print('******************************************************* ')
print("Mean squared error for training data: %.2f"
      % mean_squared_error(train_results, W_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score for training data: %.2f' % r2_score(train_results, W_predict))

 
_________________###################____________________
Mean squared error for testing data: 834.91
Variance score for testing data: 0.98
******************************************************* 
Mean squared error for training data: 1127.08
Variance score for training data: 0.97


In [15]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression

model = LinearRegression(n_jobs = None)
model.fit(train_data, train_results)
T_predict = model.predict(test_data)
W_predict = model.predict(train_data)
print model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [16]:
print(' ')
# The coefficients
#print('Coefficients and Intercept are: ', model.coef_,"   ",model.intercept_,' respectively')
# The mean squared error
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f"
      % mean_squared_error(test_results, T_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score for testing data: %.2f' % r2_score(test_results, T_predict))
print('******************************************************* ')
print("Mean squared error for training data: %.2f"
      % mean_squared_error(train_results, W_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score for training data: %.2f' % r2_score(train_results, W_predict))

 
_________________###################____________________
Mean squared error for testing data: 843.81
Variance score for testing data: 0.98
******************************************************* 
Mean squared error for training data: 1123.80
Variance score for training data: 0.97
