# While the entire program can be run, it is recommended that "optional" sections be ignored for cleaner output. 

In [None]:
import pandas as pd
import numpy as np
import warnings
from numpy import loadtxt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_predict
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import KFold

# Lift pandas' display limits on rows and columns (None = no limit).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [None]:
# Read the training table and the pre-race table from the working directory.
loop_data, event_data = (
    pd.read_csv(csv_path)
    for csv_path in ('SampleTrainingData.csv', 'SamplePreraceData.csv')
)

In [None]:
def grid_search_automation(regressor, params, x, y, *, cv=10, n_jobs=-1):
    """Grid-search ``params`` for ``regressor`` and return the best estimator.

    The search is scored with negated mean absolute error and the best
    (positive) MAE is printed.

    Args:
        regressor: Unfitted estimator to tune.
        params: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        x: Training features.
        y: Training target.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (defaults to the 10 folds used previously).
        n_jobs: Parallelism forwarded to ``GridSearchCV`` (defaults to -1).

    Returns:
        ``grid.best_estimator_`` -- an estimator object, not a dict of
        parameters.
    """
    grid = GridSearchCV(estimator=regressor,
                        param_grid=params,
                        scoring='neg_mean_absolute_error',
                        cv=cv,
                        n_jobs=n_jobs)
    grid.fit(x, y)
    # The scorer reports negated MAE, so flip the sign back for display.
    print('Best Score:', -grid.best_score_)
    return grid.best_estimator_

In [None]:
# GridSearch parameters

# RF
rf_params = {
    'n_estimators': [200,225,250,275],
    'max_depth': [6],
    # 1.0 means "use every feature", which is what 'auto' meant for a
    # RandomForestRegressor; the string 'auto' is rejected by newer
    # scikit-learn releases, while the float works in old and new ones.
    'max_features' : [1.0],
    'max_leaf_nodes': [18,20,22]}

# ET
et_params = {
    'n_estimators':[450,500,550], 
    'max_features': [0.75],
    'max_depth': [9,10,11],
    'min_samples_leaf': [2],
    'max_leaf_nodes': [18,20,22]}    

# GB
gb_params = {'learning_rate': [0.03,0.04], 
              'n_estimators': [175, 200, 225],
              'subsample':[0.9], 
              'max_depth': [3]}

# AdaBoost
ada_params = {'n_estimators':[2,3,4,5],
    'learning_rate':[0.25, 0.4, 0.5]}

# KNN
knn_params = {'n_neighbors':[75]}

# SVM
svm_params = {'epsilon': [1.5, 2, 2.5],
              'max_iter': [2250, 2500, 2750]}

# Experimental Data Preprocessing (Recommended)

In [None]:
# Remove every row whose race_number is 1 or lower (the author's rationale:
# a driver's first race of the season carries no predictive value), then
# renumber the remaining rows from 0.
is_first_race = loop_data['race_number'] <= 1
loop_data = loop_data.loc[~is_first_race].reset_index(drop=True)

In [None]:
# Build a table of drivers that appear in fewer than 3 rows of loop_data;
# those drivers are removed in the next cell.
# NOTE(review): the previous comment said "only 5 races total", but the
# threshold applied here is `< 3` -- confirm which one is intended.
driver_counts = loop_data['driver'].value_counts()
values = driver_counts.keys().tolist()
counts = driver_counts.tolist()

value_counts_df = pd.DataFrame(
    data=np.column_stack((values, counts)),
    index=range(len(values)),
    columns=['values', 'counts'],
)
value_counts_df['counts'] = value_counts_df['counts'].astype('int32')
has_few_rows = value_counts_df['counts'] < 3
value_counts_df = value_counts_df.loc[has_few_rows]

In [None]:
drivers_to_drop = value_counts_df['values'] 

# Select the rows of the listed drivers with one vectorised membership test
# instead of a nested Python loop over every (driver, row) pair.
is_dropped_driver = loop_data['driver'].isin(drivers_to_drop)

# Collect real index labels rather than enumerate() positions, so the drop
# stays correct even if loop_data's index is not 0..n-1.
index_array = loop_data[is_dropped_driver].index.tolist()

loop_data = loop_data.drop(index_array)

In [None]:
# we drop instances without past_three_track_rating data 
# loop_data = loop_data.loc[loop_data['past_three_track_rating'] != 0]

# GraphViz (Optional)

In [None]:
%matplotlib inline
# Histograms (50 bins each) of five selected columns of loop_data.
data_to_plot = loop_data[['average_humidity', 'average_windspeed', 'fpts', 'previous_event_rating', 'rating_to_date']]
data_to_plot.hist(bins=50, figsize=(10,7.5))
pyplot.show()

In [None]:
# Pairwise scatter plots of fpts against start, lap count and wind speed.
attributes = ['fpts', 'start', 'track_total_laps', 'average_windspeed']
pd.plotting.scatter_matrix(loop_data[attributes], figsize=(12,8))

In [None]:
# Pairwise scatter plots of fpts against the two rating columns.
attributes = ['fpts', 'rating_to_date', 'previous_event_rating']
pd.plotting.scatter_matrix(loop_data[attributes], figsize=(12,8))

# train_test_split

In [None]:
# Feature columns used by every layer-1 model; the target is the `fpts` column.
feature_columns = [
    'start',
    'track_total_laps',
    'track_lap_length',
    'rating_to_date',
    'past_three_track_rating',
    'average_windspeed',
]
x = loop_data[feature_columns]
# x = loop_data[['start', 'track_total_laps', 'track_lap_length', 'average_windspeed', 'rating_to_date']]
# x = pd.concat([x, loop_data[driver_dummies_columns]], axis=1)
Y = loop_data[['fpts']]

# Hold out 10% of the rows as a test set, with a fixed random_state.
x_train, x_test, Y_train, Y_test = train_test_split(
    x, Y, test_size=0.1, random_state=42
)

In [None]:
# driver_dummies = pd.get_dummies(loop_data['driver'])
# loop_data = pd.concat([loop_data, driver_dummies], axis=1)

# driver_dummies_columns = driver_dummies.columns

# Feature Scaling

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(x_train)
scaled_test_data = scaler.transform(x_test)

# Wrap the scaled arrays in DataFrames again: same column names as x_train,
# rows renumbered from 0.
scaled_feature_names = list(x_train.columns)
scaled_x_train = pd.DataFrame(scaled_train_data, columns=scaled_feature_names)
scaled_x_test = pd.DataFrame(scaled_test_data, columns=scaled_feature_names)

# Correlation Analysis (Optional)

In [None]:
# Correlation of each listed column with fpts, strongest positive first.
correlation_columns = [
    'start',
    'track_total_laps',
    'track_lap_length',
    'average_windspeed',
    'rating_to_date',
    'previous_event_rating',
    'fpts',
]
for_correlation = loop_data[correlation_columns]

corr_matrix = for_correlation.corr()
corr_matrix['fpts'].sort_values(ascending=False)

# Layer 1 Model Grid Search

In [None]:
rf_best_params = grid_search_automation(RandomForestRegressor(), rf_params, scaled_x_train, Y_train.values.ravel())
et_best_params = grid_search_automation(ExtraTreesRegressor(), et_params, scaled_x_train, Y_train.values.ravel())

In [None]:
svm_best_params = grid_search_automation(LinearSVR(), svm_params, scaled_x_train, Y_train.values.ravel())
ada_best_params = grid_search_automation(AdaBoostRegressor(), ada_params, scaled_x_train, Y_train.values.ravel())
knn_best_params = grid_search_automation(KNeighborsRegressor(), knn_params, scaled_x_train, Y_train.values.ravel())
gb_best_params = grid_search_automation(GradientBoostingRegressor(), gb_params, scaled_x_train, Y_train.values.ravel())

In [None]:
# Print each best estimator returned by the grid searches above.
# The layer-1 models below do not read these objects: their hyperparameters
# have to be copied into the constructors by hand.
for best_estimator in (
    rf_best_params,
    et_best_params,
    svm_best_params,
    ada_best_params,
    knn_best_params,
    gb_best_params,
):
    print(best_estimator)

# Layer 1 Models

## Random Forest

In [None]:
rnd_forest = RandomForestRegressor(n_estimators=250, n_jobs=-1, max_leaf_nodes=None, max_depth=6)
rnd_forest.fit(scaled_x_train, Y_train.values.ravel())

In [None]:
rf_train_predictions = rnd_forest.predict(scaled_x_train)
rf_test_predictions = rnd_forest.predict(scaled_x_test)
train_mae = mean_absolute_error(Y_train, rf_train_predictions)
test_mae = mean_absolute_error(Y_test, rf_test_predictions)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)

In [None]:
scores = cross_val_score(rnd_forest, scaled_x_train, Y_train.values.ravel(), scoring="neg_mean_absolute_error", cv=10)
rf_mae_scores = -scores
print('CV Error:', np.mean(rf_mae_scores))

In [None]:
rf_cv_predictions = cross_val_predict(rnd_forest, scaled_x_train, Y_train.values.ravel(), cv=10)


In [None]:
print(len(scaled_x_train))
print(len(loop_data))

## ExtraTrees

In [None]:
# ExtraTrees regressor with hand-entered hyperparameters.
# `criterion='mse'` and `min_impurity_split=None` were dropped from this call:
# both only restated the defaults (squared-error criterion, no split
# threshold) and both spellings are rejected by newer scikit-learn releases,
# so omitting them keeps the same model on old and new versions.
extra_trees = ExtraTreesRegressor(bootstrap=False, max_depth=10,
          max_features=0.75, max_leaf_nodes=None,
          min_impurity_decrease=0.0,
          min_samples_leaf=3, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
          oob_score=False, random_state=None, verbose=0, warm_start=False)
extra_trees.fit(scaled_x_train, Y_train.values.ravel())

In [None]:
et_train_predictions = extra_trees.predict(scaled_x_train)
et_test_predictions = extra_trees.predict(scaled_x_test)
train_mae = mean_absolute_error(Y_train, et_train_predictions)
test_mae = mean_absolute_error(Y_test, et_test_predictions)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)

In [None]:
scores = cross_val_score(extra_trees, scaled_x_train, Y_train.values.ravel(), scoring="neg_mean_absolute_error", cv=10)
et_mae_scores = -scores
print('CV Error:', np.mean(et_mae_scores))

In [None]:
et_cv_predictions = cross_val_predict(extra_trees, scaled_x_train, Y_train.values.ravel(), cv=10)

## Support Vector Machine

In [None]:
svm_reg = LinearSVR(C=1.0, dual=True, epsilon=2, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=2500,
     random_state=None, tol=0.0001, verbose=0)
svm_reg.fit(scaled_x_train, Y_train.values.ravel())

In [None]:
svm_train_predictions = svm_reg.predict(scaled_x_train)
svm_test_predictions = svm_reg.predict(scaled_x_test)
train_mae = mean_absolute_error(Y_train, svm_train_predictions)
test_mae = mean_absolute_error(Y_test, svm_test_predictions)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)

In [None]:
scores = cross_val_score(svm_reg, scaled_x_train, Y_train.values.ravel(), scoring="neg_mean_absolute_error", cv=10)
svm_mae_scores = -scores
print('CV Error:', np.mean(svm_mae_scores))

In [None]:
svm_cv_predictions = cross_val_predict(svm_reg, scaled_x_train, Y_train.values.ravel(), cv=10)

## Adaboost

In [None]:
ada_boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=10, learning_rate=0.5, random_state=42)
ada_boost.fit(scaled_x_train, Y_train.values.ravel())

In [None]:
ada_train_predictions = ada_boost.predict(scaled_x_train)
ada_test_predictions = ada_boost.predict(scaled_x_test)
train_mae = mean_absolute_error(Y_train, ada_train_predictions)
test_mae = mean_absolute_error(Y_test, ada_test_predictions)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)

In [None]:
scores = cross_val_score(ada_boost, scaled_x_train, Y_train.values.ravel(), scoring="neg_mean_absolute_error", cv=10)
ada_mae_scores = -scores
print('CV Error:', np.mean(ada_mae_scores))

In [None]:
ada_cv_predictions = cross_val_predict(ada_boost, scaled_x_train, Y_train.values.ravel(), cv=10)

## K Nearest Neighbors

In [None]:
knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=75, p=2,
          weights='uniform')
knn.fit(scaled_x_train, Y_train.values.ravel())

In [None]:
knn_train_predictions = knn.predict(scaled_x_train)
knn_test_predictions = knn.predict(scaled_x_test)
train_mae = mean_absolute_error(Y_train, knn_train_predictions)
test_mae = mean_absolute_error(Y_test, knn_test_predictions)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)

In [None]:
scores = cross_val_score(knn, scaled_x_train, Y_train.values.ravel(), scoring="neg_mean_absolute_error", cv=10)
knn_mae_scores = -scores
print('CV Error:', np.mean(knn_mae_scores))

In [None]:
knn_cv_predictions = cross_val_predict(knn, scaled_x_train, Y_train.values.ravel(), cv=10)

## Gradient Boosting

In [None]:
# Gradient boosting regressor with hand-entered hyperparameters.
# `loss='ls'`, `min_impurity_split=None` and `presort='auto'` were dropped
# from this call: each only restated a default (least-squares loss, no split
# threshold, automatic presorting) and each is rejected by newer scikit-learn
# releases, so omitting them keeps the same model on old and new versions.
gb_reg = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.04, max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, random_state=None,
             subsample=0.9, verbose=0, warm_start=False)
gb_reg.fit(scaled_x_train, Y_train.values.ravel())

In [None]:
gb_train_predictions = gb_reg.predict(scaled_x_train)
gb_test_predictions = gb_reg.predict(scaled_x_test)
train_mae = mean_absolute_error(Y_train, gb_train_predictions)
test_mae = mean_absolute_error(Y_test, gb_test_predictions)
print('Train MAE:', train_mae)
print('Test MAE:', test_mae)

In [None]:
scores = cross_val_score(gb_reg, scaled_x_train, Y_train.values.ravel(), scoring="neg_mean_absolute_error", cv=10)
gb_mae_scores = -scores
print('CV Error:', np.mean(gb_mae_scores))

In [None]:
gb_cv_predictions = cross_val_predict(gb_reg, scaled_x_train, Y_train.values.ravel(), cv=10)

# Combining Predictions For New Training Set

In [None]:
cv_prediction_df = pd.DataFrame(data=np.column_stack((rf_cv_predictions, et_cv_predictions, svm_cv_predictions, gb_cv_predictions, knn_cv_predictions)), index=range(len(rf_cv_predictions)), columns=['rf', 'et', 'svm', 'gb', 'knn'])
test_prediction_df = pd.DataFrame(data=np.column_stack((rf_test_predictions, et_test_predictions, svm_test_predictions, gb_test_predictions, knn_test_predictions)), index=range(len(rf_test_predictions)), columns=['rf', 'et', 'svm', 'gb', 'knn'])

In [None]:
scaled_x_train = pd.concat([scaled_x_train, cv_prediction_df], axis = 1)
scaled_x_test = pd.concat([scaled_x_test, test_prediction_df], axis = 1)

# Layer 2 

# GridSearchCV (Run if you don't know your optimal hyperparameters)

In [None]:
# Layer-2 search space for the XGBoost regressor.
# The tree-depth key must be spelled 'max_depth'; the former 'max_depths'
# is not an XGBoost parameter, so depth was never part of the search.
param_grid = [
    {'n_estimators': [500,750,1000,1100,1200],
     'learning_rate': [0.011, 0.012, 0.013],
     'subsample': [0.7,0.9],
     'max_depth': [6]},
]

In [None]:
boost_model = XGBRegressor(booster='gbtree', objective ='reg:squarederror', n_jobs=-1)
grid_search = GridSearchCV(boost_model, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(scaled_x_train, Y_train)

In [None]:
grid_search.best_params_

# Training

In [None]:
# Layer-2 model: XGBoost regressor fitted on the scaled features plus the
# layer-1 prediction columns.
# The depth limit is passed as `max_depth`; the former `max_depths` keyword
# is not an XGBoost parameter, so the limit of 2 was never applied.
my_model = XGBRegressor(booster='gbtree', verbosity=0, n_estimators=500, learning_rate=0.011, subsample=0.9, max_depth=2, n_jobs=-1, colsample_bytree=0.8, gamma=1, objective='reg:squarederror')
my_model.fit(scaled_x_train, Y_train)

# Feature Importance (Optional)

In [None]:
# Disabled alternative: tabulate my_model.feature_importances_ per feature.
# features = x_train.columns
# feature_importances = my_model.feature_importances_
# features_index = range(len(features))
# features_df = pd.DataFrame(np.nan, index= features_index, columns = ['features', 'feature_importances'])
# features_df['features'] = features
# features_df['feature_importances'] = feature_importances

# features_df.sort_values(by=['feature_importances'], ascending=False)
# print(features_df)

# Plot the layer-2 model's feature importances on a 20x10 figure.
# Note that rcParams is global, so this figure size also applies to later plots.
pyplot.rcParams["figure.figsize"] = (20, 10)
plot_importance(my_model)
pyplot.show()

# Training Error

In [None]:
# MAE of the layer-2 model on the rows it was fitted on.
train_results = my_model.predict(scaled_x_train)
train_mae = mean_absolute_error(Y_train, train_results)
print('Train Error: ', train_mae, sep='')

# Cross-Validation

In [None]:
# 10-fold cross-validated MAE of the layer-2 model (the scorer returns
# negated errors).
scores = cross_val_score(
    my_model, scaled_x_train, Y_train,
    scoring="neg_mean_absolute_error", cv=10,
)
tree_mae_scores = -scores
print('CV Error:', tree_mae_scores.mean())

# Testing

In [None]:
# MAE of the layer-2 model on the held-out test split.
test_predictions = my_model.predict(scaled_x_test)
test_mae_score = mean_absolute_error(Y_test, test_predictions)
print('Test Error:', test_mae_score)


In [None]:
# How strongly the layer-1 test predictions correlate with the random
# forest's, strongest first.
layer1_columns = ['rf', 'et', 'svm', 'gb', 'knn']
correlation = scaled_x_test[layer1_columns]

corr_matrix = correlation.corr()
corr_matrix['rf'].sort_values(ascending=False)

# Predictions

In [None]:
x_event = event_data[['start', 'track_total_laps', 'rating_to_date', 'past_three_track_rating', 'average_windspeed']]
# x_event = pd.concat([x_event, event_data[driver_dummies_columns]], axis=1)

print(x.columns)
print(x_event.columns)

In [None]:
scaler = StandardScaler()
scaled_event_data = scaler.fit_transform(x_event)

scaled_x_event = pd.DataFrame(scaled_event_data, index=range(len(scaled_event_data)), columns=['start', 'track_total_laps', 'rating_to_date', 'past_three_track_rating', 'average_windspeed'])


In [None]:
rf_event_predictions = rnd_forest.predict(scaled_x_event)
et_event_predictions = extra_trees.predict(scaled_x_event)
svm_event_predictions = svm_reg.predict(scaled_x_event)
ada_event_predictions = ada_boost.predict(scaled_x_event)
gb_event_predictions = gb_reg.predict(scaled_x_event)

In [None]:
event_prediction_df = pd.DataFrame(data=np.column_stack((rf_event_predictions, et_event_predictions, svm_event_predictions, ada_event_predictions, gb_event_predictions)), index=range(len(rf_event_predictions)), columns=['rf', 'et', 'svm', 'ada', 'gb'])

In [None]:
scaled_x_event = pd.concat([scaled_x_event, event_prediction_df], axis = 1)

In [None]:
event_predictions = my_model.predict(scaled_x_event)
event_predictions_draft= x_event.copy()
event_predictions_draft['fpts'] = event_predictions
event_predictions_draft['driver'] = event_data['driver']
event_predictions_complete = event_predictions_draft[['start', 'fpts', 'driver']]
event_predictions_complete