In [219]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from  sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import pickle   
import datetime
from sklearn.externals import joblib
import xgboost as xgb
import warnings
from sklearn import svm

warnings.filterwarnings("ignore")

warnings.filterwarnings("ignore", category=DeprecationWarning)
sns.set_style('whitegrid')
%matplotlib inline

con = sqlite3.connect('/home/Project_DS/Database/Sports_Results.db')
df = pd.read_sql_query("select * from ThroneHistorical;", con)
to_pred = pd.read_sql_query("select * from ThronePrediction;", con)
id_pred = to_pred.id
date_pred = to_pred.date

# Select current season and previous season
# Computate the season
now = datetime.datetime.now()

if now.month < 8:
    season_current = now.year - 1
    season_previous = now.year
if now.month > 8:
    season_current = now.year

season_previous = now.year - 1

df_previous = df[df['year'].astype(int) < 2017]

df['year'] = df['year'].astype(int)
df = df[df['year'].astype(int) >= season_previous].reset_index(drop = True) 


def preprocess_date(data_input, data_to_pred_input):
    
    data_to_pred = data_to_pred_input.copy()
    # Change to datetime and extract the hour
    data_input['date'] = pd.to_datetime(data_input['date'])
    data_input['hour'] = data_input['date'].apply(lambda x: x.hour)
    data_to_pred['date'] = pd.to_datetime(data_to_pred['date'])
    data_to_pred['hour'] = data_to_pred['date'].apply(lambda x: x.hour)
    
    # Drop datetime and date
    data_input.drop(['id','date','competition'], axis = 1, inplace = True)
    data_to_pred.drop(['id','competition','team_1_prob','team_2_prob','team_tie_prob','date','confidence'], axis = 1, inplace = True)
    
    # Label encoding
    to_stack = df[['team_1_name', 'team_2_name']]
    to_stack = pd.DataFrame(to_stack.stack()).reset_index(drop = True)[0]

    le = preprocessing.LabelEncoder()
    le.fit(to_stack)
    data_input['team_1_name'] = le.transform(data_input['team_1_name']) 
    data_input['team_2_name'] = le.transform(data_input['team_2_name'])
    data_input['day'] = data_input['day'].astype(int)
    data_input['month'] = data_input['month'].astype(int)

    data_to_pred['team_1_name'] = le.transform(data_to_pred['team_1_name']) 
    data_to_pred['team_2_name'] = le.transform(data_to_pred['team_2_name'])
    data_to_pred['day'] = data_to_pred['day'].astype(int)
    data_to_pred['month'] = data_to_pred['month'].astype(int)
    
    return data_input, data_to_pred

def fezture_to_predict(row):
    if row['team_1_score'] > row['team_2_score']:
        return 1
    if row['team_2_score'] > row['team_1_score']:
        return 2
    if row['team_1_score'] == row['team_2_score']:
        return 0

def aggreg_previous(data_input):
    data_input = data_input[data_input['year'] == 2016]
    
    return data_input

df, to_pred_ = preprocess_date(df, to_pred)
df['To_Predict'] = df.apply (lambda row: fezture_to_predict(row), axis=1)
df.drop(['team_1_score','team_2_score'], inplace = True, axis =1 )
df_previous['Result'] = df_previous.apply (lambda row: fezture_to_predict(row), axis=1)

X =  df.drop('To_Predict', axis = 1)
y =  df['To_Predict']

# I. Repartition for all classes

In [208]:
# Repartition of class
#fig, (axis1) = plt.subplots(1,1,figsize=(25,8))
#sns.countplot(x = df['To_Predict'], order=[0,1,2], ax=axis1)
#plt.title('Reapartition of classes 2017')
#plt.show()

In [209]:
# Repartition of class
#fig, (axis1) = plt.subplots(1,1,figsize=(25,8))
#sns.countplot(x = df_previous['Result'], order=[0,1,2], ax=axis1)
#plt.title('Reapartition of classes 2016')
#plt.show()

# II. Model

In [235]:
def model(clf, name):   
    
    clf.fit(X, y)
    print(clf.best_params_ )
    print("------------------")
    print(clf.best_score_)
    with open(name + '.pkl', 'wb') as fh:                                             
        pickle.dump(clf, fh)
    
def prediction(data_to_pred, filename):
    loaded_model = joblib.load(filename)
    result = loaded_model.predict_proba(data_to_pred)

    return result

In [None]:
param_grid = {"max_depth": [1,2,3, 4, 5, 8, None],
              "n_estimators": [50, 75, 100, 150],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

RF = GridSearchCV(RandomForestClassifier(n_jobs = -1), cv = 3, param_grid = param_grid, verbose = 0, n_jobs = -1, scoring = 'accuracy')

model(RF, 'RF')

In [169]:
param_grid = {"penalty": ["l2"],
              "C": [0.01, 0.1, 1, 10, 50, 75, 100, 1000],
              "max_iter": [25, 50, 75, 100, 200, 500],
              "solver": ['newton-cg', 'lbfgs', 'sag', 'saga'],
              "multi_class": ['ovr', 'multinomial']}

LR = GridSearchCV(LogisticRegression(), cv = 3, param_grid = param_grid, verbose = 0, n_jobs = -1, scoring = 'accuracy')

model(LR, 'LR')

{'C': 50, 'penalty': 'l2', 'max_iter': 50, 'multi_class': 'ovr', 'solver': 'lbfgs'}
------------------
0.5672823219


In [203]:
sgd = SGDClassifier()

param_grid = {'penalty':['l1', 'l2','elasticnet'],
             'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
             'learning_rate': ['optimal'],
             'max_iter':[10000],
             'l1_ratio': [0.15, 0.25, 0.5, 0.75, 0.85]}


SGD = GridSearchCV(sgd,  cv = 3, param_grid = param_grid, verbose = 0, n_jobs = -1, scoring = 'accuracy')

model(SGD, 'SGD')

{'l1_ratio': 0.75, 'penalty': 'l1', 'max_iter': 10000, 'learning_rate': 'optimal', 'loss': 'log'}
------------------
0.560686015831


In [206]:
svc = svm.SVC(probability = True)

param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
              'gamma': [1e-3, 1e-4, 1e-5, 1e-6],
               'C': [2000, 2500, 3000, 3500],
             'degree': [3]}

SVC = GridSearchCV(svc,  cv = 3, param_grid = param_grid, verbose = 0, n_jobs = -1, scoring = 'accuracy')

model(SVC, 'SVC')

{'C': 3000, 'degree': 3, 'kernel': 'rbf', 'gamma': 1e-06}
------------------
0.552770448549


In [243]:
#print(prediction(to_pred_, 'RF.pkl'))
#print(prediction(to_pred_, 'LR.pkl'))
#print(prediction(to_pred_, 'SGD.pkl'))
#print(prediction(to_pred_, 'SVC.pkl'))

# III. Prediction and submission

In [242]:
to_pred['team_tie_prob']  = prediction(to_pred_, 'RF.pkl')[:,0]
to_pred['team_1_prob']  = prediction(to_pred_, 'RF.pkl')[:,1]
to_pred['team_2_prob']  = prediction(to_pred_, 'RF.pkl')[:,2]

In [241]:
import peyton

throne = peyton.Throne(username='JulienHeiduk', token="bcebc6d7-f224-4940-a816-74e3e6d4c34a")

# Submit predictions 
throne.competition('Italian Serie A').submit(to_pred)

{'success': True}