In [178]:
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression,BayesianRidge
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVR
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


%matplotlib inline

with open('player_details.json') as json_data:
    d = json.load(json_data)
history = json_normalize(d, 'history', ['player_id'])
past = json_normalize(d, 'history_past', ['player_id'])
data = pd.read_json('players.json',orient='columns')
fixtures =  json_normalize(d, 'fixtures', ['player_id'])
teams = pd.read_json('teams.json',orient='columns')

player_type = pd.read_json('game-settings.json',orient='index')
player_type['element_type'] = player_type.index

In [106]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [179]:
#Generic function for making a classification model and accessing performance:
def classification_model(model, data, X, y):
  #Make predictions on training set:
  predictions = []
  model.fit(X,y)
  predictions = model.predict(X)
  accuracy = metrics.accuracy_score(predictions,y)
  print ("Accuracy : %s" % "{0:.3%}".format(accuracy))
  
  #Perform k-fold cross-validation with 5 folds
  kf = KFold(data.shape[0],n_folds=5)
  error = []
  for train, test in kf:
    # Filter training data
    train_predictors = (X.iloc[train,:])
    
    # The target we're using to train the algorithm.
    train_target = y.iloc[train]
    
    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)
    
    #Record error from each cross-validation run
    error.append(model.score(X.iloc[test,:], y.iloc[test]))
 
  print ("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

  #Fit the model again so that it can be refered outside the function:
  model.fit(X,y) 

In [180]:
var_mod = ['photo','web_name','status','first_name','second_name','in_dreamteam','special','news']
le = LabelEncoder()
for i in var_mod:
    data[i] = le.fit_transform(data[i])

data['chance_of_playing_next_round'].fillna(100.0,inplace=True)
data['chance_of_playing_this_round'].fillna(100.0,inplace=True)
data.fillna(0.0,inplace=True)

team_avg = data.query('total_points>0')
team_avg = team_avg[['team','form']]
team_avg = team_avg.groupby(['team'],as_index=False).mean()
team_avg.columns = ['team','avg_team_form']
data = data.merge(team_avg,how='left',on='team')

In [181]:
#past.columns
past = past[['assists','bonus','bps','creativity','end_cost','goals_conceded','goals_scored','influence','minutes','own_goals',
           'penalties_missed','penalties_saved','red_cards','saves','season','start_cost','threat','total_points','yellow_cards',
           'player_id']]
past = past.query('season == 11')
past.columns = ['past_assists','past_bonus','past_bps','past_creativity','past_end_cost','past_goals_conceded','past_goals_scored',
                'past_influence','past_minutes','past_own_goals','past_penalties_missed','past_penalties_saved','past_red_cards',
                'past_saves','past_season','past_start_cost','past_threat','past_total_points','past_yellow_cards','id']
le = LabelEncoder()
past['id'] = le.fit_transform(past['id'])
past['past_creativity'] = le.fit_transform(past['past_creativity'])
past['past_influence'] = le.fit_transform(past['past_influence'])
past['past_threat'] = le.fit_transform(past['past_threat'])
data = data.merge(past,how='left',on='id')
#data.head()

In [182]:
player_type.columns = ['et_bps_clean_sheets','et_bps_goals_scored','et_scoring_clean_sheets','et_scoring_goals_conceded','et_scoring_goals_scored','et_squad_max_play',
             'et_squad_min_play','et_squad_select','et_sub_positions_locked','et_ui_shirt_specific','element_type']

player_t=player_type[['et_bps_clean_sheets','et_bps_goals_scored','et_scoring_clean_sheets','et_scoring_goals_conceded','et_scoring_goals_scored','et_squad_max_play',
             'et_squad_min_play','et_squad_select','element_type' ]]

data = data.merge(player_t,how='left',on='element_type')

In [183]:
teams.columns = ['team_code', 'current_event_fixture','draw','form','team'
                          ,'link_url','loss','team_name','next_event_fixture','played','points'
                          ,'position','short_team_name','strength','strength_attack_away',
                          'strength_attack_home','strength_defence_away','strength_defence_home','strength_overall_away',
                      'strength_overall_home','team_division','unavailable','win']

teams_n=teams[['team','strength','strength_attack_away','strength_defence_away','strength_overall_away',
               'strength_attack_home','strength_defence_home','strength_overall_home','short_team_name' ]]
le = LabelEncoder()
val_col = ['short_team_name']
for i in val_col:
    teams_n[i] = le.fit_transform(teams_n[i])
data = data.merge(teams_n,how='left',on='team')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [184]:
current_fix = history.query('round==7')
current_fix.columns = ['curr_assists', 'curr_attempted_passes', 'curr_big_chances_created',
           'curr_big_chances_missed', 'curr_bonus', 'curr_bps', 'curr_clean_sheets',
           'curr_clearances_blocks_interceptions', 'curr_completed_passes', 'curr_creativity',
           'curr_dribbles', 'curr_ea_index', 'curr_element', 'curr_errors_leading_to_goal',
           'curr_errors_leading_to_goal_attempt', 'curr_fixture', 'curr_fouls', 'curr_goals_conceded',
           'curr_goals_scored', 'curr_ict_index', 'curr_id', 'curr_influence', 'curr_key_passes',
           'curr_kickoff_time', 'curr_kickoff_time_formatted', 'curr_loaned_in', 'curr_loaned_out',
           'curr_minutes', 'curr_offside', 'curr_open_play_crosses', 'curr_opponent_team', 'curr_own_goals',
           'curr_penalties_conceded', 'curr_penalties_missed', 'curr_penalties_saved',
           'curr_recoveries', 'curr_red_cards', 'curr_round', 'curr_saves', 'curr_selected', 'curr_tackled',
           'curr_tackles', 'curr_target_missed', 'curr_team_a_score', 'curr_team_h_score', 'curr_threat',
           'curr_total_points', 'curr_transfers_balance', 'curr_transfers_in', 'curr_transfers_out',
           'curr_value', 'curr_was_home', 'curr_winning_goals', 'curr_yellow_cards', 'id']
current_fix=current_fix[[
           'curr_opponent_team', 'curr_was_home', 'id']]

le = LabelEncoder()
current_fix['id'] = le.fit_transform(current_fix['id'])
current_fix =current_fix[current_fix.id != 0]
current_fix = current_fix.merge(data[['id','team']],how='left',on='id')
team_diff = current_fix.merge(teams_n,how='left',on=['team'])
team_diff.columns = ['curr_opponent_team', 'curr_was_home', 'id','team','curr_strength','curr_strength_attack_away',
                          'curr_strength_attack_home','curr_strength_defence_away','curr_strength_defence_home','curr_strength_overall_away',
                          'curr_strength_overall_home','curr_short_team_name']
team_diff = team_diff.merge(teams_n,how='left',left_on=['curr_opponent_team'], right_on=['team'])
team_diff.columns = ['curr_opponent_team', 'curr_was_home', 'id','team','team_strength','curr_strength_attack_away',
                          'curr_strength_attack_home','curr_strength_defence_away','curr_strength_defence_home','curr_strength_overall_away',
                          'curr_strength_overall_home','curr_short_team_name','opp_team',
                          'opp_strength','curr_opp_strength_attack_away','curr_opp_strength_attack_home',
                          'curr_opp_strength_defence_away','curr_opp_strength_defence_home','curr_opp_strength_overall_away',
                          'curr_opp_strength_overall_home','curr_opp_short_team_name']
team_diff['strength_diff'] = team_diff['team_strength']-team_diff['opp_strength']
team_diff['strength_attack_diff'] = np.where(team_diff['curr_was_home']==True, team_diff['curr_strength_attack_home']-team_diff['curr_opp_strength_attack_away'], team_diff['curr_strength_attack_away']-team_diff['curr_opp_strength_attack_home'])
team_diff['strength_defence_diff'] = np.where(team_diff['curr_was_home']==True, team_diff['curr_strength_defence_home']-team_diff['curr_opp_strength_defence_away'], team_diff['curr_strength_defence_away']-team_diff['curr_opp_strength_defence_home'])
team_diff['strength_overall_diff'] = np.where(team_diff['curr_was_home']==True, team_diff['curr_strength_overall_home']-team_diff['curr_opp_strength_overall_away'], team_diff['curr_strength_overall_away']-team_diff['curr_opp_strength_overall_home'])
team_diff = team_diff[['curr_opponent_team', 'curr_was_home', 'id','strength_diff','strength_attack_diff',
                       'strength_defence_diff','strength_overall_diff']]
data = data.merge(team_diff,how='left',on='id')

In [185]:
#Past Fixture   
past_fix_avg = history.query('round>=1 & round <=6')
past_fix_avg.columns = ['prev_assists3', 'prev_attempted_passes3', 'prev_big_chances_created3',
       'prev_big_chances_missed3', 'prev_bonus3', 'prev_bps3', 'prev_clean_sheets3',
       'prev_clearances_blocks_interceptions3', 'prev_completed_passes3', 'prev_creativity3',
       'prev_dribbles3', 'prev_ea_index3', 'prev_element3', 'prev_errors_leading_to_goal3',
       'prev_errors_leading_to_goal_attempt3', 'prev_fixture3', 'prev_fouls3', 'prev_goals_conceded3',
       'prev_goals_scored3', 'prev_ict_index3', 'prev_id3', 'prev_influence3', 'prev_key_passes3',
       'prev_kickoff_time3', 'prev_kickoff_time_formatted3', 'prev_loaned_in3', 'prev_loaned_out3',
       'prev_minutes3', 'prev_offside3', 'prev_open_play_crosses3', 'prev_opponent_team3', 'prev_own_goals3',
       'prev_penalties_conceded3', 'prev_penalties_missed3', 'prev_penalties_saved3',
       'prev_recoveries3', 'prev_red_cards3', 'prev_round3', 'prev_saves3', 'prev_selected3', 'prev_tackled3',
       'prev_tackles3', 'prev_target_missed3', 'prev_team_a_score3', 'prev_team_h_score3', 'prev_threat3',
       'prev_total_points3', 'prev_transfers_balance3', 'prev_transfers_in3', 'prev_transfers_out3',
       'prev_value3', 'prev_was_home3', 'prev_winning_goals3', 'prev_yellow_cards3', 'id']
past_fix_avg=past_fix_avg[['prev_assists3', 'prev_attempted_passes3', 'prev_big_chances_created3',
       'prev_big_chances_missed3', 'prev_bonus3', 'prev_bps3', 'prev_clean_sheets3',
       'prev_clearances_blocks_interceptions3', 'prev_completed_passes3', 'prev_creativity3',
       'prev_dribbles3', 'prev_ea_index3', 'prev_element3', 'prev_errors_leading_to_goal3',
       'prev_errors_leading_to_goal_attempt3',  'prev_fouls3', 'prev_goals_conceded3',
       'prev_goals_scored3', 'prev_ict_index3',  'prev_influence3', 'prev_key_passes3',
       'prev_kickoff_time_formatted3', 
       'prev_minutes3', 'prev_offside3', 'prev_open_play_crosses3', 'prev_opponent_team3', 'prev_own_goals3',
       'prev_penalties_conceded3', 'prev_penalties_missed3', 'prev_penalties_saved3',
       'prev_recoveries3', 'prev_red_cards3', 'prev_round3', 'prev_saves3',  'prev_tackled3',
       'prev_tackles3', 'prev_target_missed3', 'prev_team_a_score3', 'prev_team_h_score3', 'prev_threat3',
       'prev_total_points3',
       'prev_value3', 'prev_was_home3', 'prev_winning_goals3', 'prev_yellow_cards3', 'id']]
var_col = ['prev_kickoff_time_formatted3','prev_was_home3','prev_threat3', 'prev_influence3','prev_creativity3','prev_ict_index3']
le = LabelEncoder()
for i in var_col:
    past_fix_avg[i] = le.fit_transform(past_fix_avg[i])
#past_fix_avg = past_fix_avg.merge(teams_n,how='left',left_on=['prev_opponent_team3'], right_on=['team'])
past_fix_avg= past_fix_avg.groupby(['id'], as_index=False).mean()
data = data.merge(past_fix_avg,how='left',on='id')

In [186]:
outcome_var = 'event_points'
predictor_var_all = [
'chance_of_playing_this_round', 'status','element_type',

'et_bps_clean_sheets','et_bps_goals_scored','et_scoring_clean_sheets',
'et_scoring_goals_conceded','et_scoring_goals_scored','et_squad_max_play','et_squad_min_play','et_squad_select',

'past_assists','past_bonus','past_bps','past_creativity','past_end_cost','past_goals_conceded','past_goals_scored',
'past_influence','past_minutes','past_own_goals','past_penalties_missed','past_penalties_saved','past_red_cards',
'past_saves','past_season','past_start_cost','past_threat','past_total_points','past_yellow_cards',

'team','strength','strength_attack_away',
'strength_attack_home','strength_defence_away','strength_defence_home','strength_overall_away',
'strength_overall_home','short_team_name','avg_team_form',

'curr_opponent_team', 'curr_was_home','strength_diff','strength_attack_diff',
'strength_defence_diff','strength_overall_diff',

'prev_assists3', 'prev_attempted_passes3', 'prev_big_chances_created3',
'prev_big_chances_missed3', 'prev_bonus3', 'prev_bps3', 'prev_clean_sheets3',
'prev_clearances_blocks_interceptions3', 'prev_completed_passes3', 
'prev_dribbles3', 'prev_ea_index3', 'prev_element3', 'prev_errors_leading_to_goal3',
'prev_errors_leading_to_goal_attempt3',  'prev_fouls3', 'prev_goals_conceded3',
'prev_goals_scored3', 'prev_key_passes3',
'prev_ict_index3',  'prev_influence3', 'prev_creativity3','prev_threat3',
'prev_kickoff_time_formatted3', 
'prev_minutes3', 'prev_offside3', 'prev_open_play_crosses3', 'prev_opponent_team3', 'prev_own_goals3',
'prev_penalties_conceded3', 'prev_penalties_missed3', 'prev_penalties_saved3',
'prev_recoveries3', 'prev_red_cards3', 'prev_round3', 'prev_saves3',  'prev_tackled3',
'prev_tackles3', 'prev_target_missed3', 'prev_team_a_score3', 'prev_team_h_score3',
'prev_total_points3','prev_value3', 'prev_was_home3', 'prev_winning_goals3', 'prev_yellow_cards3']

In [187]:
data.fillna(0.0,inplace=True)


In [205]:
from sklearn.naive_bayes import BernoulliNB 
model = BernoulliNB()
classification_model(model, data,data[predictor_var_all],data[outcome_var])

Accuracy : 52.888%
Cross-Validation Score : 44.242%


In [206]:
model = LogisticRegression()
classification_model(model, data,data[predictor_var_all],data[outcome_var])

Accuracy : 61.913%
Cross-Validation Score : 41.337%


In [162]:
scaler = StandardScaler()
X = scaler.fit_transform(data[predictor_var_all])
Xnew = pd.DataFrame(X)

params_grid = {'C': [0.001, 0.01, 0.1,0.8, 1,5, 10,50, 100],
          'gamma': [0.0001, 0.001, 0.01, 0.1,1],
          'kernel':['rbf'] }
#Create the GridSearchCV object
grid_clf = GridSearchCV(svm.SVC(class_weight='balanced'), params_grid)

#Fit the data with the best possible parameters
grid_clf = grid_clf.fit(X_new2, data[outcome_var])

#Print the best estimator with it's parameters
print (grid_clf.best_estimator_)



SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [175]:
model2 = svm.SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
classification_model(model2, data,Xnew,data[outcome_var])

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import  SelectPercentile, f_classif
X_new2 = SelectPercentile(f_classif, percentile=25).fit_transform(Xnew, data[outcome_var])
Xnew2 = pd.DataFrame(X_new2)
classification_model(model2, data,Xnew2,data[outcome_var])
#cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
#plot_learning_curve(model2, 'SVM 1', Xnew, data[outcome_var], ylim=(0.3, 1.01), cv=cv, n_jobs=4)
#plot_learning_curve(model2, 'SVM 2', Xnew2, data[outcome_var], ylim=(0.3, 1.01), cv=cv, n_jobs=4)
#plt.show()

Accuracy : 32.671%
Cross-Validation Score : 30.144%


 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90] are constant.
  f = msb / msw


Accuracy : 82.491%
Cross-Validation Score : 39.746%


In [219]:
from sklearn.feature_selection import SelectFromModel

model3 = DecisionTreeClassifier()
classification_model(model3, data,data[predictor_var_all],data[outcome_var])
#m = SelectFromModel(model3, prefit=True)
#X_new2 = m.transform(data[predictor_var_all])
#Xnew2 = pd.DataFrame(X_new2)
#classification_model(model3, data,Xnew2,data[outcome_var])
import graphviz 
dot_data = export_graphviz(model3, out_file=None,feature_names=predictor_var_all,  
                         class_names=outcome_var,  
                         filled=True, rounded=True,  
                         special_characters=True) 
graph = graphviz.Source(dot_data) 
graph
#cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
#plot_learning_curve(model3, 'Decision Tree 1', data[predictor_var_all], data[outcome_var], ylim=(0.3, 1.01), cv=cv, n_jobs=4)
#plot_learning_curve(model3, 'Decision Tree 2', Xnew2, data[outcome_var], ylim=(0.3, 1.01), cv=cv, n_jobs=4)
#plt.show()

Accuracy : 99.097%
Cross-Validation Score : 38.446%


IndexError: string index out of range

In [171]:
model4 = RandomForestClassifier(n_estimators = 30,max_depth=7)#,max_features='log2')
classification_model(model4, data,data[predictor_var_all],data[outcome_var])
m = SelectFromModel(model4, prefit=True)
X_new2 = m.transform(data[predictor_var_all])
Xnew2 = pd.DataFrame(X_new2)
classification_model(model4, data,Xnew2,data[outcome_var])

Accuracy : 74.368%
Cross-Validation Score : 50.540%
Accuracy : 77.617%
Cross-Validation Score : 49.273%
