In this notebook we analyze the dataset to obtain a team of best players. We choose the players with highest overall for our formation 4-3-3

If you are running all the notebooks make sure you run this one first to generate the csv file with best players. `Predicting_Games_Specific_Season` notebook will use that csv file.

In [1]:
## Importing required libraries
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA, FastICA
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
database = 'database.sqlite'
conn = sqlite3.connect(database)

In [3]:
#Fetching required data tables
country_data = pd.read_sql("SELECT * FROM Country;", conn)
league_data = pd.read_sql("SELECT * FROM League;", conn)
match_data = pd.read_sql("SELECT * FROM Match;", conn)
player_data = pd.read_sql("SELECT * FROM Player;", conn)
player_attr_data = pd.read_sql("SELECT * FROM Player_Attributes;", conn)
team_data = pd.read_sql("SELECT * FROM Team;", conn)
team_attr_data = pd.read_sql("SELECT * FROM Team_Attributes;", conn)

In [4]:
starting_date = '2015-09-01'
ending_date = '2016-01-01'

In [5]:
higher_up = player_attr_data[player_attr_data['date'] > starting_date]
player_overall_df = higher_up[higher_up['date'] <  ending_date].drop_duplicates(subset = ["player_api_id"])


## currently we are only concerned with overall_rating

# player_overall_df = player_overall_df[['player_api_id', 'overall_rating']]
overall_higher = player_overall_df[player_overall_df['overall_rating'] > 80]

In [6]:
player_overall_df.columns

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

In [7]:
overall_higher[overall_higher["defensive_work_rate"] == 'high'].sort_values(by=['overall_rating'], ascending=False).head(4)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
169805,169806,164240,80562,2015-09-21 00:00:00,88.0,88.0,right,high,high,60.0,...,74.0,71.0,90.0,91.0,89.0,9.0,12.0,5.0,9.0,10.0
143231,143232,121939,30894,2015-11-26 00:00:00,87.0,87.0,right,medium,high,84.0,...,83.0,71.0,87.0,87.0,95.0,11.0,12.0,5.0,14.0,5.0
64464,64465,138956,41884,2015-11-26 00:00:00,86.0,86.0,left,low,high,52.0,...,46.0,50.0,90.0,90.0,90.0,3.0,3.0,2.0,4.0,3.0
170800,170801,189596,116772,2015-10-23 00:00:00,86.0,88.0,right,high,high,79.0,...,85.0,90.0,32.0,41.0,44.0,6.0,7.0,11.0,14.0,14.0


In [8]:
df_best_11 = pd.DataFrame()

### Obtain the goal keeper for the team

In [9]:
best_goal_keeper = overall_higher[overall_higher["gk_reflexes"] > 85].sort_values(by=['overall_rating'], ascending=False).iloc[0:1]
goal_keeper_api = best_goal_keeper["player_api_id"]

In [10]:
goal_keeper_api.values[0]

27299

In [11]:
df_best_11 = df_best_11.append(player_data[player_data['player_api_id'] == goal_keeper_api.values[0]])

In [12]:
def generate_players_positions(defensive_rate, attacking_rate, number):
    best_players = overall_higher[overall_higher["defensive_work_rate"] == defensive_rate]\
    [overall_higher[overall_higher["defensive_work_rate"] == defensive_rate]\
     ['attacking_work_rate']== attacking_rate].sort_values(by=['overall_rating'], ascending=False).head(number)
    return best_players

### We have opted for 4-3-3 formation. A good defender should have high defending work rate and low attacking work rate

In [13]:
best_defenders = generate_players_positions("high", "low", 4)
best_defenders

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
64464,64465,138956,41884,2015-11-26 00:00:00,86.0,86.0,left,low,high,52.0,...,46.0,50.0,90.0,90.0,90.0,3.0,3.0,2.0,4.0,3.0
11450,11451,137186,30902,2015-12-03 00:00:00,84.0,84.0,right,low,high,40.0,...,50.0,50.0,90.0,90.0,87.0,4.0,2.0,4.0,2.0,4.0
162198,162199,192227,184999,2015-12-10 00:00:00,82.0,87.0,right,low,high,51.0,...,52.0,54.0,81.0,82.0,85.0,11.0,9.0,15.0,10.0,6.0
61667,61668,193532,49970,2015-12-17 00:00:00,81.0,81.0,right,low,high,59.0,...,70.0,62.0,80.0,83.0,80.0,14.0,7.0,9.0,6.0,6.0


### A good midfielder should have low defending work rate and medium attacking work rate

In [14]:
best_midfielders = generate_players_positions("low", "medium", 3)
best_midfielders

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
102482,102483,158023,30981,2015-12-17 00:00:00,94.0,94.0,left,medium,low,80.0,...,90.0,74.0,13.0,23.0,21.0,6.0,11.0,15.0,14.0,8.0
183672,183673,41236,35724,2015-09-21 00:00:00,89.0,89.0,right,medium,low,76.0,...,83.0,91.0,15.0,41.0,27.0,13.0,15.0,10.0,9.0,12.0
122443,122444,176635,36378,2015-09-21 00:00:00,87.0,88.0,left,medium,low,80.0,...,92.0,76.0,22.0,16.0,19.0,6.0,14.0,10.0,6.0,14.0


### A good attacker should have high attacking work rate and low defensive work rate

In [15]:
best_attackers = generate_players_positions("low", "high", 3)
best_attackers

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
33330,33331,20801,30893,2015-10-16 00:00:00,93.0,93.0,right,high,low,82.0,...,81.0,85.0,22.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
16461,16462,9014,30834,2015-09-25 00:00:00,90.0,90.0,left,high,low,80.0,...,84.0,80.0,29.0,26.0,26.0,10.0,8.0,11.0,5.0,15.0
40430,40431,168542,37459,2015-09-21 00:00:00,88.0,88.0,left,high,low,85.0,...,93.0,77.0,23.0,30.0,29.0,1.0,1.0,1.0,1.0,1.0


In [16]:
player_api_id_val = np.array([])

In [17]:
player_api_id_val = np.append(player_api_id_val, best_defenders['player_api_id'].values)
player_api_id_val = np.append(player_api_id_val, best_midfielders['player_api_id'].values)
player_api_id_val = np.append(player_api_id_val, best_attackers['player_api_id'].values)

In [18]:
player_api_id_val

array([ 41884.,  30902., 184999.,  49970.,  30981.,  35724.,  36378.,
        30893.,  30834.,  37459.])

In [19]:
for element in player_api_id_val:
    df_best_11 = df_best_11.append(player_data[player_data['player_api_id'] == element])

## A team of best players for 2015/16 season

In [20]:
df_best_11

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
6546,6556,27299,Manuel Neuer,167495,1986-03-27 00:00:00,193.04,203
3862,3866,41884,Giorgio Chiellini,138956,1984-08-14 00:00:00,187.96,190
676,679,30902,Andrea Barzagli,137186,1981-05-08 00:00:00,187.96,192
9737,9753,184999,Shkodran Mustafi,192227,1992-04-17 00:00:00,182.88,181
3687,3691,49970,Gary Medel,193532,1987-08-03 00:00:00,170.18,168
6169,6176,30981,Lionel Messi,158023,1987-06-24 00:00:00,170.18,159
11041,11057,35724,Zlatan Ibrahimovic,41236,1981-10-03 00:00:00,195.58,209
7316,7327,36378,Mesut Oezil,176635,1988-10-15 00:00:00,182.88,168
1992,1995,30893,Cristiano Ronaldo,20801,1985-02-05 00:00:00,185.42,176
948,951,30834,Arjen Robben,9014,1984-01-23 00:00:00,180.34,176


In [21]:
best_11_team_data = pd.DataFrame()

In [22]:
best_11_team_data = pd.concat([best_goal_keeper, best_defenders, best_midfielders, best_attackers])

In [23]:
best_11_team_data

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
109036,109037,167495,27299,2015-11-26 00:00:00,90.0,90.0,right,medium,medium,15.0,...,70.0,37.0,10.0,10.0,11.0,85.0,87.0,91.0,90.0,87.0
64464,64465,138956,41884,2015-11-26 00:00:00,86.0,86.0,left,low,high,52.0,...,46.0,50.0,90.0,90.0,90.0,3.0,3.0,2.0,4.0,3.0
11450,11451,137186,30902,2015-12-03 00:00:00,84.0,84.0,right,low,high,40.0,...,50.0,50.0,90.0,90.0,87.0,4.0,2.0,4.0,2.0,4.0
162198,162199,192227,184999,2015-12-10 00:00:00,82.0,87.0,right,low,high,51.0,...,52.0,54.0,81.0,82.0,85.0,11.0,9.0,15.0,10.0,6.0
61667,61668,193532,49970,2015-12-17 00:00:00,81.0,81.0,right,low,high,59.0,...,70.0,62.0,80.0,83.0,80.0,14.0,7.0,9.0,6.0,6.0
102482,102483,158023,30981,2015-12-17 00:00:00,94.0,94.0,left,medium,low,80.0,...,90.0,74.0,13.0,23.0,21.0,6.0,11.0,15.0,14.0,8.0
183672,183673,41236,35724,2015-09-21 00:00:00,89.0,89.0,right,medium,low,76.0,...,83.0,91.0,15.0,41.0,27.0,13.0,15.0,10.0,9.0,12.0
122443,122444,176635,36378,2015-09-21 00:00:00,87.0,88.0,left,medium,low,80.0,...,92.0,76.0,22.0,16.0,19.0,6.0,14.0,10.0,6.0,14.0
33330,33331,20801,30893,2015-10-16 00:00:00,93.0,93.0,right,high,low,82.0,...,81.0,85.0,22.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
16461,16462,9014,30834,2015-09-25 00:00:00,90.0,90.0,left,high,low,80.0,...,84.0,80.0,29.0,26.0,26.0,10.0,8.0,11.0,5.0,15.0


In [24]:
best_team_overall = best_11_team_data['overall_rating'].values

### Write into a csv file so that the models created in another notebook can access this data

In [25]:
import csv

# open the file in the write mode
f = open('best_team_overall.csv', 'w')

# create the csv writer
writer = csv.writer(f)

# write a row to the csv file
writer.writerow(best_team_overall)

# close the file
f.close()