# Predicting the Overall Rating of Soccer Players 

In this notebook, we will try to predict the overall rating of soccer players from their attributes. The dataset comes from the European Soccer Database.
We'll try several ML models and look for one that achieves an accuracy above 90%.
Let's begin.

## Importing Libraries 

In [1]:
import sqlite3

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Read Data from the database into Pandas 

In [2]:
# Load the whole Player_Attributes table into a DataFrame.
# The connection is closed as soon as the read finishes (even if the query
# fails) so the database file handle is not left open for the kernel's lifetime.
conn = sqlite3.connect('../input/database.sqlite')
try:
    soccer_data = pd.read_sql("select * from Player_Attributes", conn)
finally:
    conn.close()
soccer_data.sample(10)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
146691,146692,198212,181275,2010-08-30 00:00:00,52.0,68.0,right,medium,medium,56.0,...,48.0,61.0,49.0,52.0,38.0,14.0,5.0,5.0,5.0,14.0
171957,171958,53012,27313,2015-09-21 00:00:00,68.0,68.0,right,medium,medium,13.0,...,9.0,31.0,13.0,16.0,15.0,55.0,72.0,53.0,73.0,75.0
174456,174457,138507,37579,2011-02-22 00:00:00,75.0,80.0,right,medium,medium,11.0,...,36.0,23.0,11.0,11.0,12.0,78.0,67.0,72.0,74.0,81.0
158647,158648,198373,127134,2015-09-21 00:00:00,61.0,68.0,right,medium,high,52.0,...,57.0,51.0,45.0,51.0,43.0,14.0,15.0,9.0,6.0,10.0
66096,66097,146475,38546,2011-08-30 00:00:00,65.0,65.0,right,low,medium,43.0,...,69.0,54.0,51.0,68.0,59.0,10.0,12.0,7.0,14.0,6.0
97041,97042,201949,120110,2014-02-28 00:00:00,69.0,70.0,right,medium,medium,52.0,...,54.0,62.0,24.0,23.0,20.0,12.0,10.0,7.0,11.0,13.0
160515,160516,189077,121093,2008-08-30 00:00:00,52.0,57.0,right,medium,medium,31.0,...,36.0,36.0,47.0,46.0,47.0,7.0,22.0,32.0,22.0,22.0
55018,55019,182105,171981,2014-02-28 00:00:00,77.0,77.0,right,medium,medium,25.0,...,25.0,25.0,25.0,25.0,25.0,77.0,72.0,76.0,77.0,80.0
110710,110711,180334,78908,2014-11-21 00:00:00,73.0,75.0,right,medium,medium,25.0,...,36.0,58.0,71.0,70.0,73.0,15.0,14.0,9.0,6.0,13.0
157517,157518,188075,128870,2009-08-30 00:00:00,69.0,76.0,right,medium,medium,60.0,...,70.0,59.0,71.0,70.0,77.0,5.0,22.0,64.0,22.0,22.0


## Exploratory Data Analysis : Cleaning, Correcting and Visualizing 

In [None]:
# (rows, columns) of the table as loaded from the database
soccer_data.shape

In [None]:
# Names of all columns in the loaded table
soccer_data.columns 

In [None]:
# Count the missing values in each column of the raw table
soccer_data.isnull().sum()

In [None]:
# Build the working frame without the identifier and date columns;
# soccer_data itself is left untouched.
id_columns = ["id", "player_fifa_api_id", "player_api_id", "date"]
new_soccer_data = soccer_data.drop(columns=id_columns)
new_soccer_data.head()

### 1. Initial Exploration

In [None]:
# True if at least one row is an exact repeat of an earlier row
new_soccer_data.duplicated().any()

In [None]:
# Keep only the first occurrence of every repeated row, then report the new size
deduplicated = new_soccer_data.drop_duplicates(keep="first")
new_soccer_data = deduplicated
new_soccer_data.shape

In [None]:
# Column dtypes and non-null counts of the working frame
new_soccer_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the working frame
new_soccer_data.describe()

### 2. Correcting 

In [None]:
# Missing values per column in the working frame
new_soccer_data.isnull().sum()

In [None]:
# Distinct preferred_foot labels and how often each occurs
new_soccer_data['preferred_foot'].value_counts()

In [None]:
# Encode the foot label as an integer code; any value not in the table becomes NaN
foot_codes = {'right': 0, 'left': 1}
new_soccer_data['preferred_foot'] = new_soccer_data['preferred_foot'].map(foot_codes)

In [None]:
# Distinct attacking_work_rate labels and how often each occurs
new_soccer_data['attacking_work_rate'].value_counts()

In [None]:
# Fold the listed non-standard attacking_work_rate labels into 'medium'.
# The replacement is applied to this column only; a frame-wide replace would
# also rewrite any other column that happened to contain one of these strings.
new_soccer_data['attacking_work_rate'] = new_soccer_data['attacking_work_rate'].replace(
    ['None', 'y', 'le', 'stoc', 'norm'], 'medium')
new_soccer_data['attacking_work_rate'].value_counts()

In [None]:
# Ordinal encoding of the work-rate label; any value not in the table becomes NaN
attacking_codes = {'low': 0, 'medium': 1, 'high': 2}
new_soccer_data['attacking_work_rate'] = new_soccer_data['attacking_work_rate'].map(attacking_codes)

In [None]:
# Distinct defensive_work_rate labels and how often each occurs
new_soccer_data['defensive_work_rate'].value_counts()

In [None]:
# Fold the listed non-standard defensive_work_rate labels into 'medium'.
# The replacement is applied to this column only, so strings such as '0' or '1'
# can never be rewritten in any other column of the frame.
new_soccer_data['defensive_work_rate'] = new_soccer_data['defensive_work_rate'].replace(
    ['_0', 'o', '1', 'ormal', '2', '3', '7', '5', '6', '0', '9', 'es', '4', 'ean', 'tocky', '8'],
    'medium')

In [None]:
# Ordinal encoding of the work-rate label; any value not in the table becomes NaN
defensive_codes = {'low': 0, 'medium': 1, 'high': 2}
new_soccer_data['defensive_work_rate'] = new_soccer_data['defensive_work_rate'].map(defensive_codes)

In [None]:
# Rows without a target cannot be used for supervised learning: drop them
# instead of inventing a label by filling overall_rating with its mean.
new_soccer_data = new_soccer_data.dropna(subset=['overall_rating'])
# Remaining gaps in the numeric feature columns are filled with the column mean.
new_soccer_data = new_soccer_data.fillna(new_soccer_data.mean(numeric_only=True))
new_soccer_data.isnull().any()

### 3. Visualize the data

In [None]:
# Apply seaborn's default styling to the figures drawn below
sns.set()

In [None]:
# One histogram per column on a 3-wide grid.
# The row count is rounded *up*: with floor division the trailing columns were
# silently left out whenever the column count was not a multiple of 3.
plot_columns = list(new_soccer_data.columns)
n_grid_cols = 3
n_grid_rows = -(-len(plot_columns) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(12, 4 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

for axis, column in zip(flat_axes, plot_columns):
    new_soccer_data.hist(column=column, bins=100, ax=axis)
# Hide the cells of the last row that have no column to show.
for axis in flat_axes[len(plot_columns):]:
    axis.set_visible(False)
plt.show()

In [None]:
# One box plot per column on a 3-wide grid.
# The row count is rounded *up*: with floor division the trailing columns were
# silently left out whenever the column count was not a multiple of 3.
plot_columns = list(new_soccer_data.columns)
n_grid_cols = 3
n_grid_rows = -(-len(plot_columns) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(12, 4 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

for axis, column in zip(flat_axes, plot_columns):
    new_soccer_data.boxplot(column=column, ax=axis)
# Hide the cells of the last row that have no column to show.
for axis in flat_axes[len(plot_columns):]:
    axis.set_visible(False)
plt.show()

## Feature Selection and Model Selection

In [None]:
# Predictor columns used by every model below; overall_rating is the target.
columns = ['potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes']
X = new_soccer_data[columns]
y = new_soccer_data['overall_rating']
# Standardize each feature to zero mean / unit variance (StandardScaler is
# imported in the notebook's import cell).
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# cross-validation split is made - consider fitting it inside a Pipeline.
X_std = StandardScaler().fit_transform(X)

In [None]:
# Project the standardized features onto 30 principal components
# (PCA is imported in the notebook's import cell).
pca = PCA(n_components = 30)
# A single fit_transform both fits the model and returns the projection;
# the earlier separate pca.fit(X_std) call repeated the same fit.
X_new = pca.fit_transform(X_std)
# PCA.fit returns the estimator itself, so `fit` stays available as an alias
# for the cell that reads fit.explained_variance_.
fit = pca

In [None]:
# Total variance carried by the retained components. This is the absolute
# variance, not a fraction; explained_variance_ratio_ holds each component's share.
fit.explained_variance_.sum()

In [None]:
# Baseline: ordinary least squares, mean score over 5 cross-validation folds
lnr = LinearRegression()
lnr_scores = cross_val_score(lnr, X_new, y, cv=5)
print(lnr_scores.mean())

In [None]:
# Decision tree, mean score over 5 cross-validation folds.
# random_state pins the tree's internal randomness so the printed score is
# the same on every Restart & Run All.
rgs = DecisionTreeRegressor(random_state=42)
print(cross_val_score(rgs, X_new, y, cv = 5).mean())

In [None]:
# Random forest with 10 trees, mean score over 5 cross-validation folds.
# The hyper-parameter is passed by keyword for readability, and random_state
# pins the bootstrap/feature sampling so the printed score is reproducible.
rfr = RandomForestRegressor(n_estimators=10, random_state=42)
print(cross_val_score(rfr, X_new, y, cv = 5).mean())

In [None]:
# Multi-layer perceptron with one hidden layer of 50 ReLU units, mean score
# over 5 cross-validation folds (MLPRegressor is imported in the notebook's
# import cell). random_state pins weight initialisation and batch shuffling
# so the printed score is reproducible.
mlpr = MLPRegressor(hidden_layer_sizes=(50,), activation='relu', random_state=42)
print(cross_val_score(mlpr, X_new, y, cv = 5).mean())

In [None]:
# NOTE(review): the 95% figure is a hard-coded string, not derived from the
# scores printed above. With no `scoring` argument, cross_val_score reports the
# estimator's default score (R^2 for regressors) rather than accuracy - confirm
# the wording against the actual results.
print("Finally we can predict the Player's overall rating with over 95% accuracy")