# Midfielder OLS

### Import Packages

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score


### Connect to DB & Get Midfielder Player Data

In [2]:
# Connection settings are read from the environment so credentials can be
# supplied without editing the notebook. The literal fallbacks keep existing
# runs working exactly as before when the variables are unset.
# NOTE(review): the fallback password is still committed in this file —
# rotate it and drop the default once FOOTBALL_DB_PASSWORD is configured.
db = MySQLdb.connect(
    host=os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    user=os.environ.get("FOOTBALL_DB_USER", "root"),
    passwd=os.environ.get("FOOTBALL_DB_PASSWORD", "dkstncks"),
    db=os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """
    Build the SELECT statement that fetches player stats for one position.

    parameter------------
    position : M, D, F, G
        Position code placed inside the `position like "%...%"` filter.

    return---------------
    SQL_QUERY String

    raise----------------
    ValueError : position is not one of the four codes listed above

    """
    # The value is spliced directly into the SQL text, so accept only the
    # known position codes rather than arbitrary strings.
    if position not in ("M", "D", "F", "G"):
        raise ValueError(
            "position must be one of 'M', 'D', 'F', 'G'; got {!r}".format(position)
        )

    # `rating` is selected last on purpose: later cells take the final column
    # as the regression target.
    SQL_QUERY = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%" and mins > 270
        ;
    """.format(position=position)

    return SQL_QUERY

# Midfielders: build the query for position code "M" and load the result set
SQL_QUERY = make_query("M")
midfilder_df = pd.read_sql(sql=SQL_QUERY, con=db)

# Row count of the loaded frame (displayed as the cell output)
midfilder_df.shape[0]


1582

### Scaling

In [3]:
# Features are every column except the last; the last column is the target.
# Positional `.iloc` replaces `.ix`, which was removed in pandas 1.0.
X = midfilder_df.iloc[:, :-1]

# with_mean=False: features are rescaled but not centred
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Re-attach the column names and the original row index so the concat below
# lines features up with the target row by row.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(midfilder_df.iloc[:, -1], columns=["rating"])

# Design matrix (const + scaled features) with the target as the last column
m_df = pd.concat([dfX, dfy], axis=1)
m_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,7.318647,14.8008,10.561215,2.780837,0.704054,2.879113,1.876429,2.456496,3.478088,...,0.639768,0.0,1.436757,4.101984,1.808584,2.168207,3.033856,2.165255,9.363235,7.25
1,1,6.813913,14.961678,10.039673,3.089819,0.938738,3.262186,0.536123,1.228248,2.086853,...,2.559071,0.0,0.718378,0.984476,0.0,1.000711,1.179833,3.107282,11.568079,7.05
2,1,7.318647,14.076848,9.387747,3.295807,0.938738,3.515548,0.268061,2.04708,2.78247,...,0.639768,0.0,3.232703,2.46119,0.0,1.834636,1.011285,3.704836,11.950252,7.03
3,1,6.813913,14.479043,9.909288,3.398801,0.234685,3.498576,0.268061,1.228248,2.086853,...,0.639768,0.0,1.257162,0.984476,0.452146,1.167496,1.34838,2.404277,10.377463,7.02
4,1,7.06628,14.398604,8.73582,2.265867,1.408107,2.32996,0.804184,3.275329,1.043426,...,0.0,0.0,2.693919,1.804873,1.356438,2.501777,2.022571,1.638001,11.303498,6.96


### Summary OLS

In [4]:
# Regress the last column of m_df (the target) on all preceding columns.
# Positional `.iloc` replaces `.ix`, which was removed in pandas 1.0.
model = sm.OLS(m_df.iloc[:, -1], m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                     340.7
Date:                Thu, 30 Jun 2016   Prob (F-statistic):               0.00
Time:                        12:20:10   Log-Likelihood:                 1058.1
No. Observations:                1582   AIC:                            -2058.
Df Residuals:                    1553   BIC:                            -1903.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6114      0.075     74.415      0.0

### Find Proper Model

In [5]:
# First pruning pass: drop the listed features, then refit on what remains
remove_column_list = [
    "tall", "weight", "apps_start", "apps_sub", "mins", "offsides", "blocks", "owng", "off", "disp", "unstch"
]
removed_m_df = m_df.drop(remove_column_list, axis=1)

# Target is still the last column; everything before it is a regressor.
# Positional `.iloc` replaces `.ix`, which was removed in pandas 1.0.
model = sm.OLS(removed_m_df.iloc[:, -1], removed_m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.857
Model:                            OLS   Adj. R-squared:                  0.855
Method:                 Least Squares   F-statistic:                     549.8
Date:                Thu, 30 Jun 2016   Prob (F-statistic):               0.00
Time:                        12:20:53   Log-Likelihood:                 1039.5
No. Observations:                1582   AIC:                            -2043.
Df Residuals:                    1564   BIC:                            -1946.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6499      0.058     96.830      0.0

### ANOVA

In [6]:
# Formula for the reduced model: rating against the features kept in
# removed_m_df, spelled out term by term.
formula_str = (
    "\n"
    "rating ~ age + goals + assists + yel + red\n"
    "+ spg + ps_x + motm + aw + tackles + inter + fouls + clear + drb\n"
    "+ keyp_x + fouled + avgp + ps_y\n"
)

model = sm.OLS.from_formula(formula=formula_str, data=removed_m_df)
result = model.fit()

# ANOVA table for the fitted model, shown as the cell output
table_anova = sm.stats.anova_lm(result)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
age,1.0,0.011797,0.011797,0.741286,0.3893811
goals,1.0,46.463072,46.463072,2919.695676,0.0
assists,1.0,22.78454,22.78454,1431.75902,5.364498e-223
yel,1.0,9.644631,9.644631,606.059511,2.255951e-113
red,1.0,0.105741,0.105741,6.644657,0.01003618
spg,1.0,0.848527,0.848527,53.320646,4.492423e-13
ps_x,1.0,8.528253,8.528253,535.907372,3.4129609999999997e-102
motm,1.0,12.23869,12.23869,769.067724,5.237508e-138
aw,1.0,12.74686,12.74686,801.000648,1.2485349999999999e-142
tackles,1.0,18.915697,18.915697,1188.644595,3.1279800000000003e-194


In [7]:
# Second pruning pass: drop four more features from the reduced frame and refit
remove_column_list = [
    "age", "red", "drb", "ps_y"
]
removed2_m_df = removed_m_df.drop(remove_column_list, axis=1)

# Target is the last column; everything before it is a regressor.
# Positional `.iloc` replaces `.ix`, which was removed in pandas 1.0.
model = sm.OLS(removed2_m_df.iloc[:, -1], removed2_m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.852
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                     642.1
Date:                Thu, 30 Jun 2016   Prob (F-statistic):               0.00
Time:                        12:21:21   Log-Likelihood:                 1011.9
No. Observations:                1582   AIC:                            -1994.
Df Residuals:                    1567   BIC:                            -1913.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5220      0.053    105.055      0.0

### Result

Key features: tackles, key passes, goals, interceptions, shots per game