# Midfielder OLS
- Import Package
- Connect DB & get Midfielder Player Data
- Scaling
- Summary OLS 
- Remove Feature
- Anova & Remove Feature

### Import Package

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import MySQLdb
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

### Connect DB & get Midfielder Player Data

In [13]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. FOOTBALL_DB_PASSWORD is required: if it is unset the
# lookup raises KeyError before any connection is attempted. The remaining
# settings fall back to the host, user and schema this notebook was written for.
db = MySQLdb.connect(
    host=os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    user=os.environ.get("FOOTBALL_DB_USER", "root"),
    passwd=os.environ["FOOTBALL_DB_PASSWORD"],
    db=os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """Build the SELECT statement that fetches player statistics for a position.

    Parameters
    ----------
    position : str
        Position code to filter on: "M", "D", "F" or "G". The value is placed
        inside a ``like "%...%"`` pattern, so every row whose ``position``
        column contains the code is selected.

    Returns
    -------
    str
        The SQL query text.
    """
    # The template keeps a single `{position}` placeholder; everything else is
    # fixed text (the feature columns followed by the `rating` target).
    query_template = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%"
        ;
    """
    return query_template.format(position=position)

# Midfielders: build the query for position code "M" and load the matching
# rows of the `player` table into a DataFrame (feature columns first, `rating`
# last, in the order listed in make_query's SELECT).
SQL_QUERY = make_query("M")
midfilder_df = pd.read_sql(SQL_QUERY, db)

# Number of rows returned by the query.
len(midfilder_df)

1969

##### Scaling

In [14]:
# Split by position: every column except the last is a feature, the last
# column (`rating`) is the target.
# `.iloc` replaces the `.ix` indexer, which was deprecated and later removed
# from pandas; with string column labels these integer slices were already
# positional under `.ix`, so the selection is unchanged.
X = midfilder_df.iloc[:, :-1]

# Scale each feature by its standard deviation without centering
# (with_mean=False), so values stay non-negative.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Reuse X's index so the scaled features stay row-aligned with the target
# when the two frames are concatenated.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
dfX = sm.add_constant(dfX0)  # adds the intercept column `const`
dfy = pd.DataFrame(midfilder_df.iloc[:, -1], columns=["rating"])

# Final modelling frame: const + scaled features, with `rating` as last column.
m_df = pd.concat([dfX, dfy], axis=1)
m_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,6.906678,9.336873,7.611064,2.454588,0.733024,2.515521,2.007993,2.57708,3.463782,...,0.602366,0.0,1.366639,3.817549,1.862443,2.061418,2.806716,2.01349,5.928928,7.25
1,1,6.430355,9.43836,7.235209,2.72732,0.977365,2.850218,0.573712,1.28854,2.078269,...,2.409464,0.0,0.683319,0.916212,0.0,0.951424,1.091501,2.889489,7.325065,7.05
2,1,6.906678,8.880178,6.76539,2.909141,0.977365,3.071584,0.286856,2.147567,2.771026,...,0.602366,0.0,3.074937,2.290529,0.0,1.744277,0.935572,3.44516,7.567062,7.03
3,1,6.430355,9.133897,7.141245,3.000052,0.244341,3.056756,0.286856,1.28854,2.078269,...,0.602366,0.0,1.195809,0.916212,0.465611,1.109994,1.247429,2.235759,6.571151,7.02
4,1,6.668516,9.083153,6.295571,2.000035,1.466047,2.035719,0.860568,3.436107,1.039135,...,0.0,0.0,2.562448,1.679722,1.396832,2.378559,1.871144,1.523192,7.157528,6.96


##### Summary OLS

In [15]:
# Fit OLS with the last column of m_df (rating) as the response and all other
# columns (the constant plus every scaled feature) as regressors.
# `.iloc` replaces the removed `.ix` indexer; the selection is positional in
# both cases.
model = sm.OLS(m_df.iloc[:, -1], m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     476.6
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        18:54:42   Log-Likelihood:                 1123.3
No. Observations:                1969   AIC:                            -2189.
Df Residuals:                    1940   BIC:                            -2027.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9284      0.042    142.758      0.0

##### Remove Some Features

In [21]:
# Remove features and refit.
# NOTE(review): the columns listed here were presumably picked from the
# p-values of the full-model summary; confirm against that table.
remove_column_list = [
    "tall", "weight", "apps_start", "offsides", "blocks", "owng", "off", "unstch"
]
removed_m_df = m_df.drop(remove_column_list, axis=1)

# Response is the last column (rating); regressors are all remaining columns.
# `.iloc` replaces the removed `.ix` indexer; the selection is positional in
# both cases.
model = sm.OLS(removed_m_df.iloc[:, -1], removed_m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.872
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     664.7
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        18:56:49   Log-Likelihood:                 1116.5
No. Observations:                1969   AIC:                            -2191.
Df Residuals:                    1948   BIC:                            -2074.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9362      0.032    183.268      0.0

##### Anova & Remove Feature

In [27]:
# Formula for the reduced model: rating regressed on every feature column
# still present in removed_m_df, listed in the same order as the columns.
# The `const` column is not referenced here; the formula interface supplies
# its own intercept term.
formula_str = """
rating ~ age + apps_sub + mins + goals + assists + yel + red
+ spg + ps_x + motm + aw + tackles + inter + fouls + clear + drb
+ keyp_x + fouled + disp + avgp + ps_y
"""

# Fit the formula-based OLS model and build its ANOVA table.
# NOTE(review): anova_lm is called with default arguments, which should give
# sequential (type I) sums of squares, so the order of terms in the formula
# affects each row — confirm this is the intended test.
model = sm.OLS.from_formula(formula_str, data=removed_m_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
age,1.0,5.555034,5.555034,291.767195,4.573343e-61
apps_sub,1.0,15.789608,15.789608,829.317984,3.071282e-152
mins,1.0,129.16644,129.16644,6784.212041,0.0
goals,1.0,13.822363,13.822363,725.992308,3.509607e-136
assists,1.0,9.916291,9.916291,520.833568,2.326861e-102
yel,1.0,0.51146,0.51146,26.863404,2.409623e-07
red,1.0,0.032478,0.032478,1.70584,0.1916802
spg,1.0,6.876193,6.876193,361.158432,5.181798e-74
ps_x,1.0,4.997641,4.997641,262.491232,1.7607240000000002e-55
motm,1.0,6.498781,6.498781,341.335645,2.353464e-70


In [28]:
# Second round of feature removal, then refit.
# NOTE(review): this list was presumably chosen from the ANOVA table above;
# confirm each entry against it.
remove_column_list = [
    "yel", "red", "fouls", "clear", "drb", "disp", "ps_y"
]
removed2_m_df = removed_m_df.drop(remove_column_list, axis=1)

# Response is the last column (rating); regressors are all remaining columns.
# `.iloc` replaces the removed `.ix` indexer; the selection is positional in
# both cases.
model = sm.OLS(removed2_m_df.iloc[:, -1], removed2_m_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.859
Method:                 Least Squares   F-statistic:                     859.7
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        19:03:07   Log-Likelihood:                 1029.0
No. Observations:                1969   AIC:                            -2028.
Df Residuals:                    1954   BIC:                            -1944.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9415      0.033    178.564      0.0