# Forward OLS
- Import Package
- Connect DB & get Forward Player Data
- Scaling
- Summary OLS 
- Remove Feature

### Import Package

In [25]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

### Connect DB & get Forward Player Data

In [41]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Host, user and schema fall back to the values
# this notebook was written against; the password has no fallback, so a
# missing FOOTBALL_DB_PASSWORD raises KeyError instead of connecting with a
# secret committed to version control.
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ["FOOTBALL_DB_PASSWORD"],
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """
    Build the SELECT statement that loads the regression columns (29 player
    statistics followed by `rating`) for players of one position who have
    more than 270 minutes played.

    parameter------------
    position : M, D, F, G
        Position code. Only "F" (forward, matched as "%FW%") has a known
        pattern; add the others to `position_patterns` once their encoding
        in `player.position` is confirmed.

    return---------------
    SQL_QUERY String

    raise----------------
    ValueError when `position` has no known pattern. Previously the argument
    was ignored and the forward query was returned for every code.

    """
    # position code -> substring searched for in the `position` column
    position_patterns = {"F": "FW"}

    if position not in position_patterns:
        raise ValueError(
            "unsupported position {!r}; supported codes: {}".format(
                position, sorted(position_patterns)
            )
        )

    # `rating` must stay the last selected column: later cells split features
    # and target by column position.
    SQL_QUERY = """
        SELECT
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{pattern}%" and mins > 270
        ;
    """.format(pattern=position_patterns[position])

    return SQL_QUERY

# Forward players: build the query, load the matching rows, show the row count
SQL_QUERY = make_query("F")
forword_df = pd.read_sql(sql=SQL_QUERY, con=db)

forword_df.shape[0]

568

##### Scaling

In [42]:
# Features are every column except the last one, which holds the target.
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0.
X = forword_df.iloc[:, :-1]
# with_mean=False: scale each feature to unit variance without centring it
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# fit_transform returns a plain array, so restore the column names
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
# sm.OLS fits no intercept unless the design matrix carries a constant column
dfX = sm.add_constant(dfX0)
# Target (last column) as a one-column frame named "rating"
dfy = forword_df.iloc[:, -1].to_frame(name="rating")
# Final layout: const, scaled features..., rating (target stays last)
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,8.881893,12.413956,7.79213,3.0351,0.820739,3.328624,3.316431,2.361285,3.93832,...,1.339995,0.0,2.015426,3.006779,3.210278,3.736803,3.471768,2.285979,8.192403,7.37
1,1,7.359283,12.619713,8.4155,2.73159,0.615554,2.847796,1.160751,2.361285,4.375911,...,1.339995,0.0,1.61234,3.758474,1.167374,2.024102,2.717036,3.352769,8.611487,7.25
2,1,7.359283,12.619713,8.311605,0.30351,1.231108,0.346532,0.0,0.0,0.0,...,2.679991,0.0,0.403085,0.451017,0.0,1.401301,0.905679,1.088562,9.909293,6.22
3,1,6.344209,12.002444,7.480445,3.64212,0.410369,3.666762,3.648074,1.967737,2.187956,...,1.339995,0.0,2.821596,1.503389,1.167374,2.024102,2.717036,3.886165,10.585234,7.5
4,1,8.120588,12.688298,8.20771,1.71989,2.6674,2.02883,1.824037,1.57419,1.312773,...,1.339995,0.0,1.410798,1.804067,1.459217,2.179802,2.415143,1.295388,8.908901,6.98


##### Summary OLS

In [45]:
# Full model: regress the last column (rating) on the constant plus every
# scaled feature. `.iloc` replaces the `.ix` indexer removed in pandas 1.0.
model = sm.OLS(f_df.iloc[:, -1], f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     231.0
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          2.04e-279
Time:                        18:34:05   Log-Likelihood:                 441.31
No. Observations:                 568   AIC:                            -824.6
Df Residuals:                     539   BIC:                            -698.7
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6999      0.101     56.635      0.0

##### Remove Some Features

In [46]:
# Remove features (first pass).
# NOTE(review): presumably the columns that were not significant in the
# full-model summary -- confirm against the complete coefficient table.
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "red", "blocks", "owng", "unstch", "offsides", "off"
]
# drop() returns a new frame, so f_df stays intact for comparison
removed_f_df = f_df.drop(remove_column_list, axis=1)

# Refit on the reduced design matrix; the target is still the last column.
# `.iloc` replaces the `.ix` indexer removed in pandas 1.0.
model = sm.OLS(removed_f_df.iloc[:, -1], removed_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.918
Method:                 Least Squares   F-statistic:                     373.6
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          6.75e-289
Time:                        18:35:00   Log-Likelihood:                 431.29
No. Observations:                 568   AIC:                            -826.6
Df Residuals:                     550   BIC:                            -748.4
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6751      0.069     82.115      0.0

In [47]:
# Formula version of the reduced model. The formula interface adds its own
# intercept, so the `const` column of removed_f_df is not listed as a term.
formula_str = """
rating ~ mins + goals + assists + yel + spg + ps_x
+ motm + aw + tackles + inter + clear + fouls + drb + keyp_x
+ fouled + disp + avgp + ps_y
"""

model = sm.OLS.from_formula(formula=formula_str, data=removed_f_df)
result = model.fit()
# typ=1 is anova_lm's default (sequential sums of squares): each row depends
# on the terms listed before it in the formula.
table_anova = sm.stats.anova_lm(result, typ=1)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
mins,1.0,40.781499,40.781499,3079.515588,1.616071e-227
goals,1.0,14.195896,14.195896,1071.968492,2.8784899999999997e-131
assists,1.0,7.381833,7.381833,557.421144,1.2404229999999999e-85
yel,1.0,0.117953,0.117953,8.906936,0.002967139
spg,1.0,6.150312,6.150312,464.425852,3.8854500000000005e-75
ps_x,1.0,0.519853,0.519853,39.255455,7.509462e-10
motm,1.0,2.729217,2.729217,206.090173,6.36423e-40
aw,1.0,2.076636,2.076636,156.812148,7.887594e-32
tackles,1.0,4.682393,4.682393,353.579482,2.773806e-61
inter,1.0,1.241805,1.241805,93.771875,1.392619e-20


In [49]:
# Remove features (second pass), starting from the first-pass frame.
remove_column_list = [
    "clear", "fouls", "drb", "disp", "ps_y"
]
# drop() returns a new frame, so removed_f_df stays intact for comparison
removed2_f_df = removed_f_df.drop(remove_column_list, axis=1)

# Refit on the further reduced design matrix; the target is still the last
# column. `.iloc` replaces the `.ix` indexer removed in pandas 1.0.
model = sm.OLS(removed2_f_df.iloc[:, -1], removed2_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.917
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     471.2
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          1.94e-289
Time:                        18:36:20   Log-Likelihood:                 419.99
No. Observations:                 568   AIC:                            -812.0
Df Residuals:                     554   BIC:                            -751.2
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6602      0.070     81.202      0.0