# Forward OLS
- Import Package
- Connect DB & get Forward Player Data
- Scaling
- Summary OLS
- Remove Feature

### Import Package

In [10]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import MySQLdb
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions cross_val_score lives in sklearn.model_selection.
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

### Connect DB & get Forward Player Data

In [11]:
# Connection settings are read from the environment so that credentials are
# not committed together with the notebook. Host, user and schema fall back to
# the values this notebook has always used; the password deliberately has no
# fallback and raises KeyError when FOOTBALL_DB_PASSWORD is not set.
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ["FOOTBALL_DB_PASSWORD"],
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

def make_query(position):
    """Build the SELECT statement that pulls player statistics for one position.

    The statement reads the per-player feature columns plus ``rating`` (listed
    last) from the ``player`` table, keeping only rows whose ``position``
    column contains the given text and whose ``mins`` is greater than 270.

    Parameters
    ----------
    position : position code to match, e.g. "M", "D", "F" or "G".
        The value is interpolated into the SQL text as-is (no escaping), so
        only pass trusted, hard-coded values.

    Returns
    -------
    str
        The complete SQL query text.
    """
    template = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%" and mins > 270
        ;
    """
    # Only the {position} placeholder is substituted; the rest is fixed text.
    return template.format(position=position)

# Forward players: build the query for position code "F" and load the
# matching rows into a DataFrame through the open `db` connection.
SQL_QUERY = make_query("F")
forword_df = pd.read_sql(SQL_QUERY, db)

# Number of rows loaded (shown as the cell output).
len(forword_df)

633

##### Scaling

In [12]:
# Features are every column except the last one; the query selects `rating`
# last, so that column is the regression target.
# `.iloc` replaces the removed `DataFrame.ix` indexer (positional selection).
X = forword_df.iloc[:, :-1]

# with_mean=False: each feature is divided by its standard deviation without
# being centred first.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Reuse X's index so the concat below lines rows up with the target even if
# forword_df does not carry a default 0..n-1 index.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
dfX = sm.add_constant(dfX0)  # adds the `const` intercept column
dfy = pd.DataFrame(forword_df.iloc[:, -1], columns=["rating"])

# Final modelling frame: const + scaled features, with `rating` as last column.
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,blocks,owng,keyp_x,fouled,off,disp,unstch,avgp,ps_y,rating
0,1,8.337582,8.522711,6.297838,2.987833,0.822421,3.274703,3.364492,2.416302,4.02956,...,0.658349,0.0,2.013513,3.028137,3.140111,3.652482,3.310517,2.293488,8.100223,7.37
1,1,6.908282,8.663971,6.801665,2.68905,0.616816,2.801664,1.177572,2.416302,4.477289,...,0.658349,0.0,1.610811,3.785171,1.141858,1.978428,2.590839,3.363782,8.514591,7.25
2,1,6.908282,8.663971,6.717694,0.298783,1.233632,0.340918,0.0,0.0,0.0,...,1.316697,0.0,0.402703,0.454221,0.0,1.369681,0.863613,1.092137,9.797794,6.22
3,1,5.955416,8.24019,6.045925,3.5854,0.411211,3.607364,3.700941,2.013585,2.238645,...,0.658349,0.0,2.818918,1.514068,1.141858,1.978428,2.590839,3.898929,10.46613,7.5
4,1,7.622932,8.711058,6.633723,1.693105,2.67287,1.995964,1.850471,1.610868,1.343187,...,0.658349,0.0,1.409459,1.816882,1.427323,2.130614,2.302968,1.299643,8.808658,6.98


##### Summary OLS

In [13]:
# Fit OLS with the last column (`rating`) as the response and every other
# column (const + scaled features) as regressors.
# `.iloc` replaces the removed `DataFrame.ix` indexer.
model = sm.OLS(f_df.iloc[:, -1], f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.910
Method:                 Least Squares   F-statistic:                     228.4
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          5.77e-300
Time:                        16:32:32   Log-Likelihood:                 464.87
No. Observations:                 633   AIC:                            -871.7
Df Residuals:                     604   BIC:                            -742.7
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.6720      0.088     64.260      0.0

##### Remove Some Features

In [14]:
# First pruning pass: drop these features and refit.
# NOTE(review): presumably these are the features that were not significant in
# the full-model summary — confirm against the coefficient table.
remove_column_list = [
    "age", "tall", "weight", "apps_start", "red", "ps_x", "fouls", "offsides", "owng", "off", "disp", "clear", "ps_y"
]
removed_f_df = f_df.drop(remove_column_list, axis=1)

# Response is the last column (`rating`); regressors are all remaining columns.
# `.iloc` replaces the removed `DataFrame.ix` indexer.
model = sm.OLS(removed_f_df.iloc[:, -1], removed_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.905
Model:                            OLS   Adj. R-squared:                  0.903
Method:                 Least Squares   F-statistic:                     367.8
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          1.73e-302
Time:                        16:32:35   Log-Likelihood:                 435.36
No. Observations:                 633   AIC:                            -836.7
Df Residuals:                     616   BIC:                            -761.1
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9801      0.026    231.264      0.0

In [18]:
# Show which columns remain in the pruned frame after the drop.
removed_f_df.columns

Index(['const', 'apps_sub', 'mins', 'goals', 'assists', 'yel', 'spg', 'motm',
       'aw', 'tackles', 'inter', 'drb', 'blocks', 'keyp_x', 'fouled', 'unstch',
       'avgp', 'rating'],
      dtype='object')

In [19]:
# Refit the pruned model through the formula interface and compute an ANOVA
# table for it. The formula lists the remaining features; the `const` column
# is not named in it.
# NOTE(review): presumably because the formula interface adds its own
# intercept — confirm.
formula_str = """
rating ~ apps_sub + mins + goals + assists + yel + spg 
+ motm + aw + tackles + inter + drb + blocks + keyp_x
+ fouled + unstch + avgp
"""

model = sm.OLS.from_formula(formula_str, data=removed_f_df)
result = model.fit()
# NOTE(review): anova_lm is called with its defaults; if that is the
# sequential (Type I) table, the per-term values depend on the order of terms
# in the formula — confirm against the statsmodels documentation.
table_anova = sm.stats.anova_lm(result)
table_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
apps_sub,1.0,22.406435,22.406435,1473.708568,1.538062e-165
mins,1.0,29.587045,29.587045,1945.989229,8.232292e-193
goals,1.0,12.744298,12.744298,838.213742,5.338944e-117
assists,1.0,7.438457,7.438457,489.239693,3.085503e-80
yel,1.0,0.019497,0.019497,1.282329,0.257907
spg,1.0,2.116324,2.116324,139.194144,4.168139e-29
motm,1.0,2.5928,2.5928,170.532763,1.4012149999999999e-34
aw,1.0,0.567614,0.567614,37.332906,1.766072e-09
tackles,1.0,6.333253,6.333253,416.548585,4.069015e-71
inter,1.0,0.876774,0.876774,57.66691,1.156798e-13


In [24]:
# Second pruning pass: drop three more features from the already-pruned frame
# and refit.
remove_column_list = [
    "drb", "apps_sub", "blocks"
]
removed2_f_df = removed_f_df.drop(remove_column_list, axis=1)

# Response is the last column (`rating`); regressors are all remaining columns.
# `.iloc` replaces the removed `DataFrame.ix` indexer.
model = sm.OLS(removed2_f_df.iloc[:, -1], removed2_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.904
Model:                            OLS   Adj. R-squared:                  0.902
Method:                 Least Squares   F-statistic:                     450.0
Date:                Sun, 26 Jun 2016   Prob (F-statistic):          4.03e-305
Time:                        16:35:28   Log-Likelihood:                 432.22
No. Observations:                 633   AIC:                            -836.4
Df Residuals:                     619   BIC:                            -774.1
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9779      0.016    369.991      0.0