In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'svg', 'retina'}

import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

In [2]:
# Open the MySQL connection used by every pd.read_sql call below.
# Connection settings are read from the environment so that the password is
# never stored in the notebook. Host, user and schema fall back to the values
# this notebook has always used; DB_PASSWORD has no fallback and must be set.
db = MySQLdb.connect(
    os.environ.get("DB_HOST", "db.fastcamp.us"),
    os.environ.get("DB_USER", "root"),
    os.environ["DB_PASSWORD"],  # a KeyError here means the variable is unset
    os.environ.get("DB_NAME", "doojin"),
    charset='utf8',
)

In [3]:
def make_query(position):
    """Build the SELECT statement that fetches player statistics for a position.

    Parameters
    ----------
    position : str
        Position code (M, D, F or G). It is placed between ``%`` wildcards in
        the ``LIKE`` pattern applied to the ``position`` column.

    Returns
    -------
    str
        The SQL query text.
    """
    # NOTE: `position` is interpolated verbatim (no escaping), so only pass
    # trusted position codes.
    query_template = """
        SELECT 
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp, fouled, off, disp, unstch, avgp, ps_y, crosses, longb, thrb, rating
        FROM player
        WHERE position like "%{position}%"
        ;
    """
    return query_template.format(position=position)

In [4]:
# Load one DataFrame of player statistics per position.
# Codes are queried in this order: F (forward), M (midfielder),
# D (defender), G (goalkeeper).
position_frames = []
for position_code in ("F", "M", "D", "G"):
    # SQL_QUERY ends up holding the goalkeeper query after the loop
    SQL_QUERY = make_query(position_code)
    position_frames.append(pd.read_sql(SQL_QUERY, db))

forword_df, midfilder_df, defencer_df, goalkeeper_df = position_frames

# number of players loaded for each position
tuple(len(frame) for frame in position_frames)

(771, 1969, 1480, 289)

# Forward OLS

In [5]:
# Feature matrix: every column of forword_df except the last one (rating).
# .iloc replaces .ix, which pandas deprecated in 0.20 and removed in 1.0.
X = forword_df.iloc[:, :-1]
# with_mean=False: scale the features without centering them
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

In [6]:
# Rebuild a labelled frame from the scaled array and prepend the intercept column.
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
# Target: the last column of forword_df, selected by position.
# .iloc replaces .ix, which pandas deprecated in 0.20 and removed in 1.0.
dfy = pd.DataFrame(forword_df.iloc[:, -1], columns=["rating"])
# Features (with const) and target side by side; rating is the last column.
df = pd.concat([dfX, dfy], axis=1)
df.tail()

Unnamed: 0,const,age,tall,weight,apps_start,apps_sub,mins,goals,assists,yel,...,fouled,off,disp,unstch,avgp,ps_y,crosses,longb,thrb,rating
766,1,5.840941,6.309013,4.770183,2.312842,0.428566,2.307712,1.393909,1.25424,4.082652,...,2.540956,2.966975,2.532471,3.954835,0.749656,1.550245,4.477563,0.0,1.085886,6.67
767,1,7.943679,6.379901,5.300203,0.925137,1.714265,1.203461,0.348477,0.41808,0.0,...,1.976299,1.0789,2.251086,2.679082,1.311897,1.898056,5.741898,0.281293,0.542943,6.54
768,1,7.242767,6.450789,4.968941,0.277541,1.285699,0.347235,0.174239,0.0,0.0,...,1.129314,3.236699,0.140693,0.765452,0.749656,0.695623,4.875724,0.0,0.180981,6.23
769,1,3.97184,0.0,0.0,0.0,0.214283,0.007549,0.0,0.0,0.0,...,0.0,0.0,0.0,1.275753,1.874139,0.596248,5.818736,2.812931,0.0,6.16
770,1,6.541854,6.521676,4.43892,0.185027,0.857132,0.222144,0.0,0.0,0.0,...,0.423493,0.53945,0.281386,0.255151,0.374828,0.874497,4.743003,0.0,0.361962,6.09


In [7]:
# Fit OLS with the last column of df as the response and all other columns
# (const + scaled features) as regressors.
# .iloc replaces .ix, which pandas deprecated in 0.20 and removed in 1.0.
model = sm.OLS(df.iloc[:, -1], df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     290.5
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        17:32:30   Log-Likelihood:                 530.96
No. Observations:                 771   AIC:                            -999.9
Df Residuals:                     740   BIC:                            -855.9
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9652      0.044    134.123      0.0

# Remove Some Features

In [12]:
# Columns left out of the reduced model
remove_column_list = [
    "age", "tall", "weight", "red", "ps_x", "fouls",
    "offsides", "owng", "off", "disp", "unstch", "crosses",
]
# df itself is left untouched; the reduced copy gets its own name
reature_removed_df = df.drop(labels=remove_column_list, axis=1)

In [13]:
# Refit OLS on the reduced frame: last column is the response, the remaining
# columns are the regressors.
# .iloc replaces .ix, which pandas deprecated in 0.20 and removed in 1.0.
model = sm.OLS(reature_removed_df.iloc[:, -1], reature_removed_df.iloc[:, :-1])
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     460.3
Date:                Sat, 25 Jun 2016   Prob (F-statistic):               0.00
Time:                        17:40:56   Log-Likelihood:                 526.92
No. Observations:                 771   AIC:                            -1014.
Df Residuals:                     751   BIC:                            -920.9
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.9194      0.013    445.938      0.0