# Import packages

In [1]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import os

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import MySQLdb

from sklearn.tree import export_graphviz
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

# DB

In [2]:
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Host, user and schema fall back to the values
# this notebook has always used; the password has no fallback and must be
# provided via FOOTBALL_DB_PASSWORD (a missing variable raises KeyError).
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ["FOOTBALL_DB_PASSWORD"],
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)

# forward features
# apps_sub, mins, goals, assists, yel, spg, motm, aw, tackles, inter, drb, blocks, keyp_x, fouled, unstch, avgp

# defender features
# goals, assists, spg, ps_x, motm, aw, tackles, inter, fouls, clear, owng, keyp_x, fouled, unstch, avgp

# midfielder features
# age, apps_sub, mins, goals, assists, spg, ps_x, motm, aw, tackles, inter, keyp_x, fouled, avgp

def make_query(position, min_minutes=270):
    """
    Build the SQL query that selects the feature columns of the `player`
    table for one position group.

    Only rows whose `position` value contains no comma (i.e. a single listed
    position) and whose `mins` exceeds `min_minutes` are selected.

    parameter------------
    position : one of "F", "M", "D", "G"
    min_minutes : exclusive lower bound on `mins` (default 270); cast to int

    return---------------
    SQL_QUERY String

    raise----------------
    ValueError if position is not one of the four supported codes
    (previously an unknown code silently produced a query with no WHERE
    clause, i.e. every player in the table)

    """

    # WHERE fragment identifying each position group.
    position_filters = {
        "F": 'position like "%FW%"',
        "M": 'position like "%M%"',
        # NOTE(review): LIKE without wildcards is an exact match, so this only
        # excludes the literal value " DMC" (with a leading space). Kept as-is;
        # confirm this is the intended way to drop defensive midfielders.
        "D": 'position like "%D%" and position not like " DMC"',
        "G": 'position like "%G%"',
    }

    if position not in position_filters:
        raise ValueError(
            "position must be one of 'F', 'M', 'D', 'G'; got {!r}".format(position)
        )

    SQL_QUERY = """
        SELECT 
            tall, weight, apps_sub, mins, goals, assists
            , spg, ps_x, motm, aw, tackles, inter, fouls, clear, drb
            , owng, keyp_x, fouled, off, disp, unstch, avgp
        FROM player
    """

    # int() keeps the interpolated threshold strictly numeric.
    SQL_QUERY += """
            WHERE position not like "%,%" and {} and mins > {}
        """.format(position_filters[position], int(min_minutes))

    return SQL_QUERY

In [3]:
# Load one DataFrame per single-position group, in the fixed order
# forward ("F"), midfielder ("M"), defender ("D"), goalkeeper ("G").
position_frames = []
for position_code in ("F", "M", "D", "G"):
    SQL_QUERY = make_query(position_code)
    position_frames.append(pd.read_sql(SQL_QUERY, db))

forword_df, midfilder_df, defencer_df, goalkeeper_df = position_frames

# Row count of each group, in the same order as above.
tuple(len(frame) for frame in position_frames)

(291, 997, 817, 213)

In [4]:
# Tag each single-position frame (in place) with its integer class label:
# 0 = forward, 1 = midfielder, 2 = defender, 3 = goalkeeper.
labelled_frames = [forword_df, midfilder_df, defencer_df, goalkeeper_df]
for position_label, frame in enumerate(labelled_frames):
    frame["position"] = position_label

# Stack the four groups into one frame; "position" ends up as the last column.
# Each group's own row index is kept, so index values repeat across groups.
concated_df = pd.concat(labelled_frames)
concated_df.tail()

Unnamed: 0,tall,weight,apps_sub,mins,goals,assists,spg,ps_x,motm,aw,...,clear,drb,owng,keyp_x,fouled,off,disp,unstch,avgp,position
208,187,82,0,2430,0,0,0.0,59.7,0,0.5,...,1.2,0.0,0,0.0,0.3,0.0,0.0,0.1,25.5,3
209,198,83,2,1397,0,0,0.0,47.0,0,0.4,...,1.1,0.1,0,0.0,0.2,0.0,0.0,0.0,19.8,3
210,199,91,0,2020,0,0,0.0,48.2,0,0.3,...,1.1,0.0,0,0.0,0.0,0.0,0.0,0.0,22.7,3
211,188,78,0,2970,0,0,0.0,54.2,0,0.2,...,0.8,0.1,0,0.0,0.2,0.0,0.0,0.0,28.9,3
212,185,81,0,450,0,0,0.0,62.8,0,0.2,...,0.8,0.0,0,0.2,0.0,0.0,0.0,0.0,27.4,3


# DecisionTree (entropy, gini)

In [5]:
# Features are every column except the last; the last column ("position") is
# the class label. .iloc replaces the deprecated .ix indexer and selects the
# same columns by position. 20% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    concated_df.iloc[:, :-1],
    concated_df.iloc[:, -1],
    test_size=0.2,
    random_state=1,
)

In [6]:
# Depth-3 decision tree using entropy as the split criterion.
# random_state is fixed (same seed as the train/test split) so that ties
# between equally good splits are broken the same way on every run.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=1,
).fit(X_train, y_train)

In [7]:
# Confusion matrix of the entropy tree on the held-out rows
# (rows: true position label, columns: predicted label).
entropy_test_pred = model.predict(X_test)
confusion_matrix(y_test, entropy_test_pred)

array([[ 50,   1,   1,   0],
       [ 41, 163,  16,   0],
       [  0,  20, 138,   0],
       [  0,   0,   0,  34]])

In [8]:
# Per-class precision / recall / F1 of the entropy tree on the held-out rows.
entropy_report = classification_report(y_test, model.predict(X_test))
print(entropy_report)

             precision    recall  f1-score   support

          0       0.55      0.96      0.70        52
          1       0.89      0.74      0.81       220
          2       0.89      0.87      0.88       158
          3       1.00      1.00      1.00        34

avg / total       0.86      0.83      0.83       464



In [9]:
# Same depth-3 tree, but with Gini impurity as the split criterion, for
# comparison with the entropy tree. random_state is fixed so that ties
# between equally good splits are broken the same way on every run.
model1 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=1,
).fit(X_train, y_train)

In [10]:
# Confusion matrix of the Gini tree on the held-out rows
# (rows: true position label, columns: predicted label).
gini_test_pred = model1.predict(X_test)
confusion_matrix(y_test, gini_test_pred)

array([[ 46,   5,   1,   0],
       [ 33, 175,  12,   0],
       [  0,  23, 134,   1],
       [  0,   0,   0,  34]])

In [11]:
# Per-class precision / recall / F1 of the Gini tree on the held-out rows.
gini_report = classification_report(y_test, model1.predict(X_test))
print(gini_report)

             precision    recall  f1-score   support

          0       0.58      0.88      0.70        52
          1       0.86      0.80      0.83       220
          2       0.91      0.85      0.88       158
          3       0.97      1.00      0.99        34

avg / total       0.86      0.84      0.84       464



In [12]:
# Players whose `position` value lists several positions (contains a comma)
# and who have more than 270 minutes. The same feature columns as the
# training data are selected, with the raw `position` string as last column.
SQL_QUERY = """
    SELECT 
        tall, weight, apps_sub, mins, goals, assists
        , spg, ps_x, motm, aw, tackles, inter, fouls, clear, drb
        , owng, keyp_x, fouled, off, disp, unstch, avgp, position
    FROM player
    WHERE position like "%,%" and mins > 270
    ;
"""

many_position_player_df = pd.read_sql(SQL_QUERY, db)

# Number of multi-position players returned.
many_position_player_df.shape[0]

586

In [13]:
# Select the feature columns by name, in the order the model was trained on.
# The previous positional slice (.ix[:, :-1], a deprecated indexer) broke when
# this cell was re-run: by then "recomend_position" is the last column, so the
# slice kept the raw `position` strings and dropped the wrong column.
feature_columns = list(X_train.columns)
predict_data = model.predict(many_position_player_df[feature_columns])

# Store the entropy tree's predicted label (0-3) next to each player.
many_position_player_df["recomend_position"] = predict_data

# Naive Bayes (Gaussian)

In [14]:
# Gaussian naive Bayes fitted on the same training split as the trees.
# GaussianNB is imported in the import cell at the top of the notebook.
clf_norm = GaussianNB().fit(X_train, y_train)

In [15]:
# Class labels the fitted model knows about (the integer position codes).
clf_norm.classes_

array([0, 1, 2, 3])

In [16]:
# Number of training rows per class, in the same order as classes_.
clf_norm.class_count_

array([ 239.,  777.,  659.,  179.])

In [17]:
# Prior probability of each class, i.e. its share of the training rows.
clf_norm.class_prior_

array([ 0.12891046,  0.41909385,  0.35544768,  0.096548  ])

In [18]:
# Fitted per-class, per-feature Gaussian parameters: one row per class, one
# column per feature. Per scikit-learn's GaussianNB docs, theta_ holds the
# means and sigma_ the variances.
# NOTE(review): newer scikit-learn releases expose the variances as var_
# instead of sigma_ — confirm against the installed version before upgrading.
clf_norm.theta_, clf_norm.sigma_

(array([[  1.83075314e+02,   7.76987448e+01,   7.35983264e+00,
           1.24409205e+03,   5.18410042e+00,   1.81589958e+00,
           1.58158996e+00,   6.92046025e+01,   9.62343096e-01,
           1.80627615e+00,   5.05857741e-01,   2.60251046e-01,
           1.14853556e+00,   4.88702929e-01,   3.31799163e-01,
           1.67364017e-02,   6.38075314e-01,   1.03765690e+00,
           5.60669456e-01,   1.19456067e+00,   1.55523013e+00,
           1.53025105e+01],
        [  1.78239382e+02,   7.23886744e+01,   5.54182754e+00,
           1.48401030e+03,   2.05791506e+00,   2.05405405e+00,
           1.01981982e+00,   7.86761905e+01,   8.24967825e-01,
           8.20077220e-01,   1.48545689e+00,   1.20025740e+00,
           1.08532819e+00,   6.50707851e-01,   8.47490347e-01,
           1.80180180e-02,   8.91248391e-01,   1.06808237e+00,
           1.10810811e-01,   1.03758044e+00,   1.04826255e+00,
           3.08882883e+01],
        [  1.83716237e+02,   7.77374810e+01,   1.81031866e+00,

In [19]:
# Per-class precision / recall / F1 of the naive Bayes model on the held-out
# rows, printed with four decimal places.
nb_test_pred = clf_norm.predict(X_test)
print(classification_report(y_test, nb_test_pred, digits=4))


             precision    recall  f1-score   support

          0     0.6849    0.9615    0.8000        52
          1     0.9076    0.7591    0.8267       220
          2     0.8208    0.8987    0.8580       158
          3     1.0000    1.0000    1.0000        34

avg / total     0.8599    0.8470    0.8471       464

