In [1]:
import matplotlib.pyplot as plt
import seaborn as sns # seaborn là thư viện được xây trên matplotlib, 
                      # giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np
pd.set_option("max_columns", None) # all df cols

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram') # Để trực quan hóa pipeline

Description

In [2]:
with open('../Data/raw_description.txt', 'r') as f:
    for line in f:
        print(line.strip())

Date : Date
H_Rk : Home Rank
H_Team : Home Team
H_G : Home Games
H_MP : Home Minutes Played
H_FG : Home Field Goals
H_FGA : Home Field Goal Attempts
H_FG% : Home Field Goal Percentage
H_3P : Home 3-Point Field Goals
H_3PA : Home 3-Point Field Goal Attempts
H_3P% : Home 3-Point Field Goal Percentage
H_2P : Home 2-Point Field Goals
H_2PA : Home 2-point Field Goal Attempts
H_2P% : Home 2-Point Field Goal Percentage
H_FT : Home Free Throws
H_FTA : Home Free Throw Attempts
H_FT% : Home Free Throw Percentage
H_ORB : Home Offensive Rebounds
H_DRB : Home Defensive Rebounds
H_TRB : Home Total Rebounds
H_AST : Home Assists
H_STL : Home Steals
H_BLK : Home Blocks
H_TOV : Home Turnovers
H_PF : Home Personal Fouls
H_PTS : Home Points
A_Rk : Away Rank
A_Team : Away Team
A_G : Away Games
A_MP : Away Minutes Played
A_FG : Away Field Goals
A_FGA : Away Field Goal Attempts
A_FG% : Away Field Goal Percentage
A_3P : Away 3-Point Field Goals
A_3PA : Away 3-Point Field Goal Attempts
A_3P% : Away 3-Point Fie

In [3]:
data_df = pd.read_csv('../Data/data.csv')
data_df.head()

Unnamed: 0,Date,H_Team,H_ave3P,H_ave3PA,H_ave2P,H_ave2PA,H_aveFT,H_aveFTA,H_aveORB,H_aveDRB,H_aveAST,H_aveSTL,H_aveBLK,H_aveTOV,H_avePF,H_Elo,A_Team,A_ave3P,A_ave3PA,A_ave2P,A_ave2PA,A_aveFT,A_aveFTA,A_aveORB,A_aveDRB,A_aveAST,A_aveSTL,A_aveBLK,A_aveTOV,A_avePF,A_Elo,H_Dif
0,"July 20, 2021",MIL,12.0,32.0,30.8,59.0,15.4,22.2,13.6,31.4,23.6,7.6,3.0,9.4,17.6,1709.957578,PHO,12.0,29.4,30.4,55.4,14.8,17.2,7.2,32.2,21.2,6.2,4.0,12.0,20.2,1700.119737,7
1,"July 17, 2021",PHO,12.8,31.8,30.6,54.4,14.2,17.2,7.6,34.2,22.0,5.4,3.8,12.2,20.4,1705.763269,MIL,12.6,35.4,28.4,56.0,17.4,23.0,13.8,33.0,23.2,8.4,3.8,10.0,18.8,1704.314046,-4
2,"July 14, 2021",MIL,13.0,35.4,29.8,55.6,16.2,22.2,13.2,33.4,24.4,8.4,4.2,10.6,18.6,1694.99095,PHO,13.2,32.4,29.8,55.0,14.4,17.4,8.0,34.0,22.8,6.0,2.2,11.6,19.4,1715.086366,6
3,"July 11, 2021",MIL,11.8,36.0,29.0,53.8,15.0,20.8,13.0,32.8,22.6,8.0,4.4,11.8,18.6,1689.076362,PHO,12.2,30.2,29.0,57.8,15.8,18.4,10.2,36.4,21.8,6.2,3.2,11.6,20.0,1721.000954,20
4,"July 8, 2021",PHO,10.2,28.6,29.4,59.8,15.8,18.2,10.2,36.4,21.4,5.8,3.4,11.2,20.0,1735.055895,MIL,12.2,35.6,29.2,53.2,14.4,20.4,12.4,34.2,23.8,8.0,4.0,12.2,18.8,1675.02142,10


In [4]:
data_df.shape

(6247, 32)

In [5]:
# duplicate?
data_df.index.duplicated().sum()

0

In [6]:
# Cột output có bao nhiêu giá trị thiếu?
data_df['H_Dif'].isna().sum()

0

# DELETE

In [7]:
data_df['H_Dif'] = data_df.apply(lambda x: x['H_Dif'] > 0, axis=1)

### Split data

In [8]:
# Tách X và y
y_sr = data_df["H_Dif"] # sr là viết tắt của series
X_df = data_df.drop("H_Dif", axis=1)

In [9]:
# Split data_set and test_set (85-15)
data_X_df, test_X_df, data_y_sr, test_y_sr = train_test_split(X_df, y_sr, 
                                               test_size=0.15, 
                                               random_state=0)

# Split train_set and valid_set from data set (85-15)
train_X_df, valid_X_df, train_y_sr, valid_y_sr = train_test_split(data_X_df, data_y_sr, 
                                               test_size=0.15,
                                               random_state=0)

In [10]:
print(train_X_df.shape)
print(valid_X_df.shape)
print(test_X_df.shape)

(4512, 31)
(797, 31)
(938, 31)


### Explore train data

In [11]:
train_X_df.dtypes

Date         object
H_Team       object
H_ave3P     float64
H_ave3PA    float64
H_ave2P     float64
H_ave2PA    float64
H_aveFT     float64
H_aveFTA    float64
H_aveORB    float64
H_aveDRB    float64
H_aveAST    float64
H_aveSTL    float64
H_aveBLK    float64
H_aveTOV    float64
H_avePF     float64
H_Elo       float64
A_Team       object
A_ave3P     float64
A_ave3PA    float64
A_ave2P     float64
A_ave2PA    float64
A_aveFT     float64
A_aveFTA    float64
A_aveORB    float64
A_aveDRB    float64
A_aveAST    float64
A_aveSTL    float64
A_aveBLK    float64
A_aveTOV    float64
A_avePF     float64
A_Elo       float64
dtype: object

In [12]:
def remove_object_col(X):
    return X.select_dtypes(exclude=['object'])

### Random Forest Model

In [15]:
random_forest_model = RandomForestClassifier(random_state=0,
                                             oob_score = True,
                                             n_jobs = -1)

# Number of trees in random forest
n_estimators = list(range(100,1000,200))
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# comment
min_samples_leaf = list(range(10,100,10))

RFparam_grid = {
    'randomforestclassifier__n_estimators': n_estimators,
    'randomforestclassifier__max_features': max_features,
}

full_pipeline = make_pipeline(FunctionTransformer(remove_object_col),
                              StandardScaler(),
                              random_forest_model)

gs = GridSearchCV(estimator = full_pipeline, 
                  param_grid = RFparam_grid, 
                  scoring = 'accuracy', 
                  cv = StratifiedKFold(n_splits=5),
                  return_train_score = True)

gs.fit(data_X_df, data_y_sr)
gs.predict(test_X_df)
gs.score(test_X_df, test_y_sr)

0.6833688699360341

In [18]:
random_forest_model = RandomForestClassifier(random_state=0,
                                             oob_score = True,
                                             n_jobs = -1,
                                             min_samples_leaf = 50)
# Number of trees in random forest
n_estimators = list(range(100,1000,200))
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# comment
min_samples_leaf = list(range(10,100,10))

RFparam_grid = {
    'randomforestclassifier__n_estimators': n_estimators,
    'randomforestclassifier__max_features': max_features,
    'randomforestclassifier__min_samples_leaf': min_samples_leaf,
}

full_pipeline = make_pipeline(FunctionTransformer(remove_object_col),
                              StandardScaler(),
                              random_forest_model)

gs = RandomizedSearchCV(estimator = full_pipeline, 
                  param_distributions = RFparam_grid, 
                  scoring = 'accuracy', 
                  cv = StratifiedKFold(n_splits=5),
                  return_train_score = True,
                  n_iter = 100)

gs.fit(data_X_df, data_y_sr)
gs.predict(test_X_df)
gs.score(test_X_df, test_y_sr)

0.6865671641791045

### Logistic Regression

In [17]:
logistic_regression_model = LogisticRegression(random_state=0)

LRparam_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'logisticregression__max_iter': list(range(100,800,100)),
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

full_pipeline = make_pipeline(FunctionTransformer(remove_object_col),
                              StandardScaler(), 
                              logistic_regression_model)

gs = GridSearchCV(estimator = full_pipeline, 
                  param_grid = LRparam_grid, 
                  scoring = 'accuracy', 
                  cv = StratifiedKFold(n_splits=5),
                  return_train_score=True)
gs.fit(data_X_df, data_y_sr)
gs.predict(test_X_df)
gs.score(test_X_df, test_y_sr)

0.7089552238805971

In [19]:
gs.best_params_

{'randomforestclassifier__n_estimators': 900,
 'randomforestclassifier__min_samples_leaf': 10,
 'randomforestclassifier__max_features': 'auto'}

### Neural Network 

In [21]:
%timeit
neural_network_model = MLPClassifier(random_state=0)

GRID = [
    {'mlpclassifier__max_iter': [100, 300],
     'mlpclassifier__alpha': [0.0001, 0.001, 0.005]}
]

full_pipeline = make_pipeline(FunctionTransformer(remove_object_col),
                              StandardScaler(), 
                              MLPClassifier())

gs = GridSearchCV(estimator = full_pipeline, 
                  param_grid=GRID, 
                  scoring='accuracy', 
                  cv = StratifiedKFold(n_splits=5),
                  return_train_score=True)

gs.fit(data_X_df, data_y_sr)
gs.predict(test_X_df)
gs.score(test_X_df, test_y_sr)



0.6673773987206824

### SVM

date = input()
H_Team = input()
A_Team = input()

read raw_data => find 5 recent games => row => df(1 row)
col_name = row

=> predict H_dif = ?

crawl data ss 2021

Full version ipynb (check lib) min_ds-env 
// Crawl data => Preprocess => Best Model score