In [6]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import wrapper

from feature_engine.selection import SmartCorrelatedSelection
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
list_column = ['previousHomeWin', 'previousAwayWin', 'previousDraw',
       'previousManagerHomeWin', 'previousManagerAwayWin',
       'previousManagerDraw', 'homeAvgRating', 'homePosition', 'homePoint',
       'homeForm', 'awayAvgRating', 'awayPosition', 'awayPoint', 'awayForm',
       'homeExpectedGoal', 'awayExpectedGoal', 'homeExpectedAssist',
       'awayExpectedAssist', 'homeBallPosession', 'homeShotOnTarget',
       'awayShotOnTarget', 'homeShotOffTarget', 'awayShotOffTarget',
       'homeBlockedShot', 'awayBlockedShot', 'homeCorner', 'awayCorner',
       'homeOffside', 'awayOffside', 'homeYellowCard', 'awayYellowCard',
       'homeRedCard', 'awayRedCard', 'homeFreekick', 'awayFreekick',
       'homeThrowIn', 'awayThrowIn', 'homeGoalkick', 'awayGoalkick',
       'homeBigChance', 'awayBigChance', 'homeBigChanceMissed',
       'awayBigChanceMissed', 'homeHitWoodwork', 'awayHitWoodwork',
       'homeCounterAttack', 'awayCounterAttack', 'homeCounterAttackShot',
       'awayCounterAttackShot', 'homeCounterAttackGoal',
       'awayCounterAttackGoal', 'homeShotInsideBox', 'awayShotInsideBox',
       'homeGoalSave', 'awayGoalSave', 'homePass', 'awayPass',
       'homeAccuratePass', 'awayAccuratePass', 'homeLongPass', 'awayLongPass',
       'homeAccurateLongPass', 'awayAccurateLongPass', 'homeCross',
       'awayCross', 'homeAccurateCross', 'awayAccurateCross', 'homeDribble',
       'awwayDribble', 'homeSuccessfulDribble', 'awaySuccessfulDribble',
       'homePossesionLost', 'awayPossesionLost', 'homeDuelWon', 'awayDuelWon',
       'homeAerialWon', 'awayAerialWon', 'homeTackle', 'awayTackle',
       'homeInterception', 'awayInterception', 'homeClearance',
       'awayClearance', 'homeTeamId', 'awayTeamId', 'homeScorePeriod1',
       'homeScoreCurrent', 'awayScorePeriod1', 'awayScoreCurrent']

X_train, X_test, y_train, y_test = wrapper.load_fillna("match_all_statistic.csv", list_column=list_column)

In [7]:
sel = SmartCorrelatedSelection(threshold=0.9, selection_method="variance")
sel.fit(X_train, y_train)

X_train_fe_variance = sel.transform(X_train)
X_test_fe_variance = sel.transform(X_test)

sc = QuantileTransformer()
X_train_scaler = sc.fit_transform(X_train_fe_variance)
X_test_scaler = sc.transform(X_test_fe_variance)

model = AdaBoostClassifier()

# parameters = {"estimator": (LogisticRegression(), DecisionTreeClassifier(max_depth=1)), 
#               "n_estimators": (20, 30, 50, 70, 100),
#               "learning_rate": (0.2, 0.4, 0.6, 0.8, 1.0)}

model.fit(X_train_scaler, y_train)

predict = model.predict(X_test_scaler)

print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.85      0.74      0.79        23
           1       0.74      0.94      0.83        18
           3       0.91      0.86      0.89        37

    accuracy                           0.85        78
   macro avg       0.83      0.85      0.84        78
weighted avg       0.85      0.85      0.85        78



In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

scaler1 = QuantileTransformer()
scaler1.fit(X_train)

X_train_scaler = scaler1.transform(X_train)
X_test_scaler = scaler1.transform(X_test)

scaler2 = StandardScaler()
scaler2.fit(X_train_scaler)

X_train_scaler = scaler2.transform(X_train_scaler)
X_test_scaler = scaler2.transform(X_test_scaler)

clf = LogisticRegression(C=1000, penalty='l2', max_iter=300, random_state=10)

sel = SelectFromModel(clf)

sel.fit(X_train_scaler, y_train)

X_train_sfm = sel.transform(X_train_scaler)
X_test_sfm = sel.transform(X_test_scaler)

parameters = {"C": (1, 5, 10, 50, 100), 
              "class_weight": (None, "balanced"),
              "max_iter": (100, 300, 500, 700, 1000)}

# clf = LogisticRegression(C=1, penalty='l2', max_iter=300, random_state=10)
clf = GridSearchCV(LogisticRegression(), parameters, scoring="accuracy", cv=3, verbose=3)
clf.fit(X_train_sfm, y_train)

predict = clf.predict(X_test_sfm)

print(clf.best_params_)
print(classification_report(y_test, predict))


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3] END C=1, class_weight=None, max_iter=100;, score=0.760 total time=   0.0s
[CV 2/3] END C=1, class_weight=None, max_iter=100;, score=0.822 total time=   0.0s
[CV 3/3] END C=1, class_weight=None, max_iter=100;, score=0.842 total time=   0.0s
[CV 1/3] END C=1, class_weight=None, max_iter=300;, score=0.760 total time=   0.0s
[CV 2/3] END C=1, class_weight=None, max_iter=300;, score=0.822 total time=   0.0s
[CV 3/3] END C=1, class_weight=None, max_iter=300;, score=0.842 total time=   0.0s
[CV 1/3] END C=1, class_weight=None, max_iter=500;, score=0.760 total time=   0.0s
[CV 2/3] END C=1, class_weight=None, max_iter=500;, score=0.822 total time=   0.0s
[CV 3/3] END C=1, class_weight=None, max_iter=500;, score=0.842 total time=   0.0s
[CV 1/3] END C=1, class_weight=None, max_iter=700;, score=0.760 total time=   0.0s
[CV 2/3] END C=1, class_weight=None, max_iter=700;, score=0.822 total time=   0.0s
[CV 3/3] END C=1, class_w

In [21]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.svm import LinearSVC

sel = SelectKBest(f_classif, k=53)
sel.fit(X_train, y_train)

X_train_f = sel.transform(X_train)
X_test_f = sel.transform(X_test)

scaler1 = QuantileTransformer()
scaler1.fit(X_train_f)

X_train_scaler = scaler1.transform(X_train_f)
X_test_scaler = scaler1.transform(X_test_f)

scaler2 = StandardScaler()
scaler2.fit(X_train_scaler)

X_train_scaler = scaler2.transform(X_train_scaler)
X_test_scaler = scaler2.transform(X_test_scaler)

model = LinearSVC()

parameters = {"C": (0.1, 1, 5, 10, 50, 100), 
              "max_iter": (100, 300, 500, 700, 1000, 2000)}

clf = GridSearchCV(LinearSVC(), parameters, scoring="accuracy", cv=2, verbose=3)

clf.fit(X_train_sfm, y_train)

predict = clf.predict(X_test_sfm)

print(clf.best_params_)
print(classification_report(y_test, predict))

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV 1/2] END ...............C=0.1, max_iter=100;, score=0.726 total time=   0.0s
[CV 2/2] END ...............C=0.1, max_iter=100;, score=0.785 total time=   0.0s
[CV 1/2] END ...............C=0.1, max_iter=300;, score=0.726 total time=   0.0s
[CV 2/2] END ...............C=0.1, max_iter=300;, score=0.785 total time=   0.0s
[CV 1/2] END ...............C=0.1, max_iter=500;, score=0.726 total time=   0.0s
[CV 2/2] END ...............C=0.1, max_iter=500;, score=0.785 total time=   0.0s
[CV 1/2] END ...............C=0.1, max_iter=700;, score=0.726 total time=   0.0s
[CV 2/2] END ...............C=0.1, max_iter=700;, score=0.785 total time=   0.0s
[CV 1/2] END ..............C=0.1, max_iter=1000;, score=0.726 total time=   0.0s
[CV 2/2] END ..............C=0.1, max_iter=1000;, score=0.785 total time=   0.0s
[CV 1/2] END ..............C=0.1, max_iter=2000;, score=0.726 total time=   0.0s
[CV 2/2] END ..............C=0.1, max_iter=2000;

In [26]:
scaler1 = QuantileTransformer()
scaler1.fit(X_train)

X_train_scaler = scaler1.transform(X_train)
X_test_scaler = scaler1.transform(X_test)

scaler2 = StandardScaler()
scaler2.fit(X_train_scaler)

X_train_scaler = scaler2.transform(X_train_scaler)
X_test_scaler = scaler2.transform(X_test_scaler)

clf = RandomForestClassifier()

sel = SelectFromModel(clf)

sel.fit(X_train_scaler, y_train)

X_train_sfm = sel.transform(X_train_scaler)
X_test_sfm = sel.transform(X_test_scaler)

parameters = {"n_estimators": (20, 50, 100, 200), 
              "max_depth": (None, 5, 10, 20)}

# clf = LogisticRegression(C=1, penalty='l2', max_iter=300, random_state=10)
clf = GridSearchCV(RandomForestClassifier(), parameters, scoring="accuracy", cv=3, verbose=3)
clf.fit(X_train_sfm, y_train)

print(clf.score(X_train_sfm, y_train))

predict = clf.predict(X_test_sfm)

print(clf.best_params_)
print(classification_report(y_test, predict))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3] END ...max_depth=None, n_estimators=20;, score=0.719 total time=   0.0s
[CV 2/3] END ...max_depth=None, n_estimators=20;, score=0.760 total time=   0.0s
[CV 3/3] END ...max_depth=None, n_estimators=20;, score=0.740 total time=   0.0s
[CV 1/3] END ...max_depth=None, n_estimators=50;, score=0.747 total time=   0.0s
[CV 2/3] END ...max_depth=None, n_estimators=50;, score=0.795 total time=   0.0s
[CV 3/3] END ...max_depth=None, n_estimators=50;, score=0.767 total time=   0.0s
[CV 1/3] END ..max_depth=None, n_estimators=100;, score=0.781 total time=   0.1s
[CV 2/3] END ..max_depth=None, n_estimators=100;, score=0.747 total time=   0.1s
[CV 3/3] END ..max_depth=None, n_estimators=100;, score=0.767 total time=   0.1s
[CV 1/3] END ..max_depth=None, n_estimators=200;, score=0.760 total time=   0.4s
[CV 2/3] END ..max_depth=None, n_estimators=200;, score=0.795 total time=   0.4s
[CV 3/3] END ..max_depth=None, n_estimators=200;

In [1]:
from feature_selection import ModelSelector

selector = ModelSelector(seed=42)
selector.fit()

Start Correlation Feature Selector

Start Statistical Feature Selector

Start Wrapper Feature Selector

Start Feature Importance Selector

Start Recursive Feature Addition Selector

Finish!!! Check the result in result.txt



In [1]:
from feature_selection import ModelSelector

selector = ModelSelector(seed=100, outfile="result_100_.txt")
selector.fit()

Start Wrapper Feature Selector

Start Feature Importance Selector

Start Recursive Feature Addition Selector

Finish!!! Check the result in result_100_.txt



In [5]:
from feature_selection import ModelSelector

selector = ModelSelector(seed=2023)
selector.fit_all()

Finish!!! Check the result in result_2023.txt



In [48]:
from wrapper import load_fillna_all
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector as SFS

X, y = load_fillna_all()

scaler1 = QuantileTransformer()
X = scaler1.fit_transform(X)

sfs = SFS(model, 
        n_features_to_select=9,
        tol=None,
        direction='forward',
        scoring='accuracy',
        cv=10,
        n_jobs=3
    )
sfs1 = sfs.fit(X, y)

X_train_sfs = sfs1.transform(X) 

model = LogisticRegression()
parameter = {'C': (0.01, 0.02, 0.05, 0.1, 0.2, 1, 5, 10, 50), 'max_iter': (100, 200, 300, 500, 700, 1000), "random_state": (0, 42, 100, 1234, 2023)}

grid = GridSearchCV(estimator=model, param_grid=parameter, scoring="accuracy", cv=10)

grid_result = grid.fit(X_train_sfs, y.values.ravel())




In [4]:
from wrapper import load_fillna
X_train, X_test, y_train, y_test = load_fillna()

In [7]:
y_train.shape

(438, 1)

In [49]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

Best: 0.835407 using {'C': 1, 'max_iter': 100, 'random_state': 0}


In [50]:
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 100, 'random_state': 0}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 100, 'random_state': 42}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 100, 'random_state': 100}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 100, 'random_state': 1234}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 100, 'random_state': 2023}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 200, 'random_state': 0}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 200, 'random_state': 42}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 200, 'random_state': 100}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 200, 'random_state': 1234}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 200, 'random_state': 2023}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 300, 'random_state': 0}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 300, 'random_state': 42}
0.579449 (0.025780) with: {'C': 0.01, 'max_iter': 300, 'random_state': 100}
0.579449 (0.02578

In [39]:
grid_result.best_estimator_

In [46]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([("quantile", scaler1), ("sel", sfs1), ("model", grid_result.best_estimator_)])

import pickle

pickle.dump(pipe, open("pipeline.pkl", "wb"))

In [51]:
X_, y_ = load_fillna_all()

pipe = Pipeline([("quantile", scaler1), ("sel", sfs1), ("model", grid_result.best_estimator_)])

pipe.score(X_, y_.values.ravel())

0.8410852713178295

In [None]:
X, y = load_fillna_all()
