In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.feature_selection import RFECV
import xgboost as xgb
import lightgbm as lgb

In [2]:
# Read the train and test CSV files from ./preprocessed_data into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./preprocessed_data/train_set.csv', './preprocessed_data/test_set.csv')
)

In [3]:
# 전처리 

In [4]:
# 1. null 체크
# 앞에서 데이터를 합치는 과정에서 null값은 다 처리

In [5]:
# 2. Outlier removal
# Dropping data by the IQR rule would delete too many rows, so outliers are identified and removed by Z-score instead

In [6]:
# Columns screened for outliers: the three B365 columns followed by every
# per-team metric twice -- all `_x` columns first, then all `_y` columns.
_outlier_metrics = [
    'total_games', 'wins', 'win_percentage',
    'total_games_vs_opponent', 'wins_vs_opponent', 'win_percentage_vs_opponent',
    'TY', 'OY', 'TR', 'OR', 'Possesion', 'Aerial Duels(%)', 'GF', 'GA',
    'Shot on Target', 'Shot on Target(%)', 'Goals per Shot', 'Expected Goals',
    'Save%', 'Clean Sheet', 'Pass Completion %', 'Assists', 'Exp. Assisted Goals',
    'Expected Assists', 'Tackles Won', '% of Dribblers Tackled', 'Blocks',
    'Interceptions', 'Error', 'market_value_in_eur',
]
distribution_check_col = ['B365HW', 'B365D', 'B365AW'] + [
    f'{metric}_{suffix}' for suffix in ('x', 'y') for metric in _outlier_metrics
]

In [7]:
# Compute z-scores, flag outliers, and return the data without them.
def remove_outliers(df, threshold):
    """Return ``df`` without the values whose |z-score| exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data. In this notebook it is called once per column through
        ``DataFrame.apply``, so ``df`` is a single column (Series).
    threshold : float
        Largest absolute z-score that is still kept.

    Returns
    -------
    Same type as ``df``
        For a Series, only the entries within the threshold (NaN entries are
        dropped because their z-score is NaN). For a DataFrame, boolean-frame
        indexing leaves the shape unchanged and turns rejected cells into NaN.

    Notes
    -----
    Values that equal the mean are given a z-score of 0. This covers constant
    columns (std == 0, so the raw z-score is 0/0 = NaN) and single-value
    columns (std is NaN): previously every value in such a column was
    discarded, which made the later ``dropna()`` wipe out the whole frame.
    """
    centered = df - df.mean()
    z_scores = centered / df.std()
    # Zero deviation from the mean is never an outlier, even when std is 0/NaN.
    z_scores = z_scores.mask(centered == 0, 0.0)
    within_threshold = (z_scores <= threshold) & (z_scores >= -threshold)
    return df[within_threshold]

In [8]:
# Thresholds of 3 or 4 would discard a large amount of data, so only values
# beyond a z-score of 5 are treated as outliers. Values removed from a column
# come back as NaN once the columns are re-aligned into the frame.
train_df[distribution_check_col] = train_df[distribution_check_col].apply(remove_outliers, threshold=5)

In [9]:
# Discard every row that contains a NaN in any column (this includes the
# values blanked out by the outlier filter).
train_df = train_df.dropna(axis=0, how='any')

In [10]:
# Display the training frame after outlier removal; 15 rows were dropped.
train_df

Unnamed: 0,game_id,club_id_x,club_id_y,B365HW,B365D,B365AW,Table_x,Table_y,manager_name_x,manager_name_y,...,Pass Completion %_y,Assists_y,Exp. Assisted Goals_y,Expected Assists_y,Tackles Won_y,% of Dribblers Tackled_y,Blocks_y,Interceptions_y,Error_y,market_value_in_eur_y
0,3050217,11,29,69.444444,20.000000,13.333333,6,12,82,40,...,77.633333,1.333333,0.766667,0.866667,12.333333,50.566667,9.666667,10.000000,0.666667,1.792857e+07
1,3050227,11,1010,69.444444,19.607843,14.285714,5,6,82,32,...,65.400000,1.333333,0.766667,0.733333,12.000000,52.166667,13.000000,16.000000,0.333333,7.142857e+06
2,3050247,11,1003,65.359477,22.222222,15.384615,4,11,82,13,...,78.266667,1.333333,1.000000,0.900000,12.333333,47.633333,13.333333,9.333333,0.333333,1.522344e+07
3,3050267,11,31,25.641026,25.641026,51.282051,5,3,82,36,...,82.333333,1.000000,0.800000,0.800000,8.666667,49.700000,10.666667,8.000000,0.666667,4.684615e+07
4,3050277,11,543,60.240964,23.809524,19.047619,5,11,82,53,...,78.300000,0.000000,0.900000,0.733333,9.666667,39.900000,10.333333,13.000000,0.000000,1.277381e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2171,4095279,1237,543,60.606061,23.094688,20.000000,7,11,67,25,...,72.400000,2.000000,1.200000,0.900000,13.000000,52.166667,12.666667,10.000000,0.666667,1.643611e+07
2172,4095299,1237,873,58.823529,25.000000,21.052632,8,14,67,68,...,77.433333,1.666667,0.733333,0.833333,10.666667,56.933333,11.333333,7.666667,0.000000,1.926758e+07
2173,4095333,1237,29,54.644809,26.315789,23.809524,7,15,67,73,...,73.733333,0.666667,1.200000,0.766667,15.000000,59.133333,13.333333,13.666667,0.000000,1.680149e+07
2174,4095353,1237,703,55.555556,25.000000,25.000000,8,17,67,53,...,75.666667,1.333333,1.866667,0.900000,10.333333,57.000000,14.666667,8.000000,0.333333,1.668189e+07


In [11]:
# 표준화

In [12]:
# 4. log 스케일링

In [13]:
# Only 'market_value_in_eur_x' and 'market_value_in_eur_y' are log-scaled:
# their values are large in magnitude and slightly skewed toward the left.
skewed_col = [f'market_value_in_eur_{suffix}' for suffix in ('x', 'y')]

In [14]:
# Replace the skewed columns with log(1 + x) in both the train and test frames.
for frame in (train_df, test_df):
    frame[skewed_col] = np.log1p(frame[skewed_col])

In [15]:
# 5. minmax scaling

In [16]:
# Columns to min-max scale: the three B365 columns followed by every per-team
# metric twice -- all `_x` columns first, then all `_y` columns. The
# market_value_in_eur columns are not part of this list.
_minmax_metrics = [
    'total_games', 'wins', 'win_percentage',
    'total_games_vs_opponent', 'wins_vs_opponent', 'win_percentage_vs_opponent',
    'TY', 'OY', 'TR', 'OR', 'Possesion', 'Aerial Duels(%)', 'GF', 'GA',
    'Shot on Target', 'Shot on Target(%)', 'Goals per Shot', 'Expected Goals',
    'Save%', 'Clean Sheet', 'Pass Completion %', 'Assists', 'Exp. Assisted Goals',
    'Expected Assists', 'Tackles Won', '% of Dribblers Tackled', 'Blocks',
    'Interceptions', 'Error',
]
col_to_minmax = ['B365HW', 'B365D', 'B365AW'] + [
    f'{metric}_{suffix}' for suffix in ('x', 'y') for metric in _minmax_metrics
]

In [17]:
# Create the MinMaxScaler; (0, 1) is its default output range, spelled out here.
scaler = MinMaxScaler(feature_range=(0, 1))

In [18]:
# Fit the scaler on the selected columns of the training frame only.
scaler.fit(train_df.loc[:, col_to_minmax])

In [19]:
# Scale the training frame with the fitted scaler and write the values back.
train_scaled = scaler.transform(train_df[col_to_minmax])
train_df[col_to_minmax] = train_scaled

In [20]:
# Scale the test frame with the scaler that was fitted on the training frame.
test_scaled = scaler.transform(test_df[col_to_minmax])
test_df[col_to_minmax] = test_scaled

In [21]:
# Model input columns: identifier / odds / table / manager / referee columns
# first, then every per-team metric twice -- all `_x` columns, then all `_y`.
_leading_features = [
    'club_id_x', 'club_id_y', 'B365HW', 'B365D', 'B365AW',
    'Table_x', 'Table_y', 'manager_name_x', 'manager_name_y',
    'home_team_id', 'Referee',
]
_team_features = [
    'total_games', 'wins', 'win_percentage',
    'total_games_vs_opponent', 'wins_vs_opponent', 'win_percentage_vs_opponent',
    'TY', 'OY', 'TR', 'OR', 'Possesion', 'Aerial Duels(%)', 'GF', 'GA',
    'Shot on Target', 'Shot on Target(%)', 'Goals per Shot', 'Expected Goals',
    'Save%', 'Clean Sheet', 'Pass Completion %', 'Assists', 'Exp. Assisted Goals',
    'Expected Assists', 'Tackles Won', '% of Dribblers Tackled', 'Blocks',
    'Interceptions', 'Error', 'market_value_in_eur',
]
independent_variable = _leading_features + [
    f'{metric}_{suffix}' for suffix in ('x', 'y') for metric in _team_features
]

In [23]:
# Features picked by the earlier feature-selection step.
selected_feature = [
    'Table_x',
    'Table_y',
    'B365HW',
    'B365AW',
    'Tackles Won_y',
    'market_value_in_eur_x',
    'B365D',
    'Expected Goals_x',
]

In [24]:
# Hard-coded game_id values that define the validation split: rows of train_df
# whose game_id is in this list become validation_df, and all remaining rows
# become final_train_df.
validation_set_game_id_list = [3838710, 3838704, 3838708, 3838706, 3838707, 3838702, 3838709,
       3838703, 3838701, 3838705, 3837992, 3837993, 3837988, 3838001,
       3837997, 3838747, 3838691, 3838694, 3838699, 3838692, 3838693,
       3838696, 3838698, 3838700, 3838697, 3838695, 3838675, 3838684,
       3838681, 3838679, 3838676, 3838678, 3838683, 3838677, 3838680,
       3838682, 3838674, 3838668, 3838672, 3838667, 3838670, 3838669,
       3838665, 3838673, 3838656, 3838653, 3838658, 3838649, 3838655,
       3838654, 3838651, 3838650, 3838652, 3838657, 3838645, 3838644,
       3838646, 3838640, 3838647, 3838643, 3838641, 3838639, 3838642,
       3838648, 3838716, 3838717, 3838714, 3838623, 3838631, 3838629,
       3838626, 3838627, 3838630, 3838628, 3838625, 3838632, 3838624,
       3838616, 3838621, 3838617, 3838622, 3838614, 3838619, 3838613,
       3838615, 3838618, 3838620, 3838748, 3838612, 3838603, 3838606,
       3838609, 3838605, 3838610, 3838607, 3838604, 3838611, 3838608,
       3838671, 3838666, 3838594, 3838597, 3838595, 3838602, 3838593,
       3838600, 3838598, 3838599, 3838601, 3838596, 4087924, 4087928,
       4087925, 4087926, 4087930, 4087927, 4087929, 4087932, 4087931,
       4087933, 4087940, 4087941, 4087936, 4087939, 4087937, 4087943,
       4087942, 4087934, 4087935, 4087949, 4087944, 4087951, 4087947,
       4087950, 4087946, 4087945, 4087953, 4087952, 4087948, 4087961,
       4087955, 4087958, 4087956, 4087957, 4087963, 4087962, 4087959,
       4087954, 4087960, 4087972, 4087968, 4087970, 4087973, 4087971,
       4087965, 4087967, 4087966, 4087964, 4087969, 4095069, 4095065,
       4095063, 4095067, 4095070, 4095068, 4095062, 4095066, 4095071,
       4095064, 4095140, 4095137, 4095136, 4095138, 4095143, 4095145,
       4095144, 4095142, 4095141, 4095139, 4087938, 4095151, 4095153,
       4095149, 4095148, 4095152, 4095150, 4095147, 4095155, 4095154,
       4095146, 4095158, 4095160, 4095156, 4095159, 4095162, 4095164,
       4095161, 4095163, 4095157, 4095165, 4095171, 4095170, 4095166,
       4095167, 4095175, 4095174, 4095172, 4095168, 4095169, 4095173,
       4095176, 4095181, 4095184, 4095177, 4095178, 4095179, 4095182,
       4095180, 4095183, 4095185, 4095187, 4095191, 4095193, 4095195,
       4095186, 4095190, 4095189, 4095188, 4095194, 4095192, 4095197,
       4095203, 4095196, 4095201, 4095202, 4095204, 4095200, 4095205,
       4095198, 4095199, 4095207, 4095214, 4095213, 4095208, 4095209,
       4095210, 4095206, 4095211, 4095215, 4095212, 4095223, 4095220,
       4095217, 4095224, 4095221, 4095219, 4095225, 4095216, 4095222,
       4095218, 4095235, 4095233, 4095227, 4095228, 4095226, 4095232,
       4095229, 4095231, 4095234, 4095230, 4095244, 4095239, 4095242,
       4095240, 4095243, 4095241, 4095237, 4095245, 4095238, 4095247,
       4095246, 4095252, 4095249, 4095250, 4095254, 4095248, 4095253,
       4095255, 4095256, 4095264, 4095260, 4095263, 4095265, 4095258,
       4095262, 4095261, 4095259, 4095257, 4095267, 4095275, 4095270,
       4095272, 4095271, 4095266, 4095268, 4095273, 4095269, 4095274,
       4095280, 4095281, 4095284, 4095283, 4095282, 4095277, 4095278,
       4095285, 4095276, 4095279, 4095288, 4095286, 4095289, 4095293,
       4095287, 4095295, 4095294, 4095290, 4095291, 4095292, 4095304,
       4095305, 4095302, 4095299, 4095300, 4095296, 4095301, 4095303,
       4095297, 4095298, 4095312, 4095316, 4095315, 4095313, 4095319,
       4095314, 4095317, 4095310, 4095318, 4095311, 4095320, 4095323,
       4095329, 4095327, 4095326, 4095321, 4095325, 4095324, 4095328,
       4095322, 4095251, 4095336, 4095331, 4095333, 4095335, 4095337,
       4095332, 4095330, 4095339, 4095338, 4095343, 4095347, 4095344,
       4095340, 4095342, 4095346, 4095349, 4095345, 4095341, 4095348,
       4095359, 4095355, 4095350, 4095357, 4095351, 4095358, 4095353,
       4095356, 4095352, 4095354, 4095236, 4095366, 4095362, 4095365,
       4095368, 4095373, 4095379, 4095377, 4095372, 4095378, 4095370,
       4095376, 4095371, 4095375, 4095374, 4095383, 4095384, 4095380,
       4095385, 4095387, 4095389, 4095381, 4095382, 4095386, 4095388,
       4095390, 4095395, 4095399, 4095393, 4095391, 4095392, 4095394,
       4095398, 4095396, 4095397]

In [25]:
# Rows whose game_id is in the validation id list form the validation set.
in_validation = train_df['game_id'].isin(validation_set_game_id_list)
validation_df = train_df.loc[in_validation]

In [26]:
# Every row whose game_id is NOT in the validation id list stays in the training set.
not_in_validation = ~train_df['game_id'].isin(validation_set_game_id_list)
final_train_df = train_df.loc[not_in_validation]

In [40]:
# Feature matrix and target column ('Result') for the train, validation and test splits.
X_train, y_train = final_train_df[independent_variable], final_train_df['Result']
X_val, y_val = validation_df[independent_variable], validation_df['Result']
X_test, y_test = test_df[independent_variable], test_df['Result']

### test set 결과 확인

In [41]:
# Gradient boosting with 70 depth-1 trees, trained on every independent variable.
gb_model = GradientBoostingClassifier(random_state=42, max_depth=1, n_estimators=70).fit(X_train, y_train)

# Score the model on the validation split (accuracy and weighted F1).
y_pred = gb_model.predict(X_val)
accuracy, f1 = accuracy_score(y_val, y_pred), f1_score(y_val, y_pred, average='weighted')
for label, value in (("Accuracy:", accuracy), ("F1 Score:", f1)):
    print(label, value)

Accuracy: 0.6144578313253012
F1 Score: 0.564654555971181


In [42]:
# Score the all-feature model on the test split (accuracy and weighted F1).
y_pred2 = gb_model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, y_pred2), f1_score(y_test, y_pred2, average='weighted')
for label, value in (("Accuracy:", accuracy), ("F1 Score:", f1)):
    print(label, value)

Accuracy: 0.5454545454545454
F1 Score: 0.5296442687747036


In [43]:
# Same gradient-boosting configuration, trained on the selected features only.
gb_model = GradientBoostingClassifier(random_state=42, max_depth=1, n_estimators=70).fit(
    X_train[selected_feature], y_train
)

# Score the model on the validation split (accuracy and weighted F1).
y_pred = gb_model.predict(X_val[selected_feature])
accuracy, f1 = accuracy_score(y_val, y_pred), f1_score(y_val, y_pred, average='weighted')
for label, value in (("Accuracy:", accuracy), ("F1 Score:", f1)):
    print(label, value)

Accuracy: 0.6289156626506024
F1 Score: 0.5669897927162024


In [44]:
# Score the selected-feature model on the test split (accuracy and weighted F1).
y_pred2 = gb_model.predict(X_test[selected_feature])
accuracy, f1 = accuracy_score(y_test, y_pred2), f1_score(y_test, y_pred2, average='weighted')
for label, value in (("Accuracy:", accuracy), ("F1 Score:", f1)):
    print(label, value)

Accuracy: 0.5454545454545454
F1 Score: 0.5189490816089282


In [48]:
# Show the most recent test-set predictions as a plain Python list.
y_pred2.tolist()

[1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 0, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1]

In [49]:
# Show the actual test-set labels for side-by-side comparison with the predictions.
test_df['Result'].tolist()

[2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 0, 1, 1, 1, 2, 1, 0, 2, 0, 1]

In [50]:
# Store the test-set predictions next to the true labels in test_df.
test_df.loc[:, 'predict'] = y_pred2

In [51]:
# Row-wise check: True where the predicted label equals the actual result.
test_df['Result'] == test_df['predict']

0     False
1      True
2      True
3     False
4     False
5      True
6      True
7     False
8      True
9      True
10    False
11     True
12    False
13     True
14    False
15    False
16     True
17     True
18    False
19     True
20    False
21     True
dtype: bool

In [52]:
# Label each test row 'correct' when the prediction matches the result, otherwise 'fail'.
is_hit = test_df['Result'].eq(test_df['predict'])
test_df['YN'] = np.where(is_hit, 'correct', 'fail')

In [53]:
# Show game id, actual result, prediction and the correct/fail flag for each test row.
test_df[['game_id', 'Result', 'predict', 'YN']]

Unnamed: 0,game_id,Result,predict,YN
0,4095401,2,1,fail
1,4095360,1,1,correct
2,4095413,1,1,correct
3,4095364,1,2,fail
4,4095405,2,1,fail
5,4095406,1,1,correct
6,4095417,2,2,correct
7,4095409,2,1,fail
8,4095410,1,1,correct
9,4095419,2,2,correct
