In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb
from sklearn.feature_selection import RFECV

In [2]:
df = pd.read_csv('master_df(train+validation).csv')

In [3]:
test_df = pd.read_csv('master_df(last_testset).csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4432 entries, 0 to 4431
Data columns (total 46 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   game_id                     4432 non-null   int64  
 1   Date                        4432 non-null   object 
 2   Team                        4432 non-null   object 
 3   Opponent                    4432 non-null   object 
 4   club_id                     4432 non-null   int64  
 5   opponent_id                 4432 non-null   int64  
 6   own_manager_name            4432 non-null   object 
 7   opponent_manager_name       4432 non-null   object 
 8   Referee                     4432 non-null   object 
 9   TY                          4432 non-null   int64  
 10  OY                          4432 non-null   int64  
 11  TR                          4432 non-null   int64  
 12  OR                          4432 non-null   int64  
 13  B365TW                      4432 

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 46 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   game_id                     44 non-null     object 
 1   Date                        44 non-null     object 
 2   Team                        44 non-null     object 
 3   Opponent                    44 non-null     object 
 4   club_id                     44 non-null     int64  
 5   opponent_id                 44 non-null     int64  
 6   own_manager_name            44 non-null     object 
 7   opponent_manager_name       44 non-null     object 
 8   Referee                     44 non-null     object 
 9   TY                          44 non-null     int64  
 10  OY                          44 non-null     int64  
 11  TR                          44 non-null     int64  
 12  OR                          44 non-null     int64  
 13  B365TW                      44 non-nu

In [6]:
full_df = pd.concat([df, test_df], axis=0)

In [7]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4476 entries, 0 to 43
Data columns (total 46 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   game_id                     4476 non-null   object 
 1   Date                        4476 non-null   object 
 2   Team                        4476 non-null   object 
 3   Opponent                    4476 non-null   object 
 4   club_id                     4476 non-null   int64  
 5   opponent_id                 4476 non-null   int64  
 6   own_manager_name            4476 non-null   object 
 7   opponent_manager_name       4476 non-null   object 
 8   Referee                     4476 non-null   object 
 9   TY                          4476 non-null   int64  
 10  OY                          4476 non-null   int64  
 11  TR                          4476 non-null   int64  
 12  OR                          4476 non-null   int64  
 13  B365TW                      4476 non-nul

In [8]:
full_df = full_df.sort_values(['club_id', 'Date'])

In [9]:
full_df.reset_index(drop=True, inplace=True)

In [10]:
# 이동평균을 할 컬럼 선정

In [11]:
# 이동평균을 하면 안되는 컬럼 : 
len(['club_id', 'opponent_id', 'B365TW', 'B365D', 'B365OW', 'Table', 'Oppo. Table', 'own_manager_name', 
     'opponent_manager_name', 'Venue', 'Result', 'Referee', 'total_games', 'wins', 'win_percentage', 
     'total_games_vs_opponent', 'wins_vs_opponent', 'win_percentage_vs_opponent'])

18

In [12]:
# 이동평균 해야하는 컬럼 : 
len(['TY', 'OY', 'TR', 'OR', 'Possesion', 'Aerial Duels(%)', 'GF', 'GA',
'Shot on Target', 'Shot on Target(%)', 'Goals per Shot',
'Expected Goals', 'Save%', 'Clean Sheet', 'Pass Completion %',
'Assists', 'Exp. Assisted Goals', 'Expected Assists', 'Tackles Won',
'% of Dribblers Tackled', 'Blocks', 'Interceptions', 'Error', 'market_value_in_eur'])

24

In [13]:
# 위의 컬럼 합쳐서 총 42개 + game_id, Date, 'Team', 'Opponent' 하면 총 46개 -> 개수 맞음

In [14]:
not_app_features = [
    'game_id', 'club_id', 'opponent_id', 'B365TW', 'B365D', 'B365OW', 'Table', 'Oppo. Table',
    'own_manager_name', 'opponent_manager_name', 'Venue', 'Result', 'Referee', 'total_games', 
    'wins', 'win_percentage', 'total_games_vs_opponent', 'wins_vs_opponent', 'win_percentage_vs_opponent'
    ]

In [15]:
len(not_app_features)

19

In [16]:
# 'Date', 'club_id' 는 학습하는 컬럼은 아니지만 인덱스를 위해 남겨두어야 하는 컬럼
# game_id가 아니라 club_id를 남겨두는 이유는 클럽별로 묶은 후 이동평균을 해야하기 때문 
# 그냥 하면 다른팀의 이전경기의 평균값을 가져오게 되므로 우선 club별로 묶은 후 이동평균을 해야하기 때문에 index로 club_id를 남겨둠
# 그래서 위에 이동평균 적용하면 안되는 컬럼에도 들어가고 여기 list에도 들어오게 됨
# game_id는 현재 여기서는 필요없어서 자연스럽게 적지 않음

app_features = [
    'Date', 'club_id', 'TY', 'OY', 'TR', 'OR', 'Possesion', 'Aerial Duels(%)', 'GF', 'GA',
    'Shot on Target', 'Shot on Target(%)', 'Goals per Shot', 'Expected Goals', 'Save%', 'Clean Sheet', 
    'Pass Completion %', 'Assists', 'Exp. Assisted Goals', 'Expected Assists', 'Tackles Won', 
    '% of Dribblers Tackled', 'Blocks', 'Interceptions', 'Error', 'market_value_in_eur'
    ]

In [17]:
len(app_features)

26

In [18]:
# 'game_id', 'Date', 'club_id' index 빼고 모델링 할 컬럼의 리스트

selected_col = ['TY', 'OY', 'TR', 'OR', 'Possesion', 'Aerial Duels(%)', 'GF', 'GA', 'Shot on Target', 
                'Shot on Target(%)', 'Goals per Shot', 'Expected Goals', 'Save%', 'Clean Sheet', 
                'Pass Completion %', 'Assists', 'Exp. Assisted Goals', 'Expected Assists', 'Tackles Won', 
                '% of Dribblers Tackled', 'Blocks', 'Interceptions', 'Error', 'market_value_in_eur'
               ]

In [19]:
full_df[app_features]

Unnamed: 0,Date,club_id,TY,OY,TR,OR,Possesion,Aerial Duels(%),GF,GA,...,Pass Completion %,Assists,Exp. Assisted Goals,Expected Assists,Tackles Won,% of Dribblers Tackled,Blocks,Interceptions,Error,market_value_in_eur
0,2018-08-12,11,2,2,0,0,42,63.6,0,2.0,...,77.1,0,0.5,0.6,14,40.0,13,12,1,3.064286e+07
1,2018-08-18,11,2,0,0,0,38,44.4,2,3.0,...,81.2,1,2.3,1.4,6,25.0,12,12,2,3.235714e+07
2,2018-08-25,11,1,3,0,0,61,50.0,3,1.0,...,82.2,2,1.6,1.4,11,29.2,14,6,1,3.021429e+07
3,2018-09-02,11,4,3,0,0,72,47.8,3,2.0,...,83.3,3,1.1,1.0,9,50.0,6,11,2,3.235714e+07
4,2018-09-15,11,0,0,0,0,63,50.0,2,1.0,...,83.0,0,0.8,1.1,6,50.0,7,7,0,3.235714e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4471,2024-03-31,1237,3,5,0,0,45,47.1,1,2.0,...,83.1,0,0.4,0.4,14,60.0,18,9,0,1.766667e+07
4472,2024-04-03,1237,1,1,0,0,66,55.2,0,0.0,...,90.0,0,1.0,1.1,7,42.9,9,8,0,1.606667e+07
4473,2024-04-06,1237,1,2,0,0,54,47.4,0,3.0,...,84.9,0,0.2,0.5,9,59.1,17,10,0,1.985714e+07
4474,2024-04-13,1237,1,2,0,0,55,59.1,1,1.0,...,84.2,0,0.5,0.9,8,64.7,11,5,1,1.813333e+07


In [20]:
moving_avg_df = full_df[app_features].groupby('club_id')[selected_col].rolling(window=3).mean().shift(1).reset_index(drop=True)

In [21]:
moving_avg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4476 entries, 0 to 4475
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   TY                      4419 non-null   float64
 1   OY                      4419 non-null   float64
 2   TR                      4419 non-null   float64
 3   OR                      4419 non-null   float64
 4   Possesion               4419 non-null   float64
 5   Aerial Duels(%)         4419 non-null   float64
 6   GF                      4419 non-null   float64
 7   GA                      4419 non-null   float64
 8   Shot on Target          4419 non-null   float64
 9   Shot on Target(%)       4419 non-null   float64
 10  Goals per Shot          4419 non-null   float64
 11  Expected Goals          4419 non-null   float64
 12  Save%                   4419 non-null   float64
 13  Clean Sheet             4419 non-null   float64
 14  Pass Completion %       4419 non-null   

In [22]:
moving_avg_df.isnull().sum()

TY                        57
OY                        57
TR                        57
OR                        57
Possesion                 57
Aerial Duels(%)           57
GF                        57
GA                        57
Shot on Target            57
Shot on Target(%)         57
Goals per Shot            57
Expected Goals            57
Save%                     57
Clean Sheet               57
Pass Completion %         57
Assists                   57
Exp. Assisted Goals       57
Expected Assists          57
Tackles Won               57
% of Dribblers Tackled    57
Blocks                    57
Interceptions             57
Error                     57
market_value_in_eur       57
dtype: int64

In [23]:
moving_avg_df

Unnamed: 0,TY,OY,TR,OR,Possesion,Aerial Duels(%),GF,GA,Shot on Target,Shot on Target(%),...,Pass Completion %,Assists,Exp. Assisted Goals,Expected Assists,Tackles Won,% of Dribblers Tackled,Blocks,Interceptions,Error,market_value_in_eur
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,1.666667,1.666667,0.000000,0.0,47.000000,52.666667,1.666667,2.000000,6.333333,44.033333,...,80.166667,1.000000,1.466667,1.133333,10.333333,31.400000,13.000000,10.000000,1.333333,3.107143e+07
4,2.333333,2.000000,0.000000,0.0,57.000000,47.400000,2.666667,2.000000,9.000000,54.500000,...,82.233333,2.000000,1.666667,1.266667,8.666667,34.733333,10.666667,9.666667,1.666667,3.164286e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4471,2.666667,2.666667,0.333333,0.0,65.666667,60.966667,0.666667,1.333333,5.000000,31.233333,...,87.800000,0.333333,0.833333,0.966667,8.333333,45.600000,10.000000,5.666667,1.333333,1.901042e+07
4472,3.000000,3.333333,0.000000,0.0,59.333333,55.233333,0.666667,1.666667,3.666667,32.200000,...,86.366667,0.000000,0.566667,0.766667,10.333333,57.266667,14.000000,7.333333,1.333333,1.854514e+07
4473,2.666667,3.333333,0.000000,0.0,58.000000,54.100000,0.666667,0.666667,4.000000,29.433333,...,87.600000,0.000000,0.533333,0.666667,8.666667,47.133333,12.000000,7.333333,0.333333,1.759861e+07
4474,1.666667,2.666667,0.000000,0.0,55.000000,49.900000,0.333333,1.666667,3.666667,26.100000,...,86.000000,0.000000,0.533333,0.666667,10.000000,54.000000,14.666667,9.000000,0.000000,1.786349e+07


In [24]:
full_df

Unnamed: 0,game_id,Date,Team,Opponent,club_id,opponent_id,own_manager_name,opponent_manager_name,Referee,TY,...,Blocks,Interceptions,Error,market_value_in_eur,total_games,wins,win_percentage,total_games_vs_opponent,wins_vs_opponent,win_percentage_vs_opponent
0,3050167,2018-08-12,Arsenal,Manchester City,11,281,Unai Emery,Pep Guardiola,M Oliver,2,...,13,12,1,3.064286e+07,0,0.0,0.000000,0,0.0,0.000000
1,3050180,2018-08-18,Arsenal,Chelsea,11,631,Unai Emery,Maurizio Sarri,M Atkinson,2,...,12,12,2,3.235714e+07,1,0.0,0.000000,0,0.0,0.000000
2,3050187,2018-08-25,Arsenal,West Ham,11,379,Unai Emery,Manuel Pellegrini,G Scott,1,...,14,6,1,3.021429e+07,2,0.0,0.000000,0,0.0,0.000000
3,3050199,2018-09-02,Arsenal,Cardiff City,11,603,Unai Emery,Neil Warnock,A Taylor,4,...,6,11,2,3.235714e+07,3,1.0,33.333333,0,0.0,0.000000
4,3050212,2018-09-15,Arsenal,Newcastle Utd,11,762,Unai Emery,Rafael Benítez,L Probert,0,...,7,7,0,3.235714e+07,4,2.0,50.000000,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4471,4095374,2024-03-31,Brighton & Hove Albion,Liverpool,1237,31,Roberto De Zerbi,Jürgen Klopp,D Coote,3,...,18,9,0,1.766667e+07,218,68.0,31.192661,11,2.0,18.181818
4472,4095382,2024-04-03,Brighton & Hove Albion,Brentford,1237,1148,Roberto De Zerbi,Thomas Frank,A Madley,1,...,9,8,0,1.606667e+07,219,68.0,31.050228,5,3.0,60.000000
4473,4095391,2024-04-06,Brighton & Hove Albion,Arsenal,1237,11,Roberto De Zerbi,Mikel Arteta,J Brooks,1,...,17,10,0,1.985714e+07,220,68.0,30.909091,11,4.0,36.363636
4474,4095403,2024-04-13,Brighton & Hove Albion,Burnley,1237,1132,Roberto De Zerbi,Vincent Kompany,S Hooper,1,...,11,5,1,1.813333e+07,221,68.0,30.769231,9,2.0,22.222222


In [25]:
df_after_moving = pd.concat([full_df[not_app_features], moving_avg_df], axis=1)

In [26]:
df_after_moving

Unnamed: 0,game_id,club_id,opponent_id,B365TW,B365D,B365OW,Table,Oppo. Table,own_manager_name,opponent_manager_name,...,Pass Completion %,Assists,Exp. Assisted Goals,Expected Assists,Tackles Won,% of Dribblers Tackled,Blocks,Interceptions,Error,market_value_in_eur
0,3050167,11,281,25.000000,26.315789,51.282051,15,3,Unai Emery,Pep Guardiola,...,,,,,,,,,,
1,3050180,11,631,22.222222,25.000000,55.555556,17,3,Unai Emery,Maurizio Sarri,...,,,,,,,,,,
2,3050187,11,379,73.529412,18.181818,11.111111,9,20,Unai Emery,Manuel Pellegrini,...,,,,,,,,,,
3,3050199,11,603,63.694268,23.094688,15.384615,9,16,Unai Emery,Neil Warnock,...,80.166667,1.000000,1.466667,1.133333,10.333333,31.400000,13.000000,10.000000,1.333333,3.107143e+07
4,3050212,11,762,52.631579,26.315789,23.809524,7,19,Unai Emery,Rafael Benítez,...,82.233333,2.000000,1.666667,1.266667,8.666667,34.733333,10.666667,9.666667,1.666667,3.164286e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4471,4095374,1237,31,13.333333,18.181818,73.529412,9,1,Roberto De Zerbi,Jürgen Klopp,...,87.800000,0.333333,0.833333,0.966667,8.333333,45.600000,10.000000,5.666667,1.333333,1.901042e+07
4472,4095382,1237,1148,36.363636,26.315789,42.016807,9,15,Roberto De Zerbi,Thomas Frank,...,86.366667,0.000000,0.566667,0.766667,10.333333,57.266667,14.000000,7.333333,1.333333,1.854514e+07
4473,4095391,1237,11,20.000000,23.094688,61.728395,10,1,Roberto De Zerbi,Mikel Arteta,...,87.600000,0.000000,0.533333,0.666667,8.666667,47.133333,12.000000,7.333333,0.333333,1.759861e+07
4474,4095403,1237,1132,28.571429,26.666667,50.000000,10,19,Roberto De Zerbi,Vincent Kompany,...,86.000000,0.000000,0.533333,0.666667,10.000000,54.000000,14.666667,9.000000,0.000000,1.786349e+07


In [27]:
final_df = df_after_moving.dropna()

In [28]:
final_df

Unnamed: 0,game_id,club_id,opponent_id,B365TW,B365D,B365OW,Table,Oppo. Table,own_manager_name,opponent_manager_name,...,Pass Completion %,Assists,Exp. Assisted Goals,Expected Assists,Tackles Won,% of Dribblers Tackled,Blocks,Interceptions,Error,market_value_in_eur
3,3050199,11,603,63.694268,23.094688,15.384615,9,16,Unai Emery,Neil Warnock,...,80.166667,1.000000,1.466667,1.133333,10.333333,31.400000,13.000000,10.000000,1.333333,3.107143e+07
4,3050212,11,762,52.631579,26.315789,23.809524,7,19,Unai Emery,Rafael Benítez,...,82.233333,2.000000,1.666667,1.266667,8.666667,34.733333,10.666667,9.666667,1.666667,3.164286e+07
5,3050217,11,29,69.444444,20.000000,13.333333,6,12,Unai Emery,Marco Silva,...,82.833333,1.666667,1.166667,1.166667,8.666667,43.066667,9.000000,8.000000,1.000000,3.164286e+07
6,3050227,11,1010,69.444444,19.607843,14.285714,5,6,Unai Emery,Javi Gracia,...,82.600000,1.666667,0.866667,1.066667,9.000000,43.866667,8.666667,9.000000,0.666667,3.188095e+07
7,3050240,11,931,60.240964,23.094688,19.047619,4,17,Unai Emery,Slavisa Jokanovic,...,81.433333,1.000000,0.833333,1.000000,7.666667,38.300000,11.000000,7.666667,0.333333,3.140476e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4471,4095374,1237,31,13.333333,18.181818,73.529412,9,1,Roberto De Zerbi,Jürgen Klopp,...,87.800000,0.333333,0.833333,0.966667,8.333333,45.600000,10.000000,5.666667,1.333333,1.901042e+07
4472,4095382,1237,1148,36.363636,26.315789,42.016807,9,15,Roberto De Zerbi,Thomas Frank,...,86.366667,0.000000,0.566667,0.766667,10.333333,57.266667,14.000000,7.333333,1.333333,1.854514e+07
4473,4095391,1237,11,20.000000,23.094688,61.728395,10,1,Roberto De Zerbi,Mikel Arteta,...,87.600000,0.000000,0.533333,0.666667,8.666667,47.133333,12.000000,7.333333,0.333333,1.759861e+07
4474,4095403,1237,1132,28.571429,26.666667,50.000000,10,19,Roberto De Zerbi,Vincent Kompany,...,86.000000,0.000000,0.533333,0.666667,10.000000,54.000000,14.666667,9.000000,0.000000,1.786349e+07


In [29]:
final_df.isnull().sum()

game_id                       0
club_id                       0
opponent_id                   0
B365TW                        0
B365D                         0
B365OW                        0
Table                         0
Oppo. Table                   0
own_manager_name              0
opponent_manager_name         0
Venue                         0
Result                        0
Referee                       0
total_games                   0
wins                          0
win_percentage                0
total_games_vs_opponent       0
wins_vs_opponent              0
win_percentage_vs_opponent    0
TY                            0
OY                            0
TR                            0
OR                            0
Possesion                     0
Aerial Duels(%)               0
GF                            0
GA                            0
Shot on Target                0
Shot on Target(%)             0
Goals per Shot                0
Expected Goals                0
Save%   

In [30]:
# moving average까지 완료

In [31]:
# 감독이름은 나중에 하면 own, opponet team의 감독이름을 따로 라벨링하기 어려워서 먼저 함

In [32]:
# 범주형 컬럼 라벨링 시작

In [33]:
# own_manager_name 인코딩
label_encoder = LabelEncoder()
label_encoder.fit(final_df['own_manager_name'])

In [34]:
# 변환
final_df['own_manager_name'] = label_encoder.transform(final_df['own_manager_name'])

In [35]:
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

In [36]:
label_mapping

{'Aaron Danks': 0,
 'Adam Sadler': 1,
 'Albert Stuivenberg': 2,
 'Andoni Iraola': 3,
 'Ange Postecoglou': 4,
 'Antonio Conte': 5,
 'Björn Hamberg': 6,
 'Brendan Rodgers': 7,
 'Bruno Lage': 8,
 'Bruno Saltor': 9,
 'Carlo Ancelotti': 10,
 'Chris Hughton': 11,
 'Chris Wilder': 12,
 'Claude Puel': 13,
 'Claudio Ranieri': 14,
 'Cristian Stellini': 15,
 'Daniel Farke': 16,
 'David Moyes': 17,
 'David Wagner': 18,
 'Dean Smith': 19,
 'Duncan Ferguson': 20,
 'Eddie Howe': 21,
 'Erik ten Hag': 22,
 'Frank Lampard': 23,
 'Freddie Ljungberg': 24,
 "Gary O'Neil": 25,
 'Graeme Jones': 26,
 'Graham Potter': 27,
 'Hayden Mullins': 28,
 'Ian Woan': 29,
 'Jan Siewert': 30,
 'Jason Tindall': 31,
 'Javi Gracia': 32,
 'Jesse Marsch': 33,
 'José Mourinho': 34,
 'Julen Lopetegui': 35,
 'Jürgen Klopp': 36,
 'Kelvin Davis': 37,
 'Manuel Pellegrini': 38,
 'Marcelo Bielsa': 39,
 'Marco Silva': 40,
 'Mark Hudson': 41,
 'Mark Hughes': 42,
 'Mauricio Pochettino': 43,
 'Maurizio Sarri': 44,
 'Michael Carrick': 45,


In [37]:
# opponent_manager_name 인코딩
label_encoder2 = LabelEncoder()
label_encoder2.fit(final_df['opponent_manager_name'])

In [38]:
# test_set 변환
final_df['opponent_manager_name'] = label_encoder2.transform(final_df['opponent_manager_name'])

In [39]:
label_mapping2 = dict(zip(label_encoder2.classes_, range(len(label_encoder2.classes_))))

In [40]:
label_mapping == label_mapping2

True

In [41]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4419 entries, 3 to 4475
Data columns (total 43 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   game_id                     4419 non-null   object 
 1   club_id                     4419 non-null   int64  
 2   opponent_id                 4419 non-null   int64  
 3   B365TW                      4419 non-null   float64
 4   B365D                       4419 non-null   float64
 5   B365OW                      4419 non-null   float64
 6   Table                       4419 non-null   int64  
 7   Oppo. Table                 4419 non-null   int64  
 8   own_manager_name            4419 non-null   int64  
 9   opponent_manager_name       4419 non-null   int64  
 10  Venue                       4419 non-null   object 
 11  Result                      4419 non-null   object 
 12  Referee                     4419 non-null   object 
 13  total_games                 4419 non-n

In [42]:
# own, opponent팀 나누기 

In [43]:
home_df = final_df[final_df['Venue'] == 'Home']

In [44]:
away_df = final_df[final_df['Venue'] == 'Away']

In [45]:
merged_df = pd.merge(home_df, away_df, on='game_id')

In [46]:
merged_df

Unnamed: 0,game_id,club_id_x,opponent_id_x,B365TW_x,B365D_x,B365OW_x,Table_x,Oppo. Table_x,own_manager_name_x,opponent_manager_name_x,...,Pass Completion %_y,Assists_y,Exp. Assisted Goals_y,Expected Assists_y,Tackles Won_y,% of Dribblers Tackled_y,Blocks_y,Interceptions_y,Error_y,market_value_in_eur_y
0,3050217,11,29,69.444444,20.000000,13.333333,6,12,82,40,...,77.633333,1.333333,0.766667,0.866667,12.333333,50.566667,9.666667,10.000000,0.666667,1.792857e+07
1,3050227,11,1010,69.444444,19.607843,14.285714,5,6,82,32,...,65.400000,1.333333,0.766667,0.733333,12.000000,52.166667,13.000000,16.000000,0.333333,7.142857e+06
2,3050247,11,1003,65.359477,22.222222,15.384615,4,11,82,13,...,78.266667,1.333333,1.000000,0.900000,12.333333,47.633333,13.333333,9.333333,0.333333,1.522344e+07
3,3050267,11,31,25.641026,25.641026,51.282051,5,3,82,36,...,82.333333,1.000000,0.800000,0.800000,8.666667,49.700000,10.666667,8.000000,0.666667,4.684615e+07
4,3050277,11,543,60.240964,23.809524,19.047619,5,11,82,53,...,78.300000,0.000000,0.900000,0.733333,9.666667,39.900000,10.333333,13.000000,0.000000,1.277381e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2194,4095279,1237,543,60.606061,23.094688,20.000000,7,11,67,25,...,72.400000,2.000000,1.200000,0.900000,13.000000,52.166667,12.666667,10.000000,0.666667,1.643611e+07
2195,4095299,1237,873,58.823529,25.000000,21.052632,8,14,67,68,...,77.433333,1.666667,0.733333,0.833333,10.666667,56.933333,11.333333,7.666667,0.000000,1.926758e+07
2196,4095333,1237,29,54.644809,26.315789,23.809524,7,15,67,73,...,73.733333,0.666667,1.200000,0.766667,15.000000,59.133333,13.333333,13.666667,0.000000,1.680149e+07
2197,4095353,1237,703,55.555556,25.000000,25.000000,8,17,67,53,...,75.666667,1.333333,1.866667,0.900000,10.333333,57.000000,14.666667,8.000000,0.333333,1.668189e+07


In [47]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 85 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   game_id                       2199 non-null   object 
 1   club_id_x                     2199 non-null   int64  
 2   opponent_id_x                 2199 non-null   int64  
 3   B365TW_x                      2199 non-null   float64
 4   B365D_x                       2199 non-null   float64
 5   B365OW_x                      2199 non-null   float64
 6   Table_x                       2199 non-null   int64  
 7   Oppo. Table_x                 2199 non-null   int64  
 8   own_manager_name_x            2199 non-null   int64  
 9   opponent_manager_name_x       2199 non-null   int64  
 10  Venue_x                       2199 non-null   object 
 11  Result_x                      2199 non-null   object 
 12  Referee_x                     2199 non-null   object 
 13  tot

In [48]:
merged_df['Venue_x'] = merged_df['club_id_x']

In [49]:
# 삭제해야 할 행
'club_id_y', 'opponent_id_y', 'B365TW_y', 'B365D_y', 'B365OW_y', 'Table_y', 'Oppo. Table_y', 'own_manager_name_y', 
'opponent_manager_name_y', 'Referee_y', 'Venue_y', 'Result_y'

('opponent_manager_name_y', 'Referee_y', 'Venue_y', 'Result_y')

In [50]:
merged_df = merged_df.drop(['club_id_y', 'opponent_id_y', 'B365TW_y', 'B365D_y', 'B365OW_y', 'Table_y', 'Oppo. Table_y', 
                'own_manager_name_y', 'opponent_manager_name_y', 'Referee_y', 'Venue_y', 'Result_y'], axis=1)

In [51]:
merged_df

Unnamed: 0,game_id,club_id_x,opponent_id_x,B365TW_x,B365D_x,B365OW_x,Table_x,Oppo. Table_x,own_manager_name_x,opponent_manager_name_x,...,Pass Completion %_y,Assists_y,Exp. Assisted Goals_y,Expected Assists_y,Tackles Won_y,% of Dribblers Tackled_y,Blocks_y,Interceptions_y,Error_y,market_value_in_eur_y
0,3050217,11,29,69.444444,20.000000,13.333333,6,12,82,40,...,77.633333,1.333333,0.766667,0.866667,12.333333,50.566667,9.666667,10.000000,0.666667,1.792857e+07
1,3050227,11,1010,69.444444,19.607843,14.285714,5,6,82,32,...,65.400000,1.333333,0.766667,0.733333,12.000000,52.166667,13.000000,16.000000,0.333333,7.142857e+06
2,3050247,11,1003,65.359477,22.222222,15.384615,4,11,82,13,...,78.266667,1.333333,1.000000,0.900000,12.333333,47.633333,13.333333,9.333333,0.333333,1.522344e+07
3,3050267,11,31,25.641026,25.641026,51.282051,5,3,82,36,...,82.333333,1.000000,0.800000,0.800000,8.666667,49.700000,10.666667,8.000000,0.666667,4.684615e+07
4,3050277,11,543,60.240964,23.809524,19.047619,5,11,82,53,...,78.300000,0.000000,0.900000,0.733333,9.666667,39.900000,10.333333,13.000000,0.000000,1.277381e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2194,4095279,1237,543,60.606061,23.094688,20.000000,7,11,67,25,...,72.400000,2.000000,1.200000,0.900000,13.000000,52.166667,12.666667,10.000000,0.666667,1.643611e+07
2195,4095299,1237,873,58.823529,25.000000,21.052632,8,14,67,68,...,77.433333,1.666667,0.733333,0.833333,10.666667,56.933333,11.333333,7.666667,0.000000,1.926758e+07
2196,4095333,1237,29,54.644809,26.315789,23.809524,7,15,67,73,...,73.733333,0.666667,1.200000,0.766667,15.000000,59.133333,13.333333,13.666667,0.000000,1.680149e+07
2197,4095353,1237,703,55.555556,25.000000,25.000000,8,17,67,53,...,75.666667,1.333333,1.866667,0.900000,10.333333,57.000000,14.666667,8.000000,0.333333,1.668189e+07


In [52]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 73 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   game_id                       2199 non-null   object 
 1   club_id_x                     2199 non-null   int64  
 2   opponent_id_x                 2199 non-null   int64  
 3   B365TW_x                      2199 non-null   float64
 4   B365D_x                       2199 non-null   float64
 5   B365OW_x                      2199 non-null   float64
 6   Table_x                       2199 non-null   int64  
 7   Oppo. Table_x                 2199 non-null   int64  
 8   own_manager_name_x            2199 non-null   int64  
 9   opponent_manager_name_x       2199 non-null   int64  
 10  Venue_x                       2199 non-null   int64  
 11  Result_x                      2199 non-null   object 
 12  Referee_x                     2199 non-null   object 
 13  tot

In [53]:
# 열 이름 변경하기

In [54]:
merged_df = merged_df.rename(columns={'opponent_id_x': 'club_id_y', 'B365TW_x': 'B365HW', 'B365D_x': 'B365D',
                                     'B365OW_x': 'B365AW', 'Oppo. Table_x': 'Table_y', 'Referee_x':'Referee',
                                     'own_manager_name_x': 'manager_name_x', 'opponent_manager_name_x':'manager_name_y',
                                     'Result_x': 'Result', 'Venue_x':'home_team_id'})

In [55]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 73 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   game_id                       2199 non-null   object 
 1   club_id_x                     2199 non-null   int64  
 2   club_id_y                     2199 non-null   int64  
 3   B365HW                        2199 non-null   float64
 4   B365D                         2199 non-null   float64
 5   B365AW                        2199 non-null   float64
 6   Table_x                       2199 non-null   int64  
 7   Table_y                       2199 non-null   int64  
 8   manager_name_x                2199 non-null   int64  
 9   manager_name_y                2199 non-null   int64  
 10  home_team_id                  2199 non-null   int64  
 11  Result                        2199 non-null   object 
 12  Referee                       2199 non-null   object 
 13  tot

In [56]:
merged_df['Result']

0       W
1       W
2       W
3       D
4       D
       ..
2194    D
2195    W
2196    D
2197    W
2198    L
Name: Result, Length: 2199, dtype: object

In [57]:
def chage_result(row):
    if row == 'W':
        return 'XW'
    elif row == 'L':
        return 'YW'
    else :
        return 'D'

In [58]:
merged_df['Result'] = merged_df['Result'].apply(chage_result)

In [59]:
merged_df

Unnamed: 0,game_id,club_id_x,club_id_y,B365HW,B365D,B365AW,Table_x,Table_y,manager_name_x,manager_name_y,...,Pass Completion %_y,Assists_y,Exp. Assisted Goals_y,Expected Assists_y,Tackles Won_y,% of Dribblers Tackled_y,Blocks_y,Interceptions_y,Error_y,market_value_in_eur_y
0,3050217,11,29,69.444444,20.000000,13.333333,6,12,82,40,...,77.633333,1.333333,0.766667,0.866667,12.333333,50.566667,9.666667,10.000000,0.666667,1.792857e+07
1,3050227,11,1010,69.444444,19.607843,14.285714,5,6,82,32,...,65.400000,1.333333,0.766667,0.733333,12.000000,52.166667,13.000000,16.000000,0.333333,7.142857e+06
2,3050247,11,1003,65.359477,22.222222,15.384615,4,11,82,13,...,78.266667,1.333333,1.000000,0.900000,12.333333,47.633333,13.333333,9.333333,0.333333,1.522344e+07
3,3050267,11,31,25.641026,25.641026,51.282051,5,3,82,36,...,82.333333,1.000000,0.800000,0.800000,8.666667,49.700000,10.666667,8.000000,0.666667,4.684615e+07
4,3050277,11,543,60.240964,23.809524,19.047619,5,11,82,53,...,78.300000,0.000000,0.900000,0.733333,9.666667,39.900000,10.333333,13.000000,0.000000,1.277381e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2194,4095279,1237,543,60.606061,23.094688,20.000000,7,11,67,25,...,72.400000,2.000000,1.200000,0.900000,13.000000,52.166667,12.666667,10.000000,0.666667,1.643611e+07
2195,4095299,1237,873,58.823529,25.000000,21.052632,8,14,67,68,...,77.433333,1.666667,0.733333,0.833333,10.666667,56.933333,11.333333,7.666667,0.000000,1.926758e+07
2196,4095333,1237,29,54.644809,26.315789,23.809524,7,15,67,73,...,73.733333,0.666667,1.200000,0.766667,15.000000,59.133333,13.333333,13.666667,0.000000,1.680149e+07
2197,4095353,1237,703,55.555556,25.000000,25.000000,8,17,67,53,...,75.666667,1.333333,1.866667,0.900000,10.333333,57.000000,14.666667,8.000000,0.333333,1.668189e+07


In [60]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 73 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   game_id                       2199 non-null   object 
 1   club_id_x                     2199 non-null   int64  
 2   club_id_y                     2199 non-null   int64  
 3   B365HW                        2199 non-null   float64
 4   B365D                         2199 non-null   float64
 5   B365AW                        2199 non-null   float64
 6   Table_x                       2199 non-null   int64  
 7   Table_y                       2199 non-null   int64  
 8   manager_name_x                2199 non-null   int64  
 9   manager_name_y                2199 non-null   int64  
 10  home_team_id                  2199 non-null   int64  
 11  Result                        2199 non-null   object 
 12  Referee                       2199 non-null   object 
 13  tot

In [61]:
# 라벨 인코더 객체 생성
label_encoder3 = LabelEncoder()

# 라벨 인코딩 적용
merged_df['Referee'] = label_encoder3.fit_transform(merged_df['Referee'])

In [62]:
label_mapping3 = dict(zip(label_encoder3.classes_, range(len(label_encoder3.classes_))))
label_mapping3

{'A Madley': 0,
 'A Marriner': 1,
 'A Moss': 2,
 'A Taylor': 3,
 'C Kavanagh': 4,
 'C Pawson': 5,
 'C Salisbury': 6,
 'D Bond': 7,
 'D Coote': 8,
 'D England': 9,
 'G Scott': 10,
 'J Brooks': 11,
 'J Gillett': 12,
 'J Moss': 13,
 'J Smith': 14,
 'K Friend': 15,
 'L Mason': 16,
 'L Probert': 17,
 'L Smith': 18,
 'M Atkinson': 19,
 'M Dean': 20,
 'M Oliver': 21,
 'M Salisbury': 22,
 'O Langford': 23,
 'P Bankes': 24,
 'P Tierney': 25,
 'R East': 26,
 'R Jones': 27,
 'R Madley': 28,
 'R Welch': 29,
 'S Allison': 30,
 'S Attwell': 31,
 'S Barrott': 32,
 'S Hooper': 33,
 'S Scott': 34,
 'S Singh': 35,
 'T Bramall': 36,
 'T Harrington': 37,
 'T Robinson': 38}

In [63]:
# 라벨 인코더 객체 생성
label_encoder4 = LabelEncoder()

# 라벨 인코딩 적용
merged_df['Result'] = label_encoder4.fit_transform(merged_df['Result'])

In [64]:
label_mapping4 = dict(zip(label_encoder4.classes_, range(len(label_encoder4.classes_))))
label_mapping4

{'D': 0, 'XW': 1, 'YW': 2}

In [65]:
merged_df.columns

Index(['game_id', 'club_id_x', 'club_id_y', 'B365HW', 'B365D', 'B365AW',
       'Table_x', 'Table_y', 'manager_name_x', 'manager_name_y',
       'home_team_id', 'Result', 'Referee', 'total_games_x', 'wins_x',
       'win_percentage_x', 'total_games_vs_opponent_x', 'wins_vs_opponent_x',
       'win_percentage_vs_opponent_x', 'TY_x', 'OY_x', 'TR_x', 'OR_x',
       'Possesion_x', 'Aerial Duels(%)_x', 'GF_x', 'GA_x', 'Shot on Target_x',
       'Shot on Target(%)_x', 'Goals per Shot_x', 'Expected Goals_x',
       'Save%_x', 'Clean Sheet_x', 'Pass Completion %_x', 'Assists_x',
       'Exp. Assisted Goals_x', 'Expected Assists_x', 'Tackles Won_x',
       '% of Dribblers Tackled_x', 'Blocks_x', 'Interceptions_x', 'Error_x',
       'market_value_in_eur_x', 'total_games_y', 'wins_y', 'win_percentage_y',
       'total_games_vs_opponent_y', 'wins_vs_opponent_y',
       'win_percentage_vs_opponent_y', 'TY_y', 'OY_y', 'TR_y', 'OR_y',
       'Possesion_y', 'Aerial Duels(%)_y', 'GF_y', 'GA_y', 'Shot o

In [66]:
#merged_df.to_csv('master_df_add_moving_encoding(train+test).csv', index=False)

In [67]:
# train_validation set과 test set으로 나누기

In [68]:
test_df = merged_df[merged_df['game_id'].str.contains('TEST_', na=False)]

In [69]:
#test_df.to_csv('test_set.csv', index=False)

In [70]:
# 'game_id' 열에서 'TEST_'를 포함하지 않은 행 추출
train_validation_df = merged_df[~merged_df['game_id'].str.contains('TEST_', na=False)]

In [71]:
#train_validation_df.to_csv('train_validation_set.csv', index=False)

In [72]:
len(train_validation_df.columns)

73

In [73]:
train_validation_df.columns

Index(['game_id', 'club_id_x', 'club_id_y', 'B365HW', 'B365D', 'B365AW',
       'Table_x', 'Table_y', 'manager_name_x', 'manager_name_y',
       'home_team_id', 'Result', 'Referee', 'total_games_x', 'wins_x',
       'win_percentage_x', 'total_games_vs_opponent_x', 'wins_vs_opponent_x',
       'win_percentage_vs_opponent_x', 'TY_x', 'OY_x', 'TR_x', 'OR_x',
       'Possesion_x', 'Aerial Duels(%)_x', 'GF_x', 'GA_x', 'Shot on Target_x',
       'Shot on Target(%)_x', 'Goals per Shot_x', 'Expected Goals_x',
       'Save%_x', 'Clean Sheet_x', 'Pass Completion %_x', 'Assists_x',
       'Exp. Assisted Goals_x', 'Expected Assists_x', 'Tackles Won_x',
       '% of Dribblers Tackled_x', 'Blocks_x', 'Interceptions_x', 'Error_x',
       'market_value_in_eur_x', 'total_games_y', 'wins_y', 'win_percentage_y',
       'total_games_vs_opponent_y', 'wins_vs_opponent_y',
       'win_percentage_vs_opponent_y', 'TY_y', 'OY_y', 'TR_y', 'OR_y',
       'Possesion_y', 'Aerial Duels(%)_y', 'GF_y', 'GA_y', 'Shot o

In [74]:
independent_variable = ['club_id_x', 'club_id_y', 'B365HW', 'B365D', 'B365AW',
       'Table_x', 'Table_y', 'manager_name_x', 'manager_name_y',
       'home_team_id', 'Referee', 'total_games_x', 'wins_x',
       'win_percentage_x', 'total_games_vs_opponent_x', 'wins_vs_opponent_x',
       'win_percentage_vs_opponent_x', 'TY_x', 'OY_x', 'TR_x', 'OR_x',
       'Possesion_x', 'Aerial Duels(%)_x', 'GF_x', 'GA_x', 'Shot on Target_x',
       'Shot on Target(%)_x', 'Goals per Shot_x', 'Expected Goals_x',
       'Save%_x', 'Clean Sheet_x', 'Pass Completion %_x', 'Assists_x',
       'Exp. Assisted Goals_x', 'Expected Assists_x', 'Tackles Won_x',
       '% of Dribblers Tackled_x', 'Blocks_x', 'Interceptions_x', 'Error_x',
       'market_value_in_eur_x', 'total_games_y', 'wins_y', 'win_percentage_y',
       'total_games_vs_opponent_y', 'wins_vs_opponent_y',
       'win_percentage_vs_opponent_y', 'TY_y', 'OY_y', 'TR_y', 'OR_y',
       'Possesion_y', 'Aerial Duels(%)_y', 'GF_y', 'GA_y', 'Shot on Target_y',
       'Shot on Target(%)_y', 'Goals per Shot_y', 'Expected Goals_y',
       'Save%_y', 'Clean Sheet_y', 'Pass Completion %_y', 'Assists_y',
       'Exp. Assisted Goals_y', 'Expected Assists_y', 'Tackles Won_y',
       '% of Dribblers Tackled_y', 'Blocks_y', 'Interceptions_y', 'Error_y',
       'market_value_in_eur_y']

In [75]:
len(independent_variable)

71

In [76]:
# 전처리 없이 쌩으로 모델링 시작

In [77]:
# 데이터셋 train과 test로 나누기
X = train_validation_df[independent_variable]
y = train_validation_df['Result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [78]:
# 1. RandomForestClassifier

In [79]:
# 쌩으로 했을 때 가장 좋은 점수

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5665137614678899
F1 Score: 0.5101199630992969


In [99]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5688073394495413
F1 Score: 0.49720234877930675


In [80]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5412844036697247
F1 Score: 0.4801577415801329


In [81]:
rf_model = RandomForestClassifier(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5458715596330275
F1 Score: 0.4835739224259119


In [82]:
rf_model = RandomForestClassifier(n_estimators=400, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5527522935779816
F1 Score: 0.48518541021123185


In [88]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5688073394495413
F1 Score: 0.49720234877930675


In [89]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5527522935779816
F1 Score: 0.4827587407892145


In [90]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.555045871559633
F1 Score: 0.4837093440077317


In [92]:
rf_model = RandomForestClassifier(n_estimators=300, max_depth=9, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5665137614678899
F1 Score: 0.498695477225173


In [91]:
# min_samples_split 추가하니까 과적합이 방지되므로 정확도는 약간 떨어진다. 정확도를 기준으로 한다면 사용 안하는게 나을 것 같다.
# min_samples_split 숫자가 커질수록 과적합 방지, default=2

rf_model = RandomForestClassifier(n_estimators=300, max_depth=9, min_samples_split=3, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5665137614678899
F1 Score: 0.4955563436492


In [93]:
# min_samples_leaf 추가하니까 과적합이 방지되므로 정확도는 약간 떨어진다.
# min_samples_leaf 숫자가 커질수록 과적합 방지, default=1

rf_model = RandomForestClassifier(n_estimators=300, max_depth=9, min_samples_leaf=2, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5665137614678899
F1 Score: 0.49601067829748835


In [101]:
# 2. XGBClassifier

In [102]:
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5389908256880734
F1 Score: 0.5158501096408834


In [103]:
xgb_model = xgb.XGBClassifier(n_estimators=200, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5252293577981652
F1 Score: 0.5030681192201881


In [104]:
xgb_model = xgb.XGBClassifier(n_estimators=300, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.4982918241767089


In [105]:
xgb_model = xgb.XGBClassifier(n_estimators=400, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5160550458715596
F1 Score: 0.4929500279059744


In [106]:
xgb_model = xgb.XGBClassifier(n_estimators=500, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5160550458715596
F1 Score: 0.4929500279059744


In [107]:
xgb_model = xgb.XGBClassifier(n_estimators=600, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49779538180218863


In [108]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5481651376146789
F1 Score: 0.5085479198275221


In [109]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=2, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5321100917431193
F1 Score: 0.5046808607934973


In [110]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=3, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5435779816513762
F1 Score: 0.5207071815780776


In [111]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=4, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5091743119266054
F1 Score: 0.48855927587142733


In [112]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5321100917431193
F1 Score: 0.506402220648877


In [113]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.4982918241767089


In [114]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49837625397927754


In [115]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=8, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5412844036697247
F1 Score: 0.5130596057600417


In [116]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=9, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5298165137614679
F1 Score: 0.5014177009106724


In [117]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.3, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49837625397927754


In [118]:
# min_child_weight의 값이 클수록 트리의 깊이가 줄어들어 모델이 더 간단해지는 경향이 있음, 그래서 과적합을 줄이는 데 도움

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, min_child_weight=1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49837625397927754


In [119]:
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, min_child_weight=2, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5504587155963303
F1 Score: 0.5253257266209236


In [120]:
# gamma :최소 손실 감소 값, 이 값보다 손실이 적게 줄어들면 해당 분할이 수행되지 않음, 과적합 방지
# 기본값 0

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, gamma=1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5412844036697247
F1 Score: 0.5101129278587345


In [121]:
# leg_lambda : L2 정규화는 모델의 가중치를 제한하여 모델이 과적합되지 않도록 도와줌. lambda 값이 클수록 가중치의 크기가 작아지고 모델이 더 간단해지는 경향
# 기본값 1

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, leg_lambda=1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49837625397927754


In [122]:
# 계속 크기가 커져도 성능이 같음.

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, leg_lambda=4, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49837625397927754


In [123]:
# L1 정규화는 모델의 가중치를 제한하여 모델이 과적합되지 않도록 도와줌. alpha 값이 클수록 모델의 가중치의 절대값이 작아지고, 따라서 모델이 더 간단해지는 경향
# 기본값 0

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, alpha=0, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5206422018348624
F1 Score: 0.49837625397927754


In [124]:
# 과적합 방지 파라미터이다 보니 숫자가 커질수록 정확도 감소

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=7, alpha=1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5321100917431193
F1 Score: 0.4983929593260239


In [125]:
# 3. GradientBoostingClassifier

In [126]:
gb_model = GradientBoostingClassifier(n_estimators=80, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5596330275229358
F1 Score: 0.5167523083011017


In [127]:
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.555045871559633
F1 Score: 0.5136959045435708


In [128]:
gb_model = GradientBoostingClassifier(n_estimators=150, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5389908256880734
F1 Score: 0.5015957282173813


In [129]:
gb_model = GradientBoostingClassifier(n_estimators=200, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5504587155963303
F1 Score: 0.5196477834092513


In [130]:
gb_model = GradientBoostingClassifier(n_estimators=300, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.555045871559633
F1 Score: 0.5281749041477855


In [131]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=1, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5665137614678899
F1 Score: 0.5069345828943035


In [132]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5596330275229358
F1 Score: 0.5155248640096045


In [133]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.555045871559633
F1 Score: 0.5136959045435708


In [134]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5596330275229358
F1 Score: 0.5155248640096045


In [135]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.06, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5573394495412844
F1 Score: 0.5030108950837773


In [136]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.08, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5527522935779816
F1 Score: 0.5029268100662584


In [137]:
# min_samples_split 기본값은 2, 과적합 방지

gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, min_samples_split=2, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5596330275229358
F1 Score: 0.5155248640096045


In [138]:
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, min_samples_split=3, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5596330275229358
F1 Score: 0.5155248640096045


In [139]:
# min_samples_leaf 기본값은 1, 값을 증가시키면 각 리프 노드가 더 많은 샘플을 가지게 되어 모델이 더 간단해지는 경향, 과적합 방지

gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=2, min_samples_leaf=1, random_state=42)
gb_model.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.5596330275229358
F1 Score: 0.5155248640096045
