In [8]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed every random source this notebook touches with the same value.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``; its string
        form is also stored in the ``PYTHONHASHSEED`` environment variable.

    Returns
    -------
    None
    """
    # Exported through the environment as text, hence the str() conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Python's stdlib generator and NumPy's legacy global generator.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


# Fix the seed once, before any data is loaded or any model is built.
seed_everything(42)

In [37]:
# Directory holding the competition CSVs (same location the notebook has always read from).
DATA_DIR = '/home/workspace/DACON/soccer/Data'


def split_date_columns(frame):
    """Return a new frame where `date` is replaced by integer `year`/`month`/`day`.

    The date text is sliced positionally (characters 0-3, 5-6 and 8-9), so a
    'YYYY-MM-DD'-style layout is assumed — TODO confirm against the raw CSVs.
    The input frame is not modified.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a string column named `date`.

    Returns
    -------
    pd.DataFrame
        Copy of `frame` with `year`, `month`, `day` appended (in that order)
        and the original `date` column removed.
    """
    date_text = frame['date']
    return frame.assign(
        year=date_text.str[0:4].astype(int),
        month=date_text.str[5:7].astype(int),
        day=date_text.str[8:10].astype(int),
    ).drop(columns=['date'])


# One shared helper for both splits so train and test can never drift apart.
train = split_date_columns(pd.read_csv(f'{DATA_DIR}/train.csv'))
test = split_date_columns(pd.read_csv(f'{DATA_DIR}/test.csv'))

In [45]:
# Binary flag per match: 1 when the result is 'H' (home win), otherwise 0.
# NOTE(review): this column is derived from `result`, i.e. from the target —
# it has to be kept out of any feature matrix built from `train`.
train['home_win'] = train['result'].eq('H').astype('int64')

# Total number of home wins per club, keyed in order of first appearance
# as a home team.
dic = {
    team: train.loc[train['homeTeam'] == team, 'home_win'].sum()
    for team in train['homeTeam'].unique()
}

# Ordinal label per club: clubs are ranked by their home-win total in
# ascending order, so the club with the fewest home wins gets 0.
# NOTE(review): the rendered mapping lists both 'Middlesboro' and
# 'Middlesbrough' — presumably one club under two spellings; verify.
label_dic = {
    team: rank
    for rank, (team, _) in enumerate(sorted(dic.items(), key=lambda item: item[1]))
}



{'Luton': 0,
 'Coventry': 1,
 'Bradford': 2,
 'Blackpool': 3,
 'Huddersfield': 4,
 'Middlesboro': 5,
 'Cardiff': 6,
 "Nott'm Forest": 7,
 'Derby': 8,
 'QPR': 9,
 'Ipswich': 10,
 'Brentford': 11,
 'Reading': 12,
 'Sheffield United': 13,
 'Hull': 14,
 'Watford': 15,
 'Norwich': 16,
 'Bournemouth': 17,
 'Brighton': 18,
 'Wigan': 19,
 'Leeds': 20,
 'Birmingham': 21,
 'Swansea': 22,
 'Burnley': 23,
 'Charlton': 24,
 'Portsmouth': 25,
 'Middlesbrough': 26,
 'Wolves': 27,
 'Crystal Palace': 28,
 'West Brom': 29,
 'Sunderland': 30,
 'Bolton': 31,
 'Stoke': 32,
 'Blackburn': 33,
 'Leicester': 34,
 'Southampton': 35,
 'Fulham': 36,
 'Aston Villa': 37,
 'West Ham': 38,
 'Newcastle': 39,
 'Everton': 40,
 'Tottenham': 41,
 'Man City': 42,
 'Chelsea': 43,
 'Liverpool': 44,
 'Arsenal': 45,
 'Man United': 46}

In [11]:
# Not part of the list below: matchID season date result goals(homeTeam) goals(awayTeam) homeTeam awayTeam
# In-match statistics, listed as (home, away) pairs.
stats_columns = [
    'halfTimeGoals(homeTeam)', 'halfTimeGoals(awayTeam)',
    'shots(homeTeam)', 'shots(awayTeam)',
    'shotsOnTarget(homeTeam)', 'shotsOnTarget(awayTeam)',
    'corners(homeTeam)', 'corners(awayTeam)',
    'fouls(homeTeam)', 'fouls(awayTeam)',
    'yellowCards(homeTeam)', 'yellowCards(awayTeam)',
    'redCards(homeTeam)', 'redCards(awayTeam)',
]


def _fixture_key(frame):
    """Build the 'home-away' pairing key used to join train statistics onto test."""
    return frame['homeTeam'] + '-' + frame['awayTeam']


# Average of every statistic over all training matches of the same
# (home, away) pairing; `match` stays a regular column for the merge below.
train['match'] = _fixture_key(train)
pair_stats = train.groupby('match', as_index=False)[stats_columns].mean()

# The test rows receive the historical pairing averages from train.
# Pairings that never occur in train come back as NaN from the left join
# and are filled with the mean of the per-pairing averages.
test['match'] = _fixture_key(test)
fallback_means = pair_stats[stats_columns].mean()
test_with_stats = test.merge(pair_stats, how='left', on='match').fillna(fallback_means)

In [13]:
# Identifier and outcome columns that must never be used as features.
non_feature_columns = ['matchID', 'goals(homeTeam)', 'goals(awayTeam)', 'result']

train_x = (
    train
    .drop(columns=non_feature_columns)
    # `home_win` is computed from `result` in an earlier cell. Left in place it
    # leaks the target into the features and, because the test data has no
    # such column, makes the column alignment below fail with a KeyError.
    # errors='ignore' keeps this cell working whether or not that cell ran.
    .drop(columns=['home_win'], errors='ignore')
)
train_y = train['result']

# Give the test features exactly the same columns, in the same order, as train.
# The explicit copy makes later column assignments write to an independent frame.
test_x = test_with_stats.drop(columns=['matchID'])
test_x = test_x[train_x.columns].copy()

In [14]:
from sklearn.preprocessing import LabelEncoder

# Names of all object-dtype (text) feature columns; each one is integer-encoded below.
encoding_target = [name for name, dtype in train_x.dtypes.items() if dtype == "object"]

for column in encoding_target:
    # A separate encoder per column, fitted on the training values only.
    encoder = LabelEncoder()
    encoder.fit(train_x[column])
    train_x[column] = encoder.transform(train_x[column])

    # Categories that occur only in the test data are appended to the
    # encoder's classes_ array so that transform() accepts them.
    unseen = [value for value in np.unique(test_x[column]) if value not in encoder.classes_]
    for value in unseen:
        encoder.classes_ = np.append(encoder.classes_, value)

    test_x[column] = encoder.transform(test_x[column])

In [15]:
# Baseline classifier: L2-penalised logistic regression, regularisation
# strength C=1.0, capped at 100 solver iterations.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# non-convergence warning from the solver would not be shown — verify.
model = LogisticRegression(
    C=1.0,
    penalty='l2',
    max_iter=100,
)

In [16]:
# Fit on the training features, then score the test rows. `prediction` holds
# one probability column per entry of model.classes_, in that order.
prediction = model.fit(train_x, train_y).predict_proba(test_x)

# Show the class order first so the probability columns can be read correctly.
display(model.classes_, prediction)

array(['A', 'D', 'H'], dtype=object)

array([[0.17876405, 0.21354052, 0.60769543],
       [0.20650164, 0.28984317, 0.5036552 ],
       [0.28672953, 0.2537285 , 0.45954197],
       [0.25356692, 0.24503775, 0.50139532],
       [0.09457494, 0.16074348, 0.74468158],
       [0.31671117, 0.29634308, 0.38694575],
       [0.07449231, 0.1887201 , 0.7367876 ],
       [0.19793407, 0.20595047, 0.59611546],
       [0.23228718, 0.2504719 , 0.51724092],
       [0.16771984, 0.24178462, 0.59049553],
       [0.2944594 , 0.26034664, 0.44519396],
       [0.27203637, 0.22763796, 0.50032567],
       [0.2865389 , 0.26829049, 0.44517061],
       [0.22549584, 0.2715879 , 0.50291625],
       [0.36661582, 0.23357632, 0.39980786],
       [0.2384537 , 0.28719604, 0.47435026],
       [0.24900712, 0.26091305, 0.49007983],
       [0.12118734, 0.19087634, 0.68793632],
       [0.3310222 , 0.296161  , 0.3728168 ],
       [0.29613095, 0.25528917, 0.44857988],
       [0.31860418, 0.2763683 , 0.40502751],
       [0.22855246, 0.33048752, 0.44096002],
       [0.

In [17]:
# Load the submission template and overwrite everything after its first
# column with the predicted class probabilities.
# NOTE(review): assumes the template's probability columns follow the same
# order as model.classes_ — TODO confirm against the template header.
submission_template = '/home/workspace/DACON/soccer/Data/sample_submission.csv'
sample_submission = pd.read_csv(submission_template)
sample_submission.iloc[:, 1:] = prediction

# Written to the current working directory, without the index column.
sample_submission.to_csv('baseline_submission.csv', index=False)

In [18]:
# 88 matches
test_x

Unnamed: 0,season,homeTeam,awayTeam,halfTimeGoals(homeTeam),halfTimeGoals(awayTeam),shots(homeTeam),shots(awayTeam),shotsOnTarget(homeTeam),shotsOnTarget(awayTeam),corners(homeTeam),...,fouls(homeTeam),fouls(awayTeam),yellowCards(homeTeam),yellowCards(awayTeam),redCards(homeTeam),redCards(awayTeam),year,month,day,match
0,202324,27,17,0.826087,0.260870,16.956522,9.000000,7.826087,3.826087,7.217391,...,11.130435,11.217391,1.086957,1.565217,0.000000,0.086957,2024,3,9,905
1,202324,6,36,0.000000,0.000000,13.000000,8.000000,3.000000,3.000000,3.000000,...,10.000000,19.000000,2.000000,1.000000,0.000000,0.000000,2024,3,9,236
2,202324,15,25,0.671383,0.506653,13.615100,10.719988,5.996081,4.681511,6.105278,...,11.370943,11.912635,1.410986,1.746479,0.059338,0.084363,2024,3,9,1602
3,202324,46,18,0.714286,0.000000,13.285714,8.571429,5.142857,3.142857,5.571429,...,11.571429,13.000000,1.571429,1.714286,0.000000,0.000000,2024,3,9,1575
4,202324,0,8,0.000000,0.000000,23.500000,7.500000,7.500000,2.000000,10.500000,...,9.500000,7.000000,0.500000,1.500000,0.000000,0.000000,2024,3,9,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,202324,36,32,0.671383,0.506653,13.615100,10.719988,5.996081,4.681511,6.105278,...,11.370943,11.912635,1.410986,1.746479,0.059338,0.084363,2024,5,4,1608
84,202324,25,17,0.671383,0.506653,13.615100,10.719988,5.996081,4.681511,6.105278,...,11.370943,11.912635,1.410986,1.746479,0.059338,0.084363,2024,5,4,1605
85,202324,10,30,0.500000,0.500000,13.500000,11.166667,4.000000,3.833333,6.166667,...,10.166667,11.000000,1.166667,0.833333,0.000000,0.000000,2024,5,4,339
86,202324,8,18,1.000000,1.000000,14.000000,9.000000,6.000000,5.000000,2.000000,...,12.000000,17.000000,0.000000,3.000000,0.000000,0.000000,2024,5,4,273
