# 1xbet_offensive 데이터를 기준으로
## 연도, 선수 이름, (해당 시즌 기준 이적 후)팀, 포지션 정보를 가지는 테이블 생성
## 고유번호, 해당 년도 나이를 입력을 추가

In [1]:
import pandas as pd

In [2]:
# offensive data csv파일 받아오기
offensive_path = './data/new/1xbet_offensive_edited/1xbet_offensive_{}_edited.csv'

start_year, end_year = 2014, 2022
stats_df = pd.DataFrame()

for year in range(start_year, end_year+1):
    o_df = pd.read_csv(offensive_path.format(year))
    o_df['Year'] = year
    stats_df = pd.concat([stats_df, o_df])

In [3]:
stats_df.head()

Unnamed: 0,Name,Team,Age,Position,Apps,Mins,Goals,Assists,SpG,KeyP,Drb,Fouled,Off,Disp,UnsTch,Rating,Year
0,Eden Hazard,Chelsea,32,Forward,38,3379,14,9,2.052632,2.631579,4.763158,2.973684,0.105263,2.710526,2.105263,7.956842,2014
1,Alexis Sanchez,Arsenal,34,Forward,35,2953,16,8,3.485714,2.342857,3.285714,2.057143,0.228571,3.314286,2.114286,7.810857,2014
2,Sergio Aguero,Man City,35,Forward,33,2540,26,8,4.484848,1.0,2.636364,0.757576,1.0,2.727273,1.939394,7.671515,2014
3,Cesc Fabregas,Chelsea,36,Midfielder,34,2890,3,18,1.264706,2.794118,1.058824,1.058824,0.029412,1.411765,1.0,7.618529,2014
4,Santi Cazorla,Arsenal,38,Midfielder,37,2992,7,11,2.513514,2.108108,2.351351,1.648649,0.054054,1.486486,1.0,7.599459,2014


In [4]:
# 시즌 도중 이적하여 한 년도에 2개의 데이터가 들어있는 선수 예시
stats_df[stats_df['Name'] == 'Theo Walcott'][['Name', 'Team', 'Year']]

Unnamed: 0,Name,Team,Year
339,Theo Walcott,Arsenal,2014
348,Theo Walcott,Arsenal,2015
111,Theo Walcott,Arsenal,2016
49,Theo Walcott,Everton,2017
494,Theo Walcott,Arsenal,2017
322,Theo Walcott,Everton,2018
353,Theo Walcott,Everton,2019
288,Theo Walcott,Southampton,2020
522,Theo Walcott,Everton,2020
499,Theo Walcott,Southampton,2021


In [5]:
# 선수 정보만 가져온 테이블 - 시즌 중 이적한 선수의 데이터를 이적한 팀 기준으로 정리해야 함
player_table = stats_df[['Year', 'Name', 'Age', 'Team', 'Position']]
player_table

# 비교를 위한 원본 데이터 유지
origin_player = player_table

In [6]:
# 이름 별로 나온 횟수
names = player_table.Name.value_counts()

# 최대 9번의 이름만 나와야 정상 -> 10번 이상 나온 선수 목록 추출
dupli_names = names[player_table.Name.value_counts() > 9]

# 시즌 당 복수 데이터가 2번 발생한 선수 2명, 1번 발생한 선수 8명
# Theo Walcott --> 17년도 아스날 -> 에버튼 / 20년도 에버튼 -> 소튼
# Calum Chambers --> 16년도 아스날 -> 미들즈브러 / 21년도 아스날 -> 빌라

In [7]:
# 정리해야 할 선수 목록
dupli_names

Theo Walcott               11
Calum Chambers             11
Danny Welbeck              10
Jeffrey Schlupp            10
Ruben Loftus-Cheek         10
Cedric Soares              10
Nathan Ake                 10
Matt Targett               10
Jonjo Shelvey              10
Alex Oxlade Chamberlain    10
Name: Name, dtype: int64

정리해야 하는 선수들의 경우  
"문제가 있는 년도 팀명 --> 2개  
이전 혹은 이후 년도 팀명 --> 1개"  
의 구조를 따른다.  
이전, 이후 년도 팀명 중 하나를 사용하여 문제가 있는 년도의 팀명을 통합

In [8]:
# 이전 팀명과 비교
def compare_before(target, name):
    # 타켓 년도, 선수
    player = player_table[(player_table['Year'] == target) & (player_table['Name'] == name)]
    # 비교할 이전 시즌 팀명
    compare_team = player_table[(player_table['Year'] ==  (target - 1)) \
                                        & (player_table['Name'] == name)]['Team'].unique()[0]
    # 타켓 년도 두 개의 팀명
    target_team1 = player['Team'].unique()[0]
    target_team2 = player['Team'].unique()[1]
    
    # 작년 소속팀, 올해 두 개의 소속팀이 모두 다르다면 내년 팀과 비교
    if target_team1 != compare_team != target_team2:
        return compare_after(target, name)
    
    # 작년 소속팀과 같으면 인덱스 저장 (삭제를 위해)
    if target_team1 == compare_team:
        idx = player.index.unique()[0]
        return idx

    if target_team2 == compare_team:
        idx = player.index.unique()[1]
        return idx

In [9]:
# 이후 팀명과 비교
def compare_after(target, name):
    # 타켓 년도, 선수
    player = player_table[(player_table['Year'] == target) & (player_table['Name'] == name)]
    # 비교할 이후 시즌 팀명
    compare_team = player_table[(player_table['Year'] ==  (target + 1)) \
                                        & (player_table['Name'] == name)]['Team'].unique()[0]
    # 타켓 년도 두 개의 팀명   
    target_team1 = player['Team'].unique()[0]
    target_team2 = player['Team'].unique()[1]

    # 내년 소속팀과 다르면 인덱스 저장 (삭제를 위해)
    if target_team1 != compare_team:
        idx = player.index.unique()[0]
        return idx

    if target_team2 != compare_team:
        idx = player.index.unique()[1]
        return idx

In [10]:
# 선수 데이터 삭제 함수
def del_player(name):
    # 특정 선수 이름을 통해 중복이 있는 년도를 뽑아 타켓 년도 추출
    player = player_table[player_table['Name'] == name]
    dupli_year = player.groupby('Year')['Name'].count()
    targets = dupli_year[dupli_year > 1].index
    
    idxs = []
    
    # 타켓 년도 별로 선수 데이터 정리
    for target in targets:
        if target == 2014:
            idxs.append(compare_after(target, name))     
        elif target == 2022:
            idxs.append(compare_before(target, name))    
        else:
            idxs.append(compare_before(target, name))
            
    return idxs

In [11]:
# 정리가 필요한 선수 리스트를 순회하며 데이터 정리
for name in dupli_names.index:
    idxs = del_player(name)
    for idx in idxs:
        player_table = player_table.drop(index = idx)

In [12]:
# 전체 데이터에서 10회 이상 나오는 선수가 없음을 확인
names = player_table.Name.value_counts()
names[player_table.Name.value_counts() > 9]

Series([], Name: Name, dtype: int64)

In [13]:
# 전체 데이터에서 10회 이상 나오는 선수가 없음을 확인
player_table.Name.value_counts().sort_values(ascending = False)

James Ward-Prowse          9
James Tomkins              9
Matt Targett               9
Alex Oxlade Chamberlain    9
Jordan Henderson           9
                          ..
Neto                       1
Sven Botman                1
Ivan Perisic               1
Dango Ouattara             1
Andrew Moran               1
Name: Name, Length: 1712, dtype: int64

In [14]:
# 이전 데이터와 현재 데이터 비교
display(origin_player[origin_player['Name'] == 'Danny Welbeck'])
player_table[player_table['Name'] == 'Danny Welbeck']

Unnamed: 0,Year,Name,Age,Team,Position
62,2014,Danny Welbeck,32,Arsenal,Forward
420,2014,Danny Welbeck,32,Man Utd,Forward
94,2015,Danny Welbeck,32,Arsenal,Forward
321,2016,Danny Welbeck,32,Arsenal,Forward
345,2017,Danny Welbeck,32,Arsenal,Forward
376,2018,Danny Welbeck,32,Arsenal,Forward
380,2019,Danny Welbeck,32,Watford,Forward
231,2020,Danny Welbeck,32,Brighton,Forward
216,2021,Danny Welbeck,32,Brighton,Forward
128,2022,Danny Welbeck,32,Brighton,Forward


Unnamed: 0,Year,Name,Age,Team,Position
62,2014,Danny Welbeck,32,Arsenal,Forward
94,2015,Danny Welbeck,32,Arsenal,Forward
321,2016,Danny Welbeck,32,Arsenal,Forward
345,2017,Danny Welbeck,32,Arsenal,Forward
376,2018,Danny Welbeck,32,Arsenal,Forward
380,2019,Danny Welbeck,32,Watford,Forward
231,2020,Danny Welbeck,32,Brighton,Forward
216,2021,Danny Welbeck,32,Brighton,Forward
128,2022,Danny Welbeck,32,Brighton,Forward


In [15]:
# 년도, 이름, 팀, 포지션이 정리 된 상태
player_table

Unnamed: 0,Year,Name,Age,Team,Position
0,2014,Eden Hazard,32,Chelsea,Forward
1,2014,Alexis Sanchez,34,Arsenal,Forward
3,2014,Cesc Fabregas,36,Chelsea,Midfielder
4,2014,Santi Cazorla,38,Arsenal,Midfielder
5,2014,Mesut Ozil,34,Arsenal,Midfielder
...,...,...,...,...,...
564,2022,Dexter Lembikisa,19,Wolves,Defender
565,2022,Cameron Peupion,20,Brighton,Midfielder
566,2022,Andrew Moran,19,Brighton,Midfielder
567,2022,Shane Duffy,31,Fulham,Defender


In [16]:
player_table = player_table.rename(columns = {'Age' : 'Curr_Age'})

In [17]:
# 년도 기준 나이 데이터 생성
player_table['Age'] = player_table['Curr_Age'] - (2023 - player_table['Year'])

In [18]:
# 이름 별로 고유번호 부여하기

# 총 이름 갯수
len(player_table['Name'].unique())

1712

In [19]:
# 이름 순으로 정렬 후 동일한 고유 번호 부여
# 총 이름 갯수와 마지막 고유번호가 동일한 것을 확인할 수 있음
rank_table = pd.DataFrame(player_table['Name'].sort_values())
rank_table['No.'] = rank_table.rank(method = 'dense')
rank_table = rank_table.groupby('Name')['No.'].mean()
rank_df = pd.DataFrame(rank_table.astype('int'))
rank_df

Unnamed: 0_level_0,No.
Name,Unnamed: 1_level_1
Aaron Connolly,1
Aaron Cresswell,2
Aaron Hickey,3
Aaron Lennon,4
Aaron Mooy,5
...,...
Yun Suk-Young,1708
Yves Bissouma,1709
Zack Steffen,1710
Zanka,1711


In [20]:
# 선수 테이블에 고유번호 병합
player_table = pd.merge(player_table, rank_df, how = 'left', left_on = 'Name', right_on = 'Name')
player_table

Unnamed: 0,Year,Name,Curr_Age,Team,Position,Age,No.
0,2014,Eden Hazard,32,Chelsea,Forward,23,444
1,2014,Alexis Sanchez,34,Arsenal,Forward,25,73
2,2014,Cesc Fabregas,36,Chelsea,Midfielder,27,260
3,2014,Santi Cazorla,38,Arsenal,Midfielder,29,1462
4,2014,Mesut Ozil,34,Arsenal,Midfielder,25,1159
...,...,...,...,...,...,...,...
4737,2022,Dexter Lembikisa,19,Wolves,Defender,18,407
4738,2022,Cameron Peupion,20,Brighton,Midfielder,19,246
4739,2022,Andrew Moran,19,Brighton,Midfielder,18,109
4740,2022,Shane Duffy,31,Fulham,Defender,30,1497


In [21]:
# 년도 마다 동일한 선수에 동일한 고유번호가 부여 되었음을 확인
player_table[player_table['Name'] == 'Reece James']

Unnamed: 0,Year,Name,Curr_Age,Team,Position,Age,No.
2711,2019,Reece James,23,Chelsea,Defender,19,1363
3209,2020,Reece James,23,Chelsea,Defender,20,1363
3672,2021,Reece James,23,Chelsea,Defender,21,1363
4212,2022,Reece James,23,Chelsea,Defender,22,1363


In [22]:
# 컬럼 순서 변경 및 정리

player_table = player_table[['No.', 'Year', 'Name', 'Age', 'Team', 'Position']]
player_table

Unnamed: 0,No.,Year,Name,Age,Team,Position
0,444,2014,Eden Hazard,23,Chelsea,Forward
1,73,2014,Alexis Sanchez,25,Arsenal,Forward
2,260,2014,Cesc Fabregas,27,Chelsea,Midfielder
3,1462,2014,Santi Cazorla,29,Arsenal,Midfielder
4,1159,2014,Mesut Ozil,25,Arsenal,Midfielder
...,...,...,...,...,...,...
4737,407,2022,Dexter Lembikisa,18,Wolves,Defender
4738,246,2022,Cameron Peupion,19,Brighton,Midfielder
4739,109,2022,Andrew Moran,18,Brighton,Midfielder
4740,1497,2022,Shane Duffy,30,Fulham,Defender


In [23]:
# 년도 별로 csv 파일 저장
start_year, end_year = 2014, 2022

for year in range(start_year, end_year+1):
    table = player_table[player_table['Year'] == year]
    display(table)
    table.to_csv(path_or_buf = f'./data/new/players_{year}.csv', encoding = 'utf-8')

Unnamed: 0,No.,Year,Name,Age,Team,Position
0,444,2014,Eden Hazard,23,Chelsea,Forward
1,73,2014,Alexis Sanchez,25,Arsenal,Forward
2,260,2014,Cesc Fabregas,27,Chelsea,Midfielder
3,1462,2014,Santi Cazorla,29,Arsenal,Midfielder
4,1159,2014,Mesut Ozil,25,Arsenal,Midfielder
...,...,...,...,...,...,...
530,210,2014,Brandon Comley,18,QPR,Midfielder
531,1636,2014,Valentin Roberge,27,Sunderland,Defender
532,562,2014,Gary Taylor-Fletcher,33,Leicester,Midfielder
533,972,2014,Lee Chung-Yong,26,Crystal Palace,Midfielder


Unnamed: 0,No.,Year,Name,Age,Team,Position
535,1386,2015,Riyad Mahrez,24,Leicester,Forward
536,418,2015,Dimitri Payet,28,West Ham,Midfielder
537,1204,2015,Mousa Dembele,28,Tottenham,Midfielder
538,1159,2015,Mesut Ozil,26,Arsenal,Midfielder
539,1209,2015,N'Golo Kante,24,Leicester,Midfielder
...,...,...,...,...,...,...
1077,1566,2015,Tammy Abraham,17,Chelsea,Forward
1078,660,2015,Ivan Toney,19,Newcastle,Forward
1079,630,2015,Hiram Boateng,19,Crystal Palace,Midfielder
1080,16,2015,Adam Bogdan,27,Liverpool,Goalkeeper


Unnamed: 0,No.,Year,Name,Age,Team,Position
1082,862,2016,Josh Harrop,20,Man Utd,Midfielder
1083,1494,2016,Sergio Romero,29,Man Utd,Goalkeeper
1084,444,2016,Eden Hazard,25,Chelsea,Forward
1085,73,2016,Alexis Sanchez,27,Arsenal,Forward
1086,1313,2016,Paul Pogba,23,Man Utd,Midfielder
...,...,...,...,...,...,...
1606,1109,2016,Matej Vydra,24,Watford,Forward
1607,175,2016,Ben Watson,31,Watford,Midfielder
1608,1674,2016,Will Keane,23,Hull,Forward
1609,697,2016,James Husband,22,Middlesbrough,Defender


Unnamed: 0,No.,Year,Name,Age,Team,Position
1611,1336,2017,Philippe Coutinho,25,Liverpool,Midfielder
1612,1490,2017,Sergio Aguero,29,Man City,Forward
1613,922,2017,Kevin De Bruyne,26,Man City,Midfielder
1614,627,2017,Henri Saivet,26,Newcastle,Midfielder
1615,1193,2017,Mohamed Salah,25,Liverpool,Forward
...,...,...,...,...,...,...
2122,882,2017,Julien Ngoy,19,Stoke,Forward
2123,1167,2017,Michael Hefele,26,Huddersfield,Defender
2124,425,2017,Divock Origi,22,Liverpool,Forward
2125,868,2017,Josh Tymon,18,Stoke,Defender


Unnamed: 0,No.,Year,Name,Age,Team,Position
2127,444,2018,Eden Hazard,27,Chelsea,Forward
2128,1348,2018,Raheem Sterling,23,Man City,Forward
2129,1490,2018,Sergio Aguero,30,Man City,Forward
2130,1650,2018,Virgil van Dijk,27,Liverpool,Defender
2131,1437,2018,Sadio Mane,26,Liverpool,Forward
...,...,...,...,...,...,...
2619,1138,2018,Matty Daly,17,Huddersfield,Midfielder
2620,1644,2018,Victor Moses,27,Chelsea,Midfielder
2621,160,2018,Bakary Sako,30,Crystal Palace,Forward
2622,8,2018,Aaron Rowe,17,Huddersfield,Midfielder


Unnamed: 0,No.,Year,Name,Age,Team,Position
2624,922,2019,Kevin De Bruyne,28,Man City,Midfielder
2625,222,2019,Bruno Fernandes,24,Man Utd,Midfielder
2626,28,2019,Adama Traore,23,Wolves,Forward
2627,1386,2019,Riyad Mahrez,28,Man City,Forward
2628,1437,2019,Sadio Mane,27,Liverpool,Forward
...,...,...,...,...,...,...
3130,982,2019,Leon Clarke,34,Sheff Utd,Forward
3131,353,2019,Danny Drinkwater,29,Aston Villa,Midfielder
3132,201,2019,Borja Baston,27,Aston Villa,Forward
3133,1514,2019,Simon Moore,29,Sheff Utd,Goalkeeper


Unnamed: 0,No.,Year,Name,Age,Team,Position
3135,613,2020,Harry Kane,27,Tottenham,Forward
3136,922,2020,Kevin De Bruyne,29,Man City,Midfielder
3137,222,2020,Bruno Fernandes,25,Man Utd,Midfielder
3138,1610,2020,Tomas Soucek,25,West Ham,Midfielder
3139,1107,2020,Mason Mount,21,Chelsea,Midfielder
...,...,...,...,...,...,...
3649,81,2020,Allan Tchaptchet,18,Southampton,Defender
3650,1262,2020,Odion Ighalo,31,Man Utd,Forward
3651,68,2020,Alexandre Jankewitz,18,Southampton,Midfielder
3652,1684,2020,Willy Caballero,38,Chelsea,Goalkeeper


Unnamed: 0,No.,Year,Name,Age,Team,Position
3654,922,2021,Kevin De Bruyne,30,Man City,Midfielder
3655,1193,2021,Mohamed Salah,29,Liverpool,Forward
3656,1522,2021,Son Heung-Min,29,Tottenham,Forward
3657,915,2021,Kenedy,25,Chelsea,Midfielder
3658,613,2021,Harry Kane,28,Tottenham,Forward
...,...,...,...,...,...,...
4182,543,2021,Freddie Woodman,24,Newcastle,Goalkeeper
4183,1474,2021,Sead Kolasinac,28,Arsenal,Defender
4184,652,2021,Isaac Price,17,Everton,Midfielder
4185,814,2021,Jonas Lossl,32,Brentford,Goalkeeper


Unnamed: 0,No.,Year,Name,Age,Team,Position
4187,151,2022,Asmir Begovic,35,Everton,Goalkeeper
4188,922,2022,Kevin De Bruyne,31,Man City,Midfielder
4189,486,2022,Erling Haaland,22,Man City,Forward
4190,613,2022,Harry Kane,29,Tottenham,Forward
4191,222,2022,Bruno Fernandes,27,Man Utd,Midfielder
...,...,...,...,...,...,...
4737,407,2022,Dexter Lembikisa,18,Wolves,Defender
4738,246,2022,Cameron Peupion,19,Brighton,Midfielder
4739,109,2022,Andrew Moran,18,Brighton,Midfielder
4740,1497,2022,Shane Duffy,30,Fulham,Defender
