In [1]:
import pandas as pd

# Đọc file CSV
df_matches = pd.read_csv('data_crawl/match_all_season.csv')

# Xóa ký hiệu % và chuyển thành số
df_matches['possession_time_home'] = df_matches['possession_time_home'].str.rstrip('%').astype('float') / 100
df_matches['possession_time_away'] = df_matches['possession_time_away'].str.rstrip('%').astype('float') / 100


# Lấy danh sách tất cả các cột trừ 'home_name' và 'away_name'
cols_to_convert = df_matches.columns.difference(['home_name', 'away_name', 'date'])


# Áp dụng chuyển đổi các cột này sang kiểu số, lỗi sẽ được chuyển thành NaN nếu có
df_matches[cols_to_convert] = df_matches[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Kiểm tra lại kiểu dữ liệu các cột sau khi chuyển đổi
print(df_matches.dtypes)



home_name                object
away_name                object
home_score              float64
away_score              float64
shots_home              float64
shots_away              float64
shots_on_goal_home      float64
shots_on_goal_away      float64
passes_home             float64
passes_away             float64
accurate_passes_home    float64
accurate_passes_away    float64
fouls_home              float64
fouls_away              float64
yellow_cards_home       float64
yellow_cards_away       float64
possession_time_home    float64
possession_time_away    float64
red_cards_home          float64
red_cards_away          float64
date                     object
dtype: object


In [2]:
df_matches

Unnamed: 0,home_name,away_name,home_score,away_score,shots_home,shots_away,shots_on_goal_home,shots_on_goal_away,passes_home,passes_away,...,accurate_passes_away,fouls_home,fouls_away,yellow_cards_home,yellow_cards_away,possession_time_home,possession_time_away,red_cards_home,red_cards_away,date
0,Manchester United,Chelsea,4.0,0.0,11.0,18.0,5.0,7.0,449.0,523.0,...,442.0,15.0,13.0,3.0,4.0,0.46,0.54,,,08.11.2019 15:30
1,Leicester City,Wolverhampton Wanderers,0.0,0.0,15.0,8.0,1.0,2.0,666.0,290.0,...,202.0,3.0,13.0,0.0,2.0,0.70,0.30,,,08.11.2019 13:00
2,Newcastle United,Arsenal,0.0,1.0,9.0,8.0,2.0,2.0,380.0,616.0,...,524.0,12.0,7.0,1.0,3.0,0.38,0.62,,,08.11.2019 13:00
3,Tottenham Hotspur,Aston Villa,3.0,1.0,31.0,7.0,7.0,4.0,587.0,262.0,...,200.0,13.0,9.0,1.0,0.0,0.69,0.31,,,08.10.2019 16:30
4,AFC Bournemouth,Sheffield United,1.0,1.0,13.0,8.0,3.0,3.0,487.0,429.0,...,332.0,10.0,19.0,2.0,1.0,0.53,0.47,,,08.10.2019 14:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,Crystal Palace,Aston Villa,5.0,0.0,15.0,8.0,9.0,2.0,552.0,463.0,...,401.0,10.0,8.0,1.0,4.0,0.54,0.46,,,05.19.2024 15:00
1890,Liverpool,Wolverhampton Wanderers,2.0,0.0,36.0,4.0,14.0,3.0,621.0,303.0,...,231.0,14.0,10.0,1.0,1.0,0.67,0.33,0.0,1.0,05.19.2024 15:00
1891,Luton Town,Fulham,2.0,4.0,15.0,16.0,6.0,8.0,304.0,460.0,...,391.0,15.0,20.0,5.0,4.0,0.40,0.60,,,05.19.2024 15:00
1892,Manchester City,West Ham United,3.0,1.0,28.0,3.0,12.0,2.0,782.0,307.0,...,246.0,3.0,12.0,0.0,1.0,0.71,0.29,,,05.19.2024 15:00


In [3]:
# check 2 cột red_cards_home và red_cards_away nếu có null thì cho là số 0
df_matches['red_cards_home'] = df_matches['red_cards_home'].fillna(0)
df_matches['red_cards_away'] = df_matches['red_cards_away'].fillna(0)
df_matches['yellow_cards_home'] = df_matches['yellow_cards_home'].fillna(0)
df_matches['yellow_cards_away'] = df_matches['yellow_cards_away'].fillna(0)

In [4]:
# Fill các giá trị null bằng giá trị trung bình của từng cột số
numeric_cols = df_matches.select_dtypes(include=['float64', 'int64']).columns

# Điền giá trị trung bình cho các cột có kiểu số
for col in numeric_cols:
    df_matches[col] = df_matches[col].fillna(df_matches[col].mean())

# Kiểm tra lại các dòng còn null (nếu có)
print(df_matches.isnull().sum())


home_name               0
away_name               0
home_score              0
away_score              0
shots_home              0
shots_away              0
shots_on_goal_home      0
shots_on_goal_away      0
passes_home             0
passes_away             0
accurate_passes_home    0
accurate_passes_away    0
fouls_home              0
fouls_away              0
yellow_cards_home       0
yellow_cards_away       0
possession_time_home    0
possession_time_away    0
red_cards_home          0
red_cards_away          0
date                    0
dtype: int64


In [5]:
# Xóa phần giờ và chỉ giữ lại ngày tháng
df_matches['date'] = df_matches['date'].str.split(' ').str[0]

# Chuyển cột 'date' thành kiểu datetime, bỏ qua những giá trị không hợp lệ
df_matches['date'] = pd.to_datetime(df_matches['date'], format='%d.%m.%Y', errors='coerce')

# Thay thế NaT bằng ngày của dòng trước đó
df_matches['date'] = df_matches['date'].fillna(method='ffill')

# Đổi định dạng sang DD-MM-YYYY
df_matches['date'] = df_matches['date'].dt.strftime('%d-%m-%Y')

df_matches


  df_matches['date'] = df_matches['date'].fillna(method='ffill')


Unnamed: 0,home_name,away_name,home_score,away_score,shots_home,shots_away,shots_on_goal_home,shots_on_goal_away,passes_home,passes_away,...,accurate_passes_away,fouls_home,fouls_away,yellow_cards_home,yellow_cards_away,possession_time_home,possession_time_away,red_cards_home,red_cards_away,date
0,Manchester United,Chelsea,4.0,0.0,11.0,18.0,5.0,7.0,449.0,523.0,...,442.0,15.0,13.0,3.0,4.0,0.46,0.54,0.0,0.0,08-11-2019
1,Leicester City,Wolverhampton Wanderers,0.0,0.0,15.0,8.0,1.0,2.0,666.0,290.0,...,202.0,3.0,13.0,0.0,2.0,0.70,0.30,0.0,0.0,08-11-2019
2,Newcastle United,Arsenal,0.0,1.0,9.0,8.0,2.0,2.0,380.0,616.0,...,524.0,12.0,7.0,1.0,3.0,0.38,0.62,0.0,0.0,08-11-2019
3,Tottenham Hotspur,Aston Villa,3.0,1.0,31.0,7.0,7.0,4.0,587.0,262.0,...,200.0,13.0,9.0,1.0,0.0,0.69,0.31,0.0,0.0,08-10-2019
4,AFC Bournemouth,Sheffield United,1.0,1.0,13.0,8.0,3.0,3.0,487.0,429.0,...,332.0,10.0,19.0,2.0,1.0,0.53,0.47,0.0,0.0,08-10-2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,Crystal Palace,Aston Villa,5.0,0.0,15.0,8.0,9.0,2.0,552.0,463.0,...,401.0,10.0,8.0,1.0,4.0,0.54,0.46,0.0,0.0,05-11-2024
1890,Liverpool,Wolverhampton Wanderers,2.0,0.0,36.0,4.0,14.0,3.0,621.0,303.0,...,231.0,14.0,10.0,1.0,1.0,0.67,0.33,0.0,1.0,05-11-2024
1891,Luton Town,Fulham,2.0,4.0,15.0,16.0,6.0,8.0,304.0,460.0,...,391.0,15.0,20.0,5.0,4.0,0.40,0.60,0.0,0.0,05-11-2024
1892,Manchester City,West Ham United,3.0,1.0,28.0,3.0,12.0,2.0,782.0,307.0,...,246.0,3.0,12.0,0.0,1.0,0.71,0.29,0.0,0.0,05-11-2024


# **tạo các quan hệ**

# **bộ points**

In [6]:
import pandas as pd

# Đọc file CSV
df = pd.read_csv('data_crawl/points_all_season.csv')

# Tạo bảng Teams (danh sách đội bóng duy nhất)
teams_df = df[['TeamName']].drop_duplicates().reset_index(drop=True)
teams_df['TeamID'] = teams_df.index + 1

print(teams_df.head())

# Lưu bảng Teams vào file CSV
teams_df.to_csv('data_clean/teams.csv', index=False)

            TeamName  TeamID
0          Liverpool       1
1    Manchester City       2
2  Manchester United       3
3            Chelsea       4
4     Leicester City       5


In [7]:
# Tạo bảng Seasons (danh sách các mùa giải duy nhất)
seasons_df = df[['Year']].drop_duplicates().reset_index(drop=True)
seasons_df['SeasonID'] = seasons_df.index + 1
seasons_df

# Lưu bảng Seasons vào file CSV
seasons_df.to_csv('data_clean/seasons.csv', index=False)

In [8]:
# Kết hợp TeamID vào DataFrame chính
df = df.merge(teams_df, on='TeamName', how='left')

# Kết hợp SeasonID vào DataFrame chính
df = df.merge(seasons_df, on='Year', how='left')

# Tạo bảng TeamStats với các giá trị thống kê
team_stats_df = df[['TeamID', 'SeasonID', 'Played', 'Won', 'Draw', 'Lost', 'GF', 'GA', 'Goal Difference', 
                    'Points', 'PPG', '% Points']]

print(team_stats_df.head())

# Lưu bảng TeamStats vào file CSV
team_stats_df.to_csv('data_clean/team_stats.csv', index=False)

   TeamID  SeasonID  Played  Won  Draw  Lost   GF  GA  Goal Difference  \
0       1         1      38   32     3     3   85  33               52   
1       2         1      38   26     3     9  102  35               67   
2       3         1      38   18    12     8   66  36               30   
3       4         1      38   20     6    12   69  54               15   
4       5         1      38   18     8    12   67  41               26   

   Points   PPG  % Points  
0      99  2.61     86.84  
1      81  2.13     71.05  
2      66  1.74     57.89  
3      66  1.74     57.89  
4      62  1.63     54.39  


# **bộ match**

In [9]:
# Merge HomeTeamID từ bảng Teams
df_matches = df_matches.merge(teams_df[['TeamID', 'TeamName']], left_on='home_name', right_on='TeamName', how='left')
df_matches.rename(columns={'TeamID': 'HomeTeamID'}, inplace=True)

# Merge AwayTeamID từ bảng Teams
df_matches = df_matches.merge(teams_df[['TeamID', 'TeamName']], left_on='away_name', right_on='TeamName', how='left')
df_matches.rename(columns={'TeamID': 'AwayTeamID'}, inplace=True)

# Xóa các cột không cần thiết sau khi merge
df_matches.drop(columns=['TeamName_x', 'TeamName_y'], inplace=True)


In [11]:
df_matches

Unnamed: 0,home_name,away_name,home_score,away_score,shots_home,shots_away,shots_on_goal_home,shots_on_goal_away,passes_home,passes_away,...,fouls_away,yellow_cards_home,yellow_cards_away,possession_time_home,possession_time_away,red_cards_home,red_cards_away,date,HomeTeamID,AwayTeamID
0,Manchester United,Chelsea,4.0,0.0,11.0,18.0,5.0,7.0,449.0,523.0,...,13.0,3.0,4.0,0.46,0.54,0.0,0.0,08-11-2019,3,4
1,Leicester City,Wolverhampton Wanderers,0.0,0.0,15.0,8.0,1.0,2.0,666.0,290.0,...,13.0,0.0,2.0,0.70,0.30,0.0,0.0,08-11-2019,5,7
2,Newcastle United,Arsenal,0.0,1.0,9.0,8.0,2.0,2.0,380.0,616.0,...,7.0,1.0,3.0,0.38,0.62,0.0,0.0,08-11-2019,13,8
3,Tottenham Hotspur,Aston Villa,3.0,1.0,31.0,7.0,7.0,4.0,587.0,262.0,...,9.0,1.0,0.0,0.69,0.31,0.0,0.0,08-10-2019,6,17
4,AFC Bournemouth,Sheffield United,1.0,1.0,13.0,8.0,3.0,3.0,487.0,429.0,...,19.0,2.0,1.0,0.53,0.47,0.0,0.0,08-10-2019,18,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,Crystal Palace,Aston Villa,5.0,0.0,15.0,8.0,9.0,2.0,552.0,463.0,...,8.0,1.0,4.0,0.54,0.46,0.0,0.0,05-11-2024,14,17
1890,Liverpool,Wolverhampton Wanderers,2.0,0.0,36.0,4.0,14.0,3.0,621.0,303.0,...,10.0,1.0,1.0,0.67,0.33,0.0,1.0,05-11-2024,1,7
1891,Luton Town,Fulham,2.0,4.0,15.0,16.0,6.0,8.0,304.0,460.0,...,20.0,5.0,4.0,0.40,0.60,0.0,0.0,05-11-2024,26,22
1892,Manchester City,West Ham United,3.0,1.0,28.0,3.0,12.0,2.0,782.0,307.0,...,12.0,0.0,1.0,0.71,0.29,0.0,0.0,05-11-2024,2,16


In [12]:
# 2. Đánh nhãn cho kết quả trận đấu dựa vào HomeScore và AwayScore
def assign_label(row):
    if row['home_score'] > row['away_score']:
        return 1  # Đội nhà thắng
    elif row['home_score'] < row['away_score']:
        return -1  # Đội nhà thua
    else:
        return 0  # Hòa

# Áp dụng hàm để tạo cột 'label'
df_matches['label'] = df_matches.apply(assign_label, axis=1)


In [14]:
df_matches

#ghi ra file csv
df_matches.to_csv('data_clean/matches.csv', index=False)