# **Nhiệm vụ 1**

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import os
import re

# Paths to the raw-input and cleaned-output data directories
RAW_DIR = './data_raw'
CLEAN_DIR = './data_clean'
# Create the cleaned-data directory; exist_ok=True makes re-running this cell safe
os.makedirs(CLEAN_DIR, exist_ok=True)

**1. Chuẩn hóa file team_info.csv**

In [2]:
# Load the three raw CSV tables from RAW_DIR
df_teams = pd.read_csv(f'{RAW_DIR}/team_info.csv')
df_players = pd.read_csv(f'{RAW_DIR}/player_stats.csv')
df_matches = pd.read_csv(f'{RAW_DIR}/match_results.csv')

def normalize_text(df):
    """Rewrite every column of ``df`` as trimmed, lowercase, single-spaced text.

    Each column is cast to ``str``, lowercased, has every run of whitespace
    collapsed to one space, and is stripped of leading/trailing whitespace.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame whose columns are all replaced by their text form.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in df.columns:
        as_text = df[column].astype(str)
        # Whitespace runs are collapsed first; the final strip then removes
        # whatever single space is left at either end.
        single_spaced = as_text.str.lower().str.replace(r'\s+', ' ', regex=True)
        df[column] = single_spaced.str.strip()

    return df

# Replace each table with its text-normalized version (teams, players, matches)
df_teams, df_players, df_matches = (
    normalize_text(frame) for frame in (df_teams, df_players, df_matches)
)

# Clean the team table: expand known aliases and title-case every word
def clean_team_info(df):
    """Title-case every text cell of ``df`` and expand known aliases.

    Each string cell is split into runs of word characters (punctuation is
    dropped). Every word found in the alias table is replaced by its
    canonical spelling, every other word is capitalized, and the words are
    re-joined with single spaces. Non-string cells are left untouched.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding the team information.

    Returns:
        The same DataFrame object, with its string cells cleaned.
    """
    # Canonical spellings are stored already cased and emitted verbatim.
    # Passing them through ``str.capitalize`` would turn 'HCM' into 'Hcm'
    # and 'Thanh Pho' into 'Thanh pho'.
    canonical_words = {
        'tp': 'Thanh Pho',
        'hcm': 'HCM',
        'hà': 'Ha',
        'nội': 'Noi',
        'đà': 'Da',
        'nẵng': 'Nang',
    }

    def format_word(word):
        canonical = canonical_words.get(word.lower())
        return word.capitalize() if canonical is None else canonical

    def normalize_name(value):
        # Leave missing or numeric cells alone instead of failing in re.findall.
        if not isinstance(value, str):
            return value
        return ' '.join(format_word(word) for word in re.findall(r'\w+', value))

    for column in df.columns:
        df[column] = df[column].apply(normalize_name)
    return df

# Replace the team table with its cleaned version
df_teams = clean_team_info(df_teams)
# Export the cleaned table (index=False keeps the row index out of the CSV)
df_teams.to_csv(f"{CLEAN_DIR}/team_info_clean.csv", index=False)


**2. Chuẩn hóa file match_results.csv**

In [3]:
# 1. Helper that cleans a goals column
def clean_goals(series):
    """Convert a column of goal counts to non-negative integers.

    Accepted inputs per value (surrounding whitespace and case are ignored):

    * English number words from ``'zero'`` to ``'ten'``;
    * integer text such as ``'3'``;
    * integral float text such as ``'2.0'``, which is how pandas renders
      counts from a numeric column that contained missing values.

    Negative counts are clamped to 0. Anything else (missing values,
    non-integral numbers, arbitrary text) becomes 0.

    Args:
        series: Series of raw goal values (text or numbers).

    Returns:
        An integer Series with the same index as ``series``.
    """
    number_map = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    }

    def to_number_value(value):
        text = str(value).strip().lower()
        if text in number_map:
            return number_map[text]
        try:
            number = int(text)
        except ValueError:
            # Fall back to float parsing so '2.0' is read as 2 instead of
            # being discarded; NaN, inf and fractions are still rejected.
            try:
                as_float = float(text)
            except ValueError:
                return 0
            if not as_float.is_integer():
                return 0
            number = int(as_float)
        return max(0, number)

    return series.apply(to_number_value).astype(int)

# Replace both goals columns of the match table with their cleaned versions
for goals_column in ('home_goals', 'away_goals'):
    df_matches[goals_column] = clean_goals(df_matches[goals_column])

# 2. Helper that normalizes the match_date column
def normalize_date(series):
    """Normalize date strings to ISO ``YYYY-MM-DD`` text.

    Two input layouts are recognized, tried in this order:
    ``YYYY-MM-DD`` and ``DD/MM/YYYY``. Surrounding whitespace is ignored.

    Values that are not strings or match neither layout are returned as
    ``NaN``, so they stay real missing values rather than the text ``'nan'``.

    Args:
        series: Series of raw date values.

    Returns:
        A Series with the same index, holding ``'YYYY-MM-DD'`` strings or
        ``NaN`` for values that could not be parsed.
    """
    accepted_formats = ('%Y-%m-%d', '%d/%m/%Y')

    def parse_date(value):
        # strptime raises TypeError on non-strings (e.g. NaN); treat as missing.
        if not isinstance(value, str):
            return np.nan
        text = value.strip()
        for date_format in accepted_formats:
            try:
                return datetime.strptime(text, date_format).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return np.nan

    return series.apply(parse_date)

# Replace the match_date column with its normalized version
df_matches['match_date'] = normalize_date(df_matches['match_date'])

def to_title(s):
    """Return ``s`` converted to title case via ``str.title``."""
    titled = s.title()
    return titled

# Title-case the identifier columns of the match table
for id_column in ('match_id', 'home_team_id', 'away_team_id'):
    df_matches[id_column] = df_matches[id_column].apply(to_title)

# 3. Export the cleaned table
df_matches.to_csv(f"{CLEAN_DIR}/match_results_clean.csv", index=False)


**3. Chuẩn hóa file player_stats.csv**

In [4]:
# Title-case ids and names, upper-case positions, and clean the goal counts
for title_column in ('player_id', 'team_id', 'player_name'):
    df_players[title_column] = df_players[title_column].apply(to_title)
df_players['position'] = df_players['position'].apply(lambda pos: pos.upper())
df_players['goals'] = clean_goals(df_players['goals'])

# Export the cleaned table
df_players.to_csv(f"{CLEAN_DIR}/player_stats_clean.csv", index=False)