# 爬取数据

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Section titles on the HKJC "select horse by name length" page, mapped to the
# value sent as the `ordertype` query parameter.
horse_types = {
    '二字馬': '2',
    '三字馬': '3',
    '四字馬': '4'
}

# Output locations for the two CSV files (change to your own desktop path).
desktop_path_info = "/Users/Keanu/Desktop/horses_info.csv"
desktop_path_records = "/Users/Keanu/Desktop/horses_race_records.csv"

# Accumulators filled while crawling: one dict per horse / per race record.
all_horse_info = []
all_race_records = []

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

# Column names of the race-record table, in cell order. The first 17 cells
# are required; the last two are filled with '' when the row is shorter.
RACE_RECORD_FIELDS = (
    '场次', '名次', '日期', '赛道', '途程', '场地状况', '赛事班次', '档位',
    '评分', '练马师', '骑师', '头马距离', '独赢谱率', '实际负磅', '沿途走位',
    '完成时间', '排位体重', '配备', '赛事重播',
)
MIN_RACE_CELLS = 17


def _fetch(url):
    """Return the response for *url*, or None when the request itself fails."""
    try:
        return requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Skip this page instead of losing everything collected so far.
        print(f'请求失败，已跳过: {url} ({exc})')
        return None


def _collect_horse_links(soup, type_label):
    """Return (name, url) pairs found in the table after the *type_label* header."""
    section = soup.find('td', class_='subheader', string=lambda text: text and type_label in text)
    if section is None:
        return []
    inner_table = section.find_next('table')
    if inner_table is None:
        return []

    links = []
    for row in inner_table.find_all('tr'):
        for cell in row.find_all('td'):
            link = cell.find('a')
            # Anchors without an href cannot be followed; ignore them.
            if link is not None and link.get('href'):
                links.append((link.get_text(strip=True), 'https://racing.hkjc.com' + link['href']))
    return links


def _parse_horse_info(horse_soup, horse_name):
    """Return the key/value pairs of the horse's profile tables as one dict."""
    horse_info = {'马匹名称': horse_name}
    for detail in horse_soup.find_all('table', class_='table_top_right table_eng_text'):
        for row in detail.find_all('tr'):
            cols = row.find_all('td')
            # Rows are read as (label, <middle cell>, value); shorter rows are skipped.
            if len(cols) >= 3:
                horse_info[cols[0].get_text(strip=True)] = cols[2].get_text(strip=True)
    return horse_info


def _parse_race_records(horse_soup, horse_name):
    """Return one dict per usable row of the horse's race-record table."""
    performance_table = horse_soup.find('table', class_='bigborder')
    if performance_table is None:
        return []

    records = []
    # The first two rows are skipped as table headers.
    for record in performance_table.find_all('tr')[2:]:
        cells = record.find_all('td')
        if len(cells) < MIN_RACE_CELLS:
            continue
        texts = [cell.get_text(strip=True) for cell in cells[:len(RACE_RECORD_FIELDS)]]
        # Pad the optional trailing columns so every record has the same keys.
        texts.extend([''] * (len(RACE_RECORD_FIELDS) - len(texts)))
        race_info = {'马匹名称': horse_name}
        race_info.update(zip(RACE_RECORD_FIELDS, texts))
        records.append(race_info)
    return records


# Crawl every listing page. The section label gets its own name (`type_label`)
# so the per-horse `horse_name` below cannot overwrite it before it is printed.
for type_label, ordertype in horse_types.items():
    trainer_url = f'https://racing.hkjc.com/racing/information/chinese/Horse/SelectHorsebyChar.aspx?ordertype={ordertype}'

    response = _fetch(trainer_url)
    if response is None or response.status_code != 200:
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Debug aid: show the first 500 characters to confirm the page loaded.
    print(soup.prettify()[:500])

    horse_links = _collect_horse_links(soup, type_label)
    print(f'{type_label} 找到的马匹数量: {len(horse_links)}')

    if not horse_links:
        continue

    # Visit each horse once per listing page, even if it is linked repeatedly.
    unique_horse_names = set()
    for horse_name, horse_url in horse_links:
        if horse_name in unique_horse_names:
            continue
        unique_horse_names.add(horse_name)

        horse_response = _fetch(horse_url)
        if horse_response is not None and horse_response.status_code == 200:
            horse_soup = BeautifulSoup(horse_response.text, 'html.parser')
            all_horse_info.append(_parse_horse_info(horse_soup, horse_name))
            all_race_records.extend(_parse_race_records(horse_soup, horse_name))

        # Pause between horse pages to avoid hammering the server.
        time.sleep(1)

# Convert the collected dicts to tables and write them out as CSV.
all_horse_info_df = pd.DataFrame(all_horse_info)
all_race_records_df = pd.DataFrame(all_race_records)

all_horse_info_df.to_csv(desktop_path_info, index=False)
all_race_records_df.to_csv(desktop_path_records, index=False)

print("数据已成功保存到桌面！")


# Step1. 数据格式转换

In [None]:
import pandas as pd

# Load the raw horse table from CSV.
file_path = '/Users/Keanu/Desktop/racing_data/csv/HK_Horses.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

def process_race_counts(race_counts_str):
    """Split a "wins-seconds-thirds-total" summary string into four integers.

    Used on values of the '冠-亞-季-總出賽次數*' column.

    Args:
        race_counts_str: A string of four '-'-separated integers, such as
            "1-2-3-10". Any '*' in the last field is removed before parsing.

    Returns:
        A ``(wins, seconds, thirds, total_races)`` tuple of ints. Missing
        values, non-string values, strings without exactly four fields, and
        strings with non-numeric fields all yield ``(0, 0, 0, 0)``.
    """
    default = (0, 0, 0, 0)

    # Covers NaN/None as well as values pandas parsed as numbers, which have
    # no .split() and would otherwise raise AttributeError.
    if not isinstance(race_counts_str, str):
        return default

    parts = race_counts_str.split('-')
    if len(parts) != 4:
        return default

    parts[3] = parts[3].replace("*", "")
    try:
        wins, seconds, thirds, total_races = (int(part) for part in parts)
    except ValueError:
        return default
    return wins, seconds, thirds, total_races


# Expand the combined '冠-亞-季-總出賽次數*' column into four integer columns.
# Parsing into a list of tuples and building one DataFrame avoids creating a
# pd.Series per row; passing `columns` keeps the four target columns
# well-defined even when `df` has no rows.
count_columns = ['wins', 'seconds', 'thirds', 'total_races']
parsed_counts = [process_race_counts(value) for value in df['冠-亞-季-總出賽次數*']]
df[count_columns] = pd.DataFrame(parsed_counts, columns=count_columns, index=df.index)

# The combined column is redundant once it has been split.
df = df.drop('冠-亞-季-總出賽次數*', axis=1)

# Write the converted table to a new CSV file.
new_file_path = '/Users/Keanu/Desktop/processed_data.csv'
df.to_csv(new_file_path, index=False)

print(f"Processed data saved to {new_file_path}")

# Step2. 处理缺失值

In [None]:
import pandas as pd

# Read the table written by the format-conversion step.
file_path = '/Users/Keanu/Desktop/processed_data.csv'
data = pd.read_csv(file_path)

# Count the missing entries per column and print the summary.
missing_values = data.isna().sum()
print("Missing values in each column:\n", missing_values)

# Keep only the rows in which at least one column is missing.
rows_with_missing = data.loc[data.isna().any(axis=1)]

# Write those rows to their own CSV file.
rows_with_missing.to_csv('/Users/Keanu/Desktop/rows_with_missing_values.csv', index=False)

print("Rows with missing values have been saved as 'rows_with_missing_values.csv'")

# Step3. 清洗数据

In [None]:
import pandas as pd

# Read the table written by the format-conversion step.
file_path = '/Users/Keanu/Desktop/processed_data.csv'
data = pd.read_csv(file_path)

# Drop the '自購馬來港前賽事片段' column first, then discard every row that
# still has a missing value in any remaining column.
data = data.drop('自購馬來港前賽事片段', axis=1).dropna()

# Write the cleaned table to a new CSV file.
output_file_path = '/Users/Keanu/Desktop/cleaned_data.csv'
data.to_csv(output_file_path, index=False)

# Print the per-column missing-value counts of the cleaned table.
missing_values = data.isna().sum()
print("Missing values in each column:\n", missing_values)
