In [18]:
import pandas as pd
import numpy as np

In [19]:
athlete_data = pd.read_csv("summerOly_athletes.csv")
no_medal_countries = pd.read_csv('No_Medal_Countries.csv') 
no_medal_countries = no_medal_countries[no_medal_countries['0'] == 0].loc[:, 'NOC']

athlete_data['NOC'] = athlete_data['NOC'].replace('URS', 'RUS')
athlete_data['NOC'] = athlete_data['NOC'].replace('ROC', 'RUS')
athlete_data['NOC'] = athlete_data['NOC'].replace('AIN', 'RUS')
athlete_data = athlete_data[athlete_data['NOC'] != 'RUS']
athlete_data = athlete_data[athlete_data['NOC'] != 'GDR']
athlete_data = athlete_data[athlete_data['NOC'] != 'FRG']
athlete_data = athlete_data[athlete_data['NOC'] != 'EUN']
athlete_data = athlete_data[athlete_data['NOC'] != 'UNK']
athlete_data = athlete_data[athlete_data['NOC'] != 'YMD']
athlete_data = athlete_data[athlete_data['NOC'] != 'YAR']
athlete_data = athlete_data[athlete_data['NOC'] != 'VNM']
athlete_data = athlete_data[athlete_data['NOC'] != 'SAA']
athlete_data = athlete_data[athlete_data['NOC'] != 'RHO']
athlete_data = athlete_data[athlete_data['NOC'] != 'ROT']
athlete_data = athlete_data[athlete_data['NOC'] != 'NFL']
athlete_data = athlete_data[athlete_data['NOC'] != 'NBO']
athlete_data = athlete_data[athlete_data['NOC'] != 'MAL']
athlete_data = athlete_data[athlete_data['NOC'] != 'CRT']

athlete_data = athlete_data.drop_duplicates(subset=['NOC', 'Year', 'Event', 'Medal'])

athlete_data.head(30)

Unnamed: 0,Name,Sex,Team,NOC,Year,City,Sport,Event,Medal
0,A Dijiang,M,China,CHN,1992,Barcelona,Basketball,Basketball Men's Basketball,No medal
1,A Lamusi,M,China,CHN,2012,London,Judo,Judo Men's Extra-Lightweight,No medal
2,Gunnar Aaby,M,Denmark,DEN,1920,Antwerpen,Football,Football Men's Football,No medal
3,Edgar Aabye,M,Denmark/Sweden,DEN,1900,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,Cornelia (-strannood),F,Netherlands,NED,1932,Los Angeles,Athletics,Athletics Women's 100 metres,No medal
5,Cornelia (-strannood),F,Netherlands,NED,1932,Los Angeles,Athletics,Athletics Women's 4 x 100 metres Relay,No medal
6,Einar Aalto,M,Finland,FIN,1952,Helsinki,Swimming,Swimming Men's 400 metres Freestyle,No medal
7,Jyri Aalto,M,Finland,FIN,2000,Sydney,Badminton,Badminton Men's Singles,No medal
8,Minna Aalto,F,Finland,FIN,1996,Atlanta,Sailing,Sailing Women's Windsurfer,No medal
9,Minna Aalto,F,Finland,FIN,2000,Sydney,Sailing,Sailing Women's Windsurfer,No medal


In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer


# 数据预处理函数
def preprocess_data(athlete_data, no_medal_countries):
    # 获取2000年后首次获得奖牌的国家及其参赛项目
    first_medal_events = find_countries_with_first_medal_after_2000(athlete_data)
    
    # 筛选从未获得过奖牌的国家的数据
    no_medal_participation_counts = filter_no_medal_countries(athlete_data, no_medal_countries)
    
    # 添加新特征：是否参与了2000年后首次获得奖牌的国家的项目
    no_medal_participation_counts = add_participation_feature(no_medal_participation_counts, first_medal_events)
    
    # 筛选之前有相似情况但已经获得过奖牌的国家的数据
    similar_cases = find_similar_cases(athlete_data, no_medal_countries)
    
    if similar_cases.empty:
        print("No similar cases found with medals. Cannot fit the model.")
        return None, None
    
    # 合并未获得奖牌的国家数据和已获得奖牌的国家数据
    combined_data = pd.concat([similar_cases, no_medal_participation_counts], ignore_index=True)
    
    # 特征和标签
    X = combined_data[['ParticipationCount', 'AttemptsBeforeFirstMedal', 'ParticipatedInFirstMedalEventAfter2000']]
    y = combined_data['WonMedal']
    
    return X, y, no_medal_participation_counts

# 1. 筛选之前未获得奖牌但在2000年后首次获得奖牌的国家的数据
def find_countries_with_first_medal_after_2000(athlete_data):
    all_countries = athlete_data['NOC'].unique()
    first_medal_events = {}
    
    for noc in all_countries:
        noc_data = athlete_data[athlete_data['NOC'] == noc]
        medal_data = noc_data[noc_data['Medal'] != 'No medal']
        
        if len(medal_data) > 0:
            first_medal_year = medal_data['Year'].min()
            if first_medal_year >= 2000:
                first_medal_events_this_year = medal_data[medal_data['Year'] == first_medal_year]['Event'].tolist()
                first_medal_events[noc] = {
                    'FirstMedalYear': first_medal_year,
                    'Events': first_medal_events_this_year
                }
    
    return first_medal_events

# 2. 筛选从未获得过奖牌的国家的数据
def filter_no_medal_countries(data, no_medal_countries):
    filtered_data = data[data['NOC'].isin(no_medal_countries)]
    participation_counts = filtered_data.groupby('NOC').size().reset_index(name='ParticipationCount')
    participation_counts['AttemptsBeforeFirstMedal'] = participation_counts['ParticipationCount']
    participation_counts['WonMedal'] = 0
    
    # 确保保留 'Event' 列
    participation_counts = participation_counts.merge(filtered_data[['NOC', 'Event']].drop_duplicates(), on='NOC', how='left')
    
    return participation_counts

# 3. 检查未获得奖牌的国家是否参与了2000年后首次获得奖牌的国家的项目
def add_participation_feature(filtered_data, first_medal_events):
    # 初始化新特征列
    filtered_data['ParticipatedInFirstMedalEventAfter2000'] = 0
    
    for noc in filtered_data['NOC'].unique():
        noc_data = filtered_data[filtered_data['NOC'] == noc]
        events_participated = set(noc_data['Event'])
        
        # 检查是否有参与2000年后首次获得奖牌的国家的项目
        for country, data in first_medal_events.items():
            events_won = set(data['Events'])
            if not events_participated.isdisjoint(events_won):  # 如果有交集
                filtered_data.loc[filtered_data['NOC'] == noc, 'ParticipatedInFirstMedalEventAfter2000'] = 1
                break
    
    return filtered_data

# 4. 筛选之前有相似情况但已经获得过奖牌的国家的数据
def find_similar_cases(data, no_medal_countries):
    medal_countries = data[~data['NOC'].isin(no_medal_countries) & (data['Medal'] != 'No medal')]
    
    first_medal_attempts = []
    for noc in medal_countries['NOC'].unique():
        noc_data = data[data['NOC'] == noc]
        first_medal_year = noc_data[noc_data['Medal'] != 'No medal']['Year'].min()
        if pd.isna(first_medal_year):
            continue  # 如果该国家没有获得过奖牌，跳过
        attempts_before_first_medal = len(noc_data[noc_data['Year'] < first_medal_year])
        first_medal_attempts.append({
            'NOC': noc,
            'ParticipationCount': attempts_before_first_medal,
            'AttemptsBeforeFirstMedal': attempts_before_first_medal,
            'WonMedal': 1  # 标记为已获得奖牌
        })
    
    similar_cases = pd.DataFrame(first_medal_attempts)
    return similar_cases

# 主函数
def main(athlete_data, no_medal_countries):
    # 预处理数据
    X, y, no_medal_participation_counts = preprocess_data(athlete_data, no_medal_countries)
    
    if X is None or y is None:
        return
    
    # 填补缺失值
    imputer = SimpleImputer(strategy='mean')  # 使用均值填补缺失值
    X_imputed = imputer.fit_transform(X)
    
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
    
    # 使用SMOTE进行过采样以平衡数据集
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    
    # 使用随机森林进行建模，并设置 class_weight 参数
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
    rf_model.fit(X_train_balanced, y_train_balanced)
    
    # 使用CalibratedClassifierCV进行概率校准
    calibrated_rf_model = CalibratedClassifierCV(estimator=rf_model, cv=5)
    calibrated_rf_model.fit(X_train_balanced, y_train_balanced)
    
    # 预测
    y_pred = calibrated_rf_model.predict(X_test)
    y_pred_proba = calibrated_rf_model.predict_proba(X_test)[:, 1]  # 获取正类的概率
    
    # 显示分类报告
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # 计算AUC-ROC分数
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC: {auc_roc:.4f}")
    
    # 预测从未获得过奖牌的国家在2028年获得第一枚奖牌的概率
    no_medal_predictions = calibrated_rf_model.predict_proba(
        imputer.transform(no_medal_participation_counts[['ParticipationCount', 'AttemptsBeforeFirstMedal', 'ParticipatedInFirstMedalEventAfter2000']])
    )[:, 1]
    
    # 打印每个国家的概率
    print("\nProbability of winning the first medal in 2028 for countries that have never won a medal:")
    unique_no_medal_predictions = np.unique(no_medal_predictions)
    unique_no_medal_participation_counts = no_medal_participation_counts.drop_duplicates(subset=['NOC'])
    for index, prob in enumerate(unique_no_medal_predictions):
        noc = unique_no_medal_participation_counts.iloc[index]['NOC']
        print(f"{noc}: Probability of winning the first medal in 2028: {prob*100:.6f}%")

main(athlete_data, no_medal_countries)



Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       473
           1       1.00      0.96      0.98        24

    accuracy                           1.00       497
   macro avg       1.00      0.98      0.99       497
weighted avg       1.00      1.00      1.00       497

AUC-ROC: 1.0000

Probability of winning the first medal in 2028 for countries that have never won a medal:
AND: Probability of winning the first medal in 2028: 0.263473%
ANG: Probability of winning the first medal in 2028: 0.263540%
ANT: Probability of winning the first medal in 2028: 0.263660%
ARU: Probability of winning the first medal in 2028: 0.263845%
ASA: Probability of winning the first medal in 2028: 0.263912%
BAN: Probability of winning the first medal in 2028: 0.265666%
BEN: Probability of winning the first medal in 2028: 0.265982%
BHU: Probability of winning the first medal in 2028: 0.266690%
BIH: Probability of winning the first m

In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import numpy as np

# 数据预处理函数
def preprocess_data(athlete_data, no_medal_countries):
    # 获取2000年后首次获得奖牌的国家及其参赛项目
    first_medal_events = find_countries_with_first_medal_after_2000(athlete_data)
    
    # 筛选从未获得过奖牌的国家的数据
    no_medal_participation_counts = filter_no_medal_countries(athlete_data, no_medal_countries)
    
    # 添加新特征：是否参与了2000年后首次获得奖牌的国家的项目
    no_medal_participation_counts = add_participation_feature(no_medal_participation_counts, first_medal_events)
    
    # 筛选之前有相似情况但已经获得过奖牌的国家的数据
    similar_cases = find_similar_cases(athlete_data, no_medal_countries)
    
    if similar_cases.empty:
        print("No similar cases found with medals. Cannot fit the model.")
        return None, None
    
    # 合并未获得奖牌的国家数据和已获得奖牌的国家数据
    combined_data = pd.concat([similar_cases, no_medal_participation_counts], ignore_index=True)
    
    # 特征和标签
    X = combined_data[['ParticipationCount', 'AttemptsBeforeFirstMedal', 'ParticipatedInFirstMedalEventAfter2000']]
    y = combined_data['WonMedal']
    
    return X, y, no_medal_participation_counts

# 1. 筛选之前未获得奖牌但在2000年后首次获得奖牌的国家的数据
def find_countries_with_first_medal_after_2000(athlete_data):
    all_countries = athlete_data['NOC'].unique()
    first_medal_events = {}
    
    for noc in all_countries:
        noc_data = athlete_data[athlete_data['NOC'] == noc]
        medal_data = noc_data[noc_data['Medal'] != 'No medal']
        
        if len(medal_data) > 0:
            first_medal_year = medal_data['Year'].min()
            if first_medal_year >= 2000:
                first_medal_events_this_year = medal_data[medal_data['Year'] == first_medal_year]['Event'].tolist()
                first_medal_events[noc] = {
                    'FirstMedalYear': first_medal_year,
                    'Events': first_medal_events_this_year
                }
    
    return first_medal_events

# 2. 筛选从未获得过奖牌的国家的数据
def filter_no_medal_countries(data, no_medal_countries):
    filtered_data = data[data['NOC'].isin(no_medal_countries)]
    participation_counts = filtered_data.groupby('NOC').size().reset_index(name='ParticipationCount')
    participation_counts['AttemptsBeforeFirstMedal'] = participation_counts['ParticipationCount']
    participation_counts['WonMedal'] = 0
    
    # 确保保留 'Event' 列
    participation_counts = participation_counts.merge(filtered_data[['NOC', 'Event']].drop_duplicates(), on='NOC', how='left')
    
    return participation_counts

# 3. 检查未获得奖牌的国家是否参与了2000年后首次获得奖牌的国家的项目
def add_participation_feature(filtered_data, first_medal_events):
    # 初始化新特征列
    filtered_data['ParticipatedInFirstMedalEventAfter2000'] = 0
    
    for noc in filtered_data['NOC'].unique():
        noc_data = filtered_data[filtered_data['NOC'] == noc]
        events_participated = set(noc_data['Event'])
        
        # 检查是否有参与2000年后首次获得奖牌的国家的项目
        for country, data in first_medal_events.items():
            events_won = set(data['Events'])
            if not events_participated.isdisjoint(events_won):  # 如果有交集
                filtered_data.loc[filtered_data['NOC'] == noc, 'ParticipatedInFirstMedalEventAfter2000'] = 1
                break
    
    return filtered_data

# 4. 筛选之前有相似情况但已经获得过奖牌的国家的数据
def find_similar_cases(data, no_medal_countries):
    medal_countries = data[~data['NOC'].isin(no_medal_countries) & (data['Medal'] != 'No medal')]
    
    first_medal_attempts = []
    for noc in medal_countries['NOC'].unique():
        noc_data = data[data['NOC'] == noc]
        first_medal_year = noc_data[noc_data['Medal'] != 'No medal']['Year'].min()
        if pd.isna(first_medal_year):
            continue  # 如果该国家没有获得过奖牌，跳过
        attempts_before_first_medal = len(noc_data[noc_data['Year'] < first_medal_year])
        first_medal_attempts.append({
            'NOC': noc,
            'ParticipationCount': attempts_before_first_medal,
            'AttemptsBeforeFirstMedal': attempts_before_first_medal,
            'WonMedal': 1  # 标记为已获得奖牌
        })
    
    similar_cases = pd.DataFrame(first_medal_attempts)
    return similar_cases

# 主函数
def main(athlete_data, no_medal_countries):
    # 预处理数据
    X, y, no_medal_participation_counts = preprocess_data(athlete_data, no_medal_countries)
    
    if X is None or y is None:
        return
    
    # 填补缺失值
    imputer = SimpleImputer(strategy='mean')  # 使用均值填补缺失值
    X_imputed = imputer.fit_transform(X)
    
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
    
    # 使用SMOTE进行过采样以平衡数据集
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    
    # 使用随机森林进行建模，并设置 class_weight 参数
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
    rf_model.fit(X_train_balanced, y_train_balanced)
    
    # 使用CalibratedClassifierCV进行概率校准
    calibrated_rf_model = CalibratedClassifierCV(estimator=rf_model, cv=5)
    calibrated_rf_model.fit(X_train_balanced, y_train_balanced)
    
    # 获取特征重要性评分
    importances = rf_model.feature_importances_

    # 打印特征重要性评分
    print("Feature Importance Analysis:")
    for feature, importance in zip(X.columns, importances):
        print(f"Feature: {feature}, Importance: {importance}")
    
    # 预测
    y_pred = calibrated_rf_model.predict(X_test)
    y_pred_proba = calibrated_rf_model.predict_proba(X_test)[:, 1]  # 获取正类的概率
    
    # 显示分类报告
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # 计算AUC-ROC分数
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC: {auc_roc:.4f}")
    
    # 预测从未获得过奖牌的国家在2028年获得第一枚奖牌的概率
    no_medal_predictions = calibrated_rf_model.predict_proba(
        imputer.transform(no_medal_participation_counts[['ParticipationCount', 'AttemptsBeforeFirstMedal', 'ParticipatedInFirstMedalEventAfter2000']])
    )[:, 1]
    
    # 打印每个国家的概率
    print("\nProbability of winning the first medal in 2028 for countries that have never won a medal:")
    unique_no_medal_predictions = np.unique(no_medal_predictions)
    unique_no_medal_participation_counts = no_medal_participation_counts.drop_duplicates(subset=['NOC'])
    for index, prob in enumerate(unique_no_medal_predictions):
        noc = unique_no_medal_participation_counts.iloc[index]['NOC']
        print(f"{noc}: Probability of winning the first medal in 2028: {prob*100:.6f}%")

# 调用主函数
main(athlete_data, no_medal_countries)


Feature Importance Analysis:
Feature: ParticipationCount, Importance: 0.20187816571060035
Feature: AttemptsBeforeFirstMedal, Importance: 0.20135544420173615
Feature: ParticipatedInFirstMedalEventAfter2000, Importance: 0.5967663900876636
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       473
           1       1.00      0.96      0.98        24

    accuracy                           1.00       497
   macro avg       1.00      0.98      0.99       497
weighted avg       1.00      1.00      1.00       497

AUC-ROC: 1.0000

Probability of winning the first medal in 2028 for countries that have never won a medal:
AND: Probability of winning the first medal in 2028: 0.263473%
ANG: Probability of winning the first medal in 2028: 0.263540%
ANT: Probability of winning the first medal in 2028: 0.263660%
ARU: Probability of winning the first medal in 2028: 0.263845%
ASA: Probability of winning the first medal in 2028: 0

In [22]:
import pandas as pd
import numpy as np
from scipy.stats import nbinom


# 1. 筛选从未获得过奖牌的国家的数据
def filter_no_medal_countries(data, no_medal_countries):
    filtered_data = data[data['NOC'].isin(no_medal_countries)]
    # 计算每个国家参加奥运会的次数
    participation_counts = filtered_data.groupby('NOC').size().reset_index(name='ParticipationCount')
    return participation_counts

# 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
def find_similar_cases(data, no_medal_countries):
    # 过滤出已经获得过奖牌的国家
    medal_countries = data[~data['NOC'].isin(no_medal_countries) & (data['Medal'] != 'No medal')]
    
    # 计算每个国家从首次参赛到首次获得奖牌的尝试次数
    first_medal_attempts = []
    for noc in medal_countries['NOC'].unique():
        noc_data = data[data['NOC'] == noc]
        first_medal_year = noc_data[noc_data['Medal'] != 'No medal']['Year'].min()
        if pd.isna(first_medal_year):
            continue  # 如果该国家没有获得过奖牌，跳过
        attempts_before_first_medal = len(noc_data[noc_data['Year'] < first_medal_year])
        first_medal_attempts.append({
            'NOC': noc,
            'AttemptsBeforeFirstMedal': attempts_before_first_medal
        })
    
    similar_cases = pd.DataFrame(first_medal_attempts)
    return similar_cases

# 3. 使用负二项分布模型进行建模和预测
def fit_negative_binomial_model(similar_cases):
    # 提取尝试次数作为训练数据
    if 'AttemptsBeforeFirstMedal' not in similar_cases.columns:
        raise ValueError("Column 'AttemptsBeforeFirstMedal' not found in similar_cases DataFrame")
    attempts = similar_cases['AttemptsBeforeFirstMedal'].values
    
    # 估计负二项分布的参数
    # 使用方法 of moments 来估计参数 r 和 p
    mean_attempts = np.mean(attempts)
    var_attempts = np.var(attempts)
    
    r = (mean_attempts ** 2) / (var_attempts - mean_attempts)
    p = mean_attempts / var_attempts
    
    return r, p

def predict_probabilities(participation_counts, r, p):
    probabilities = {}
    for index, row in participation_counts.iterrows():
        noc = row['NOC']
        participation_count = row['ParticipationCount']
        prob = nbinom.pmf(participation_count, r, p)
        probabilities[noc] = prob
    return probabilities

# 主函数
def main(athlete_data, no_medal_countries):
    # 1. 筛选从未获得过奖牌的国家的数据
    no_medal_participation_counts = filter_no_medal_countries(athlete_data, no_medal_countries)
    
    # 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
    similar_cases = find_similar_cases(athlete_data, no_medal_countries)
    
    # 检查 similar_cases 是否为空
    if similar_cases.empty:
        print("No similar cases found with medals. Cannot fit the negative binomial model.")
        return
    
    # 3. 使用负二项分布模型进行建模
    try:
        r, p = fit_negative_binomial_model(similar_cases)
    except ValueError as e:
        print(e)
        return
    
    # 4. 预测从未获得过奖牌的国家在2028年获得第一枚奖牌的概率
    probabilities = predict_probabilities(no_medal_participation_counts, r, p)
    
    # 显示结果
    for noc, prob in probabilities.items():
        print(f"{noc}: Probability of winning the first medal in 2028: {prob:.4f}")

# 调用主函数
main(athlete_data, no_medal_countries)

AND: Probability of winning the first medal in 2028: 0.0037
ANG: Probability of winning the first medal in 2028: 0.0007
ANT: Probability of winning the first medal in 2028: 0.0011
ARU: Probability of winning the first medal in 2028: 0.0040
ASA: Probability of winning the first medal in 2028: 0.0051
BAN: Probability of winning the first medal in 2028: 0.0029
BEN: Probability of winning the first medal in 2028: 0.0021
BHU: Probability of winning the first medal in 2028: 0.0080
BIH: Probability of winning the first medal in 2028: 0.0017
BIZ: Probability of winning the first medal in 2028: 0.0030
BOL: Probability of winning the first medal in 2028: 0.0008
BRU: Probability of winning the first medal in 2028: 0.0135
CAF: Probability of winning the first medal in 2028: 0.0033
CAM: Probability of winning the first medal in 2028: 0.0030
CAY: Probability of winning the first medal in 2028: 0.0023
CGO: Probability of winning the first medal in 2028: 0.0018
CHA: Probability of winning the first me

In [14]:
# Random Forest Model
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. 筛选从未获得过奖牌的国家的数据
def filter_no_medal_countries(data, no_medal_countries):
    filtered_data = data[data['NOC'].isin(no_medal_countries)]
    # 计算每个国家参加奥运会的次数
    participation_counts = filtered_data.groupby('NOC').size().reset_index(name='ParticipationCount')
    # 假设这些国家的 AttemptsBeforeFirstMedal 等于 ParticipationCount
    participation_counts['AttemptsBeforeFirstMedal'] = participation_counts['ParticipationCount']
    return participation_counts

# 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
def find_similar_cases(data, no_medal_countries):
    # 过滤出已经获得过奖牌的国家
    medal_countries = data[~data['NOC'].isin(no_medal_countries) & (data['Medal'] != 'No medal')]
    
    # 计算每个国家从首次参赛到首次获得奖牌的尝试次数
    first_medal_attempts = []
    for noc in medal_countries['NOC'].unique():
        noc_data = data[data['NOC'] == noc]
        first_medal_year = noc_data[noc_data['Medal'] != 'No medal']['Year'].min()
        if pd.isna(first_medal_year):
            continue  # 如果该国家没有获得过奖牌，跳过
        attempts_before_first_medal = len(noc_data[noc_data['Year'] < first_medal_year])
        first_medal_attempts.append({
            'NOC': noc,
            'AttemptsBeforeFirstMedal': attempts_before_first_medal,
            'WonMedal': 1  # 标记为已获得奖牌
        })
    
    similar_cases = pd.DataFrame(first_medal_attempts)
    return similar_cases

# 主函数
def main(athlete_data, no_medal_countries):
    # 1. 筛选从未获得过奖牌的国家的数据
    no_medal_participation_counts = filter_no_medal_countries(athlete_data, no_medal_countries)
    
    # 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
    similar_cases = find_similar_cases(athlete_data, no_medal_countries)
    
    # 检查 similar_cases 是否为空
    if similar_cases.empty:
        print("No similar cases found with medals. Cannot fit the model.")
        return
    
    # 添加未获得奖牌的国家数据
    no_medal_participation_counts['WonMedal'] = 0
    combined_data = pd.concat([similar_cases, no_medal_participation_counts], ignore_index=True)
    
    # 特征和标签
    X = combined_data[['ParticipationCount', 'AttemptsBeforeFirstMedal']]
    y = combined_data['WonMedal']
    
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 使用随机森林进行建模
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # 预测
    y_pred = rf_model.predict_proba(X_test)[:, 1]  # 获取正类的概率
    
    # 显示结果
    print(classification_report(y_test, rf_model.predict(X_test)))
    
    # 预测从未获得过奖牌的国家在2028年获得第一枚奖牌的概率
    no_medal_predictions = rf_model.predict_proba(no_medal_participation_counts[['ParticipationCount', 'AttemptsBeforeFirstMedal']])[:, 1]
    for index, prob in enumerate(no_medal_predictions):
        noc = no_medal_participation_counts.iloc[index]['NOC']
        print(f"{noc}: Probability of winning the first medal in 2028: {prob:.4f}")

# 调用主函数
main(athlete_data, no_medal_countries)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        29

    accuracy                           1.00        44
   macro avg       1.00      1.00      1.00        44
weighted avg       1.00      1.00      1.00        44

AND: Probability of winning the first medal in 2028: 0.0133
ANG: Probability of winning the first medal in 2028: 0.0300
ANT: Probability of winning the first medal in 2028: 0.0100
ARU: Probability of winning the first medal in 2028: 0.0100
ASA: Probability of winning the first medal in 2028: 0.0100
BAN: Probability of winning the first medal in 2028: 0.0100
BEN: Probability of winning the first medal in 2028: 0.0000
BHU: Probability of winning the first medal in 2028: 0.0000
BIH: Probability of winning the first medal in 2028: 0.0000
BIZ: Probability of winning the first medal in 2028: 0.0100
BOL: Probability of winning the first medal in 2028: 0.0000
BRU: Probabil

In [15]:
# Random Forest Model with Evaluation 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# 1. 筛选从未获得过奖牌的国家的数据
def filter_no_medal_countries(data, no_medal_countries):
    filtered_data = data[data['NOC'].isin(no_medal_countries)]
    # 计算每个国家参加奥运会的次数
    participation_counts = filtered_data.groupby('NOC').size().reset_index(name='ParticipationCount')
    # 假设这些国家的 AttemptsBeforeFirstMedal 等于 ParticipationCount
    participation_counts['AttemptsBeforeFirstMedal'] = participation_counts['ParticipationCount']
    # 标记为未获得奖牌
    participation_counts['WonMedal'] = 0
    return participation_counts

# 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
def find_similar_cases(data, no_medal_countries):
    # 过滤出已经获得过奖牌的国家
    medal_countries = data[~data['NOC'].isin(no_medal_countries) & (data['Medal'] != 'No medal')]
    
    # 计算每个国家从首次参赛到首次获得奖牌的尝试次数
    first_medal_attempts = []
    for noc in medal_countries['NOC'].unique():
        noc_data = data[data['NOC'] == noc]
        first_medal_year = noc_data[noc_data['Medal'] != 'No medal']['Year'].min()
        if pd.isna(first_medal_year):
            continue  # 如果该国家没有获得过奖牌，跳过
        attempts_before_first_medal = len(noc_data[noc_data['Year'] < first_medal_year])
        first_medal_attempts.append({
            'NOC': noc,
            'ParticipationCount': attempts_before_first_medal,
            'AttemptsBeforeFirstMedal': attempts_before_first_medal,
            'WonMedal': 1  # 标记为已获得奖牌
        })
    
    similar_cases = pd.DataFrame(first_medal_attempts)
    return similar_cases

# 主函数
def main(athlete_data, no_medal_countries):
    # 1. 筛选从未获得过奖牌的国家的数据
    no_medal_participation_counts = filter_no_medal_countries(athlete_data, no_medal_countries)
    
    # 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
    similar_cases = find_similar_cases(athlete_data, no_medal_countries)
    
    # 检查 similar_cases 是否为空
    if similar_cases.empty:
        print("No similar cases found with medals. Cannot fit the model.")
        return
    
    # 合并未获得奖牌的国家数据和已获得奖牌的国家数据
    combined_data = pd.concat([similar_cases, no_medal_participation_counts], ignore_index=True)
    
    # 特征和标签
    X = combined_data[['ParticipationCount', 'AttemptsBeforeFirstMedal']]
    y = combined_data['WonMedal']
    
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 使用随机森林进行建模
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # 预测
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # 获取正类的概率
    
    # 显示分类报告
    print(classification_report(y_test, y_pred))
    
    # 计算AUC-ROC分数
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC: {auc_roc:.4f}")
    
    # 预测从未获得过奖牌的国家在2028年获得第一枚奖牌的概率
    no_medal_predictions = rf_model.predict_proba(no_medal_participation_counts[['ParticipationCount', 'AttemptsBeforeFirstMedal']])[:, 1]
    for index, prob in enumerate(no_medal_predictions):
        noc = no_medal_participation_counts.iloc[index]['NOC']
        print(f"{noc}: Probability of winning the first medal in 2028: {prob:.4f}")

main(athlete_data, no_medal_countries)

              precision    recall  f1-score   support

           0       0.44      0.73      0.55        15
           1       0.79      0.52      0.62        29

    accuracy                           0.59        44
   macro avg       0.61      0.63      0.59        44
weighted avg       0.67      0.59      0.60        44

AUC-ROC: 0.6678
AND: Probability of winning the first medal in 2028: 0.2508
ANG: Probability of winning the first medal in 2028: 0.2615
ANT: Probability of winning the first medal in 2028: 0.2900
ARU: Probability of winning the first medal in 2028: 0.4551
ASA: Probability of winning the first medal in 2028: 0.4337
BAN: Probability of winning the first medal in 2028: 0.1575
BEN: Probability of winning the first medal in 2028: 0.0400
BHU: Probability of winning the first medal in 2028: 0.0800
BIH: Probability of winning the first medal in 2028: 0.0000
BIZ: Probability of winning the first medal in 2028: 0.3820
BOL: Probability of winning the first medal in 2028: 0.51

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE

# 1. 筛选从未获得过奖牌的国家的数据
def filter_no_medal_countries(data, no_medal_countries):
    filtered_data = data[data['NOC'].isin(no_medal_countries)]
    # 计算每个国家参加奥运会的次数
    participation_counts = filtered_data.groupby('NOC').size().reset_index(name='ParticipationCount')
    # 假设这些国家的 AttemptsBeforeFirstMedal 等于 ParticipationCount
    participation_counts['AttemptsBeforeFirstMedal'] = participation_counts['ParticipationCount']
    # 标记为未获得奖牌
    participation_counts['WonMedal'] = 0
    return participation_counts

# 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
def find_similar_cases(data, no_medal_countries):
    # 过滤出已经获得过奖牌的国家
    medal_countries = data[~data['NOC'].isin(no_medal_countries) & (data['Medal'] != 'No medal')]
    
    # 计算每个国家从首次参赛到首次获得奖牌的尝试次数
    first_medal_attempts = []
    for noc in medal_countries['NOC'].unique():
        noc_data = data[data['NOC'] == noc]
        first_medal_year = noc_data[noc_data['Medal'] != 'No medal']['Year'].min()
        if pd.isna(first_medal_year):
            continue  # 如果该国家没有获得过奖牌，跳过
        attempts_before_first_medal = len(noc_data[noc_data['Year'] < first_medal_year])
        first_medal_attempts.append({
            'NOC': noc,
            'ParticipationCount': attempts_before_first_medal,
            'AttemptsBeforeFirstMedal': attempts_before_first_medal,
            'WonMedal': 1  # 标记为已获得奖牌
        })
    
    similar_cases = pd.DataFrame(first_medal_attempts)
    return similar_cases

# 主函数
def main(athlete_data, no_medal_countries):
    # 1. 筛选从未获得过奖牌的国家的数据
    no_medal_participation_counts = filter_no_medal_countries(athlete_data, no_medal_countries)
    
    # 2. 筛选之前有相似情况但已经获得过奖牌的国家的数据
    similar_cases = find_similar_cases(athlete_data, no_medal_countries)
    
    # 检查 similar_cases 是否为空
    if similar_cases.empty:
        print("No similar cases found with medals. Cannot fit the model.")
        return
    
    # 合并未获得奖牌的国家数据和已获得奖牌的国家数据
    combined_data = pd.concat([similar_cases, no_medal_participation_counts], ignore_index=True)
    
    # 特征和标签
    X = combined_data[['ParticipationCount', 'AttemptsBeforeFirstMedal']]
    y = combined_data['WonMedal']
    
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 使用SMOTE进行过采样以平衡数据集
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    
    # 使用随机森林进行建模，并设置 class_weight 参数
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
    rf_model.fit(X_train_balanced, y_train_balanced)
    
    # 使用CalibratedClassifierCV进行概率校准
    calibrated_rf_model = CalibratedClassifierCV(estimator=rf_model, cv=5)
    calibrated_rf_model.fit(X_train_balanced, y_train_balanced)
    
    # 预测
    y_pred = calibrated_rf_model.predict(X_test)
    y_pred_proba = calibrated_rf_model.predict_proba(X_test)[:, 1]  # 获取正类的概率
    
    # 显示分类报告
    print(classification_report(y_test, y_pred))
    
    # 计算AUC-ROC分数
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC: {auc_roc:.4f}")
    
    # 预测从未获得过奖牌的国家在2028年获得第一枚奖牌的概率
    no_medal_predictions = calibrated_rf_model.predict_proba(no_medal_participation_counts[['ParticipationCount', 'AttemptsBeforeFirstMedal']])[:, 1]
    for index, prob in enumerate(no_medal_predictions):
        noc = no_medal_participation_counts.iloc[index]['NOC']
        print(f"{noc}: Probability of winning the first medal in 2028: {prob:.4f}")


main(athlete_data, no_medal_countries)

              precision    recall  f1-score   support

           0       0.41      0.73      0.52        15
           1       0.76      0.45      0.57        29

    accuracy                           0.55        44
   macro avg       0.59      0.59      0.54        44
weighted avg       0.64      0.55      0.55        44

AUC-ROC: 0.7207
AND: Probability of winning the first medal in 2028: 0.2416
ANG: Probability of winning the first medal in 2028: 0.3059
ANT: Probability of winning the first medal in 2028: 0.1700
ARU: Probability of winning the first medal in 2028: 0.4087
ASA: Probability of winning the first medal in 2028: 0.2761
BAN: Probability of winning the first medal in 2028: 0.2171
BEN: Probability of winning the first medal in 2028: 0.1647
BHU: Probability of winning the first medal in 2028: 0.1723
BIH: Probability of winning the first medal in 2028: 0.1618
BIZ: Probability of winning the first medal in 2028: 0.2880
BOL: Probability of winning the first medal in 2028: 0.44

In [17]:
import pandas as pd

# 1. 筛选之前未获得奖牌但在2000年后首次获得奖牌的国家的数据
def find_countries_with_first_medal_after_2000(athlete_data):
    # 获取所有国家列表
    all_countries = athlete_data['NOC'].unique()
    
    # 初始化一个字典来存储结果
    first_medal_events = {}
    
    for noc in all_countries:
        noc_data = athlete_data[athlete_data['NOC'] == noc]
        
        # 过滤出获得奖牌的数据
        medal_data = noc_data[noc_data['Medal'] != 'No medal']
        
        if len(medal_data) > 0:  # 如果该国家有获得奖牌的记录
            # 找到首次获得奖牌的年份
            first_medal_year = medal_data['Year'].min()
            
            # 只考虑首次获得奖牌的年份在2000年或之后的国家
            if first_medal_year >= 2000:
                # 获取首次获得奖牌的项目
                first_medal_events_this_year = medal_data[medal_data['Year'] == first_medal_year]['Event'].tolist()
                
                # 存储结果
                first_medal_events[noc] = {
                    'FirstMedalYear': first_medal_year,
                    'Events': first_medal_events_this_year
                }
    
    return first_medal_events

# 主函数
def main(athlete_data):
    # 获取首次获得奖牌的国家及其项目的统计数据
    first_medal_events = find_countries_with_first_medal_after_2000(athlete_data)
    
    # 输出结果
    for noc, data in first_medal_events.items():
        print(f"Country: {noc}")
        print(f"First Medal Year: {data['FirstMedalYear']}")
        print("Events where medals were won:")
        for event in data['Events']:
            print(f"- {event}")
        print("\n")


main(athlete_data)

Country: SUD
First Medal Year: 2008
Events where medals were won:
- Athletics Men's 800 metres


Country: KUW
First Medal Year: 2000
Events where medals were won:
- Shooting Men's Double Trap


Country: BRN
First Medal Year: 2012
Events where medals were won:
- Athletics Women's 1,500 metres


Country: ERI
First Medal Year: 2004
Events where medals were won:
- Athletics Men's 10,000 metres


Country: JOR
First Medal Year: 2016
Events where medals were won:
- Taekwondo Men's Featherweight


Country: KSA
First Medal Year: 2000
Events where medals were won:
- Equestrianism Mixed Jumping, Individual
- Athletics Men's 400 metres Hurdles


Country: UAE
First Medal Year: 2004
Events where medals were won:
- Shooting Men's Double Trap


Country: KGZ
First Medal Year: 2000
Events where medals were won:
- Judo Men's Extra-Lightweight


Country: TJK
First Medal Year: 2008
Events where medals were won:
- Wrestling Men's Light-Heavyweight, Freestyle
- Judo Men's Lightweight


Country: AFG
First Med

In [9]:
print("Number of no medal countries:", len(no_medal_countries))
print("First few rows of athlete data:")
print(athlete_data.head())

Number of no medal countries: 77
First few rows of athlete data:
                    Name Sex            Team  NOC  Year         City  \
0              A Dijiang   M           China  CHN  1992    Barcelona   
1               A Lamusi   M           China  CHN  2012       London   
2            Gunnar Aaby   M         Denmark  DEN  1920    Antwerpen   
3            Edgar Aabye   M  Denmark/Sweden  DEN  1900        Paris   
4  Cornelia (-strannood)   F     Netherlands  NED  1932  Los Angeles   

        Sport                         Event     Medal  
0  Basketball   Basketball Men's Basketball  No medal  
1        Judo  Judo Men's Extra-Lightweight  No medal  
2    Football       Football Men's Football  No medal  
3  Tug-Of-War   Tug-Of-War Men's Tug-Of-War      Gold  
4   Athletics  Athletics Women's 100 metres  No medal  


In [65]:
no_medal_countries = pd.read_csv('No_Medal_Countries.csv') 
no_medal_countries[no_medal_countries['0'] == 0].loc[:, 'NOC']

5      AND
6      ANG
7      ANT
11     ARU
12     ASA
      ... 
225    VIN
226    VNM
228    YAR
229    YEM
230    YMD
Name: NOC, Length: 77, dtype: object