In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split,RandomizedSearchCV,StratifiedKFold
import lightgbm as lgb
from scipy.stats import uniform,randint
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # ランダムフォレストによる分類器

In [61]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be moved to another machine by editing a single line.
DATA_DIR = "C:/Users/sngk2/OneDrive/デスクトップ/GCI/conpe2"

# Labelled training rows and the unlabelled rows to be scored.
df = pd.read_csv(f"{DATA_DIR}/train (1).csv")
df_test = pd.read_csv(f"{DATA_DIR}/test (1).csv")
df.head()

Unnamed: 0,Id,Year,Age,School,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,Player_Type,Position_Type,Position,Drafted
0,0,2011,21.0,Lehigh,1.905,140.160042,5.39,59.69,29.0,251.46,7.91,4.94,offense,offensive_lineman,OG,1.0
1,1,2011,24.0,Abilene Christian,1.8288,87.089735,4.31,101.6,16.0,332.74,,,offense,backs_receivers,WR,1.0
2,2,2018,21.0,Colorado St.,1.8542,92.986436,4.51,91.44,10.0,309.88,6.95,4.37,offense,backs_receivers,WR,1.0
3,3,2010,21.0,East Carolina,1.9304,148.778297,5.09,76.2,39.0,254.0,8.12,4.71,defense,defensive_lineman,DT,1.0
4,4,2016,21.0,California,1.8796,92.079251,4.64,78.74,,281.94,7.13,4.2,offense,backs_receivers,WR,1.0


In [62]:
# Number of missing values in each column of the training frame.
print(df.isna().sum())

Id                    0
Year                  0
Age                 435
School                0
Height                0
Weight                0
Sprint_40yd         145
Vertical_Jump       554
Bench_Press_Reps    721
Broad_Jump          581
Agility_3cone       970
Shuttle             912
Player_Type           0
Position_Type         0
Position              0
Drafted               0
dtype: int64


In [63]:
# Integer-encode the categorical columns, keeping the fitted encoder of each
# column in `label_encoders` so codes can be mapped back to labels later.
# NOTE: this cell overwrites df / df_test, so run it once per kernel; a second
# run would encode the integer codes as strings and change the numbering.
label_encoders = {}
for c in ["Player_Type", "Position_Type", "Position"]:
    train_labels = df[c].astype(str)
    test_labels = df_test[c].astype(str)

    # Fit on the labels of both frames: transform() raises ValueError for a
    # label it was not fitted on, so a category that occurs only in the test
    # file would otherwise abort the cell. When the test file has no new
    # label, the resulting codes are the same as when fitting on train alone.
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_labels, test_labels], ignore_index=True))

    df[c] = encoder.transform(train_labels)
    df_test[c] = encoder.transform(test_labels)
    label_encoders[c] = encoder

In [64]:
# Columns whose missing values are replaced by the training-set mean.
cols_to_fill = ['Age', 'Sprint_40yd', 'Vertical_Jump', 'Bench_Press_Reps',
                'Broad_Jump', 'Agility_3cone', 'Shuttle']

# Columns removed before modelling. Player_Type, Position_Type and Position
# are deliberately kept as features.
cols_to_drop = ['School', 'Id']


def prepare_features(frame, fill_values):
    """Return a new frame with gaps filled, BMI added and unused columns removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input rows; must contain the 'Weight' and 'Height' columns.
    fill_values : pandas.Series
        Replacement for missing values, indexed by column name. Columns that
        are not listed are left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``frame``; the input is not modified.
    """
    return (
        frame
        .fillna(fill_values)
        .assign(BMI=lambda d: d['Weight'] / (d['Height'] ** 2))
        # errors='ignore' keeps the cell re-runnable after the columns are gone.
        .drop(columns=cols_to_drop, errors='ignore')
    )


# The means come from the training frame only and are applied to both frames,
# so no statistic is computed from the test rows.
train_means = df[cols_to_fill].mean()
df = prepare_features(df, train_means)
df_test = prepare_features(df_test, train_means)

print(df.head())
print(df.isnull().sum())
print(df_test.isnull().sum())

   Year   Age  Height      Weight  Sprint_40yd  Vertical_Jump  \
0  2011  21.0  1.9050  140.160042         5.39          59.69   
1  2011  24.0  1.8288   87.089735         4.31         101.60   
2  2018  21.0  1.8542   92.986436         4.51          91.44   
3  2010  21.0  1.9304  148.778297         5.09          76.20   
4  2016  21.0  1.8796   92.079251         4.64          78.74   

   Bench_Press_Reps  Broad_Jump  Agility_3cone   Shuttle  Player_Type  \
0         29.000000      251.46       7.910000  4.940000            1   
1         16.000000      332.74       7.230447  4.399422            1   
2         10.000000      309.88       6.950000  4.370000            1   
3         39.000000      254.00       8.120000  4.710000            0   
4         20.236408      281.94       7.130000  4.200000            1   

   Position_Type  Position  Drafted        BMI  
0              5        10      1.0  38.621956  
1              0        19      1.0  26.039614  
2              0       

In [65]:
# Separate the target from the feature columns.
y = df["Drafted"]
X = df.drop(columns="Drafted")

# Hold out 30% of the rows for validation. The seed makes the split the same
# on every run, and stratifying keeps the Drafted ratio equal in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)



In [68]:
model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=5,          # matched to the RandomForest baseline
    learning_rate=0.1,    # library default
    subsample=1.0,        # alias of bagging_fraction
    colsample_bytree=1.0, # alias of feature_fraction
    random_state=42
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def cross_validate_auc(estimator, features, target, cv):
    """Fit `estimator` on every fold of `cv` and return the per-fold ROC AUC.

    Parameters
    ----------
    estimator : classifier exposing fit() and predict_proba()
        Fitted once per fold; on return it holds the fit of the last fold.
    features : pandas.DataFrame
        Feature rows, indexed positionally with ``.iloc``.
    target : pandas.Series
        Binary target aligned row-by-row with ``features``.
    cv : splitter exposing split(features, target)
        Yields (train positions, validation positions) for each fold.

    Returns
    -------
    list of float
        One validation AUC per fold, in fold order.
    """
    fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(cv.split(features, target)):
        print(f"Fold {fold + 1}")

        estimator.fit(features.iloc[train_idx], target.iloc[train_idx])

        # AUC is computed from the probability of the positive class.
        valid_proba = estimator.predict_proba(features.iloc[valid_idx])[:, 1]
        auc = roc_auc_score(target.iloc[valid_idx], valid_proba)
        fold_scores.append(auc)
        print(f"  AUC: {round(auc, 4)}")
    return fold_scores


auc_scores = cross_validate_auc(model, X, y, skf)

# Report the mean AUC over the folds.
mean_auc = np.mean(auc_scores)
print("\nAverage Validation AUC:", round(mean_auc, 4))

# After the loop `model` only holds the fit of the last fold (4/5 of the rows).
# Refit on every labelled row so the model used for the submission has seen
# all of the training data.
model.fit(X, y)

# The RandomForest baseline can be scored on the same folds with:
# cross_validate_auc(RandomForestClassifier(n_estimators=100, max_depth=5, random_state=2025), X, y, skf)


Fold 1
  AUC: 0.8063
Fold 2
  AUC: 0.8326
Fold 3
  AUC: 0.8286
Fold 4
  AUC: 0.7859
Fold 5
  AUC: 0.8369

Average Validation AUC: 0.8181


In [69]:
# Score the test rows. Validation in this notebook uses ROC AUC on predicted
# probabilities, so the submission carries the positive-class probability
# instead of the hard 0/1 labels that predict() returns.
# Columns are selected by name so they match the training features in order.
lgb_pred = model.predict_proba(df_test[X.columns])[:, 1]

# Running this cell writes the submission CSV.
# Change SUBMISSION_DIR as needed for the machine the notebook runs on.
SUBMISSION_DIR = 'C:/Users/sngk2/OneDrive/デスクトップ/GCI/conpe2'
submission = pd.read_csv(f'{SUBMISSION_DIR}/sample_submission.csv')
submission['Drafted'] = lgb_pred

submission.to_csv(f'{SUBMISSION_DIR}/baseline_submission.csv', index=False)