In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import MDS, TSNE

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

import seaborn as sns
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

sub_df = pd.read_csv("sample_submission.csv")

In [3]:
train_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,76518.0,,,,38258.5,22088.988286,0.0,19129.25,38258.5,57387.75,76517.0
Marital status,76518.0,,,,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,,,,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,,,,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,,,,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,,,,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,,,,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,,,,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,,,,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,,,,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [5]:
train_df.nunique().sort_values()

Gender                                                2
International                                         2
Scholarship holder                                    2
Tuition fees up to date                               2
Debtor                                                2
Daytime/evening attendance                            2
Educational special needs                             2
Displaced                                             2
Target                                                3
Marital status                                        6
Application order                                     8
Unemployment rate                                    11
Curricular units 2nd sem (without evaluations)       11
GDP                                                  11
Curricular units 1st sem (without evaluations)       12
Inflation rate                                       13
Nacionality                                          18
Course                                          

In [6]:
feature_list = [feature for feature in train_df.columns if  feature  != "Target"]

target = "Target"

binary_features = ['Scholarship holder','International','Gender','Tuition fees up to date',
                   'Daytime/evening attendance','Debtor','Educational special needs','Displaced']

#from data set description
categorical_features = ['Marital status', 'Unemployment rate', 'Curricular units 2nd sem (without evaluations)', 'GDP', 
                        'Curricular units 1st sem (without evaluations)', 'Inflation rate', 'Nacionality', 'Course',
                        'Curricular units 2nd sem (credited)', 'Previous qualification', 'Curricular units 2nd sem (approved)',
                        'Curricular units 1st sem (credited)', 'Application mode', 'Curricular units 2nd sem (enrolled)', 
                        'Curricular units 1st sem (approved)', 'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
                        "Mother's qualification", 'Curricular units 1st sem (evaluations)', "Father's qualification", "Mother's occupation",
                        'Age at enrollment', "Father's occupation"]

train_df[categorical_features] = train_df[categorical_features].astype('category')
test_df[categorical_features] = test_df[categorical_features].astype('category')


numeric_features = list(set(feature_list) - set(binary_features)- set(categorical_features))

assert sorted(feature_list) == sorted(numeric_features + binary_features + categorical_features)

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

sc = StandardScaler()
oe = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', sc, numeric_features),
        ('cat', oe, categorical_features),
        ('bin', 'passthrough', binary_features)
    ])

In [8]:
# 前処理の実行
data_preprocessed = preprocessor.fit_transform(train_df)
test_preprocessed = preprocessor.transform(test_df)

# 疎行列を密行列に変換し、書き込み可能なコピーを作成
if hasattr(data_preprocessed, 'toarray'):
    data_preprocessed = data_preprocessed.toarray()
else:
    data_preprocessed = np.array(data_preprocessed, copy=True)

if hasattr(test_preprocessed, 'toarray'):
    test_preprocessed = test_preprocessed.toarray()
else:
    test_preprocessed = np.array(test_preprocessed, copy=True)



In [9]:
# OneHotEncoderによって生成された特徴量名を取得
encoded_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

# 全ての特徴量名を結合
all_features = numeric_features + list(encoded_features) + binary_features

# 前処理されたデータをDataFrameに変換
df_preprocessed = pd.DataFrame(data_preprocessed, columns=all_features)
test_preprocessed = pd.DataFrame(test_preprocessed, columns=all_features)

In [10]:
X = df_preprocessed.drop('id', axis=1)
y = train_df.loc[:, 'Target']

label_mapping = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
encoded_y = y.map(label_mapping)

In [13]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, encoded_y, test_size=0.2, random_state=42)

# クラス重みの計算
class_counts = np.bincount(y_train)
class_weights = {i: 1.0 / count for i, count in enumerate(class_counts) if count > 0}

# パラメータ設定
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'is_unbalance': True  # クラス不均衡に対応
}

# LightGBMのデータセットに変換
train_data = lgb.Dataset(X_train, label=y_train, weight=np.vectorize(class_weights.get)(y_train))
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# 早期停止用のコールバック設定
callbacks = [lgb.early_stopping(stopping_rounds=10)]

# モデルの訓練
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, callbacks=callbacks)

# 予測
y_pred = lgb_model.predict(X_val)
y_pred_max = [np.argmax(line) for line in y_pred]

# 評価
accuracy = accuracy_score(y_val, y_pred_max)
print(f'LightGBM Validation Accuracy: {accuracy}')
print(classification_report(y_val, y_pred_max))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 333
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[395]	training's multi_logloss: 0.405564	valid_1's multi_logloss: 0.453387
LightGBM Validation Accuracy: 0.8224647151071616
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      5028
           1       0.58      0.74      0.65      3017
           2       0.89      0.86      0.88      7259

    accuracy                           0.82   

In [16]:
test_preprocessed = test_preprocessed.drop('id', axis=1)

# モデルの予測結果を取得
test_pred = lgb_model.predict(test_preprocessed)

In [18]:
# ターゲット値に変換
target_mapping = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
final_predictions = [target_mapping[np.argmax(pred)] for pred in test_pred]

In [19]:
# 結果をデータフレームに格納
results_df = pd.DataFrame(data={"id":test_df.id, "Target": final_predictions})

print(results_df.head())

      id    Target
0  76518   Dropout
1  76519  Graduate
2  76520  Graduate
3  76521  Graduate
4  76522  Enrolled


In [20]:
# 結果を保存
results_df.to_csv('submission4.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
