In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import MDS, TSNE

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

import seaborn as sns
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train_df = pd.read_csv("../train.csv")
test_df = pd.read_csv("../test.csv")

sub_df = pd.read_csv("../sample_submission.csv")

In [3]:
train_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,76518.0,,,,38258.5,22088.988286,0.0,19129.25,38258.5,57387.75,76517.0
Marital status,76518.0,,,,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,,,,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,,,,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,,,,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,,,,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,,,,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,,,,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,,,,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,,,,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [5]:
train_df.nunique().sort_values()

Gender                                                2
International                                         2
Scholarship holder                                    2
Tuition fees up to date                               2
Debtor                                                2
Daytime/evening attendance                            2
Educational special needs                             2
Displaced                                             2
Target                                                3
Marital status                                        6
Application order                                     8
Unemployment rate                                    11
Curricular units 2nd sem (without evaluations)       11
GDP                                                  11
Curricular units 1st sem (without evaluations)       12
Inflation rate                                       13
Nacionality                                          18
Course                                          

In [6]:
feature_list = [feature for feature in train_df.columns if  feature  != "Target"]

target = "Target"

binary_features = ['Scholarship holder','International','Gender','Tuition fees up to date',
                   'Daytime/evening attendance','Debtor','Educational special needs','Displaced']

#from data set description
categorical_features = ['Marital status', 'Unemployment rate', 'Curricular units 2nd sem (without evaluations)', 'GDP', 
                        'Curricular units 1st sem (without evaluations)', 'Inflation rate', 'Nacionality', 'Course',
                        'Curricular units 2nd sem (credited)', 'Previous qualification', 'Curricular units 2nd sem (approved)',
                        'Curricular units 1st sem (credited)', 'Application mode', 'Curricular units 2nd sem (enrolled)', 
                        'Curricular units 1st sem (approved)', 'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
                        "Mother's qualification", 'Curricular units 1st sem (evaluations)', "Father's qualification", "Mother's occupation",
                        'Age at enrollment', "Father's occupation"]

train_df[categorical_features] = train_df[categorical_features].astype('category')
test_df[categorical_features] = test_df[categorical_features].astype('category')


numeric_features = list(set(feature_list) - set(binary_features)- set(categorical_features))

assert sorted(feature_list) == sorted(numeric_features + binary_features + categorical_features)

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

sc = StandardScaler()
oe = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', sc, numeric_features),
        ('cat', oe, categorical_features),
        ('bin', 'passthrough', binary_features)
    ])

In [8]:
# 前処理の実行
data_preprocessed = preprocessor.fit_transform(train_df)
test_preprocessed = preprocessor.transform(test_df)

# 疎行列を密行列に変換し、書き込み可能なコピーを作成
if hasattr(data_preprocessed, 'toarray'):
    data_preprocessed = data_preprocessed.toarray()
else:
    data_preprocessed = np.array(data_preprocessed, copy=True)

if hasattr(test_preprocessed, 'toarray'):
    test_preprocessed = test_preprocessed.toarray()
else:
    test_preprocessed = np.array(test_preprocessed, copy=True)



In [9]:
# OneHotEncoderによって生成された特徴量名を取得
encoded_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

# 全ての特徴量名を結合
all_features = numeric_features + list(encoded_features) + binary_features

# 前処理されたデータをDataFrameに変換
df_preprocessed = pd.DataFrame(data_preprocessed, columns=all_features)
test_preprocessed = pd.DataFrame(test_preprocessed, columns=all_features)

In [10]:
X = df_preprocessed.drop('id', axis=1)
y = train_df.loc[:, 'Target']

label_mapping = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
encoded_y = y.map(label_mapping)

In [20]:
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from lightgbm import LGBMClassifier
import numpy as np

# データの準備（例）
# data_preprocessed, encoded_y, test_preprocessed, test_df などを事前に用意する

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, encoded_y, test_size=0.2, random_state=42)

# SMOTEによるオーバーサンプリング
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# LightGBMのモデル定義
lgb_params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(X_train_smote, y_train_smote)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048857 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 44281
[LightGBM] [Info] Number of data points in the train set: 87069, number of used features: 350
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [12]:
# Kerasのニューラルネットワークモデル定義
def create_nn_model(input_dim):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

nn_model = create_nn_model(X_train_smote.shape[1])

# アーリーストッピングのコールバック
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

nn_model.fit(X_train_smote, y_train_smote, epochs=20, batch_size=32, validation_data=(X_val, y_val), verbose=1, callbacks=[early_stopping])

2024-06-27 10:56:53.662974: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-06-27 10:56:53.663002: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-06-27 10:56:53.663016: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-06-27 10:56:53.663040: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-27 10:56:53.663063: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/20


2024-06-27 10:56:54.551897: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m2721/2721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 15ms/step - accuracy: 0.7389 - loss: 0.6419 - val_accuracy: 0.8181 - val_loss: 0.4766
Epoch 2/20
[1m2721/2721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - accuracy: 0.8013 - loss: 0.5241 - val_accuracy: 0.8172 - val_loss: 0.4740
Epoch 3/20
[1m2721/2721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - accuracy: 0.8052 - loss: 0.5078 - val_accuracy: 0.8157 - val_loss: 0.4839
Epoch 4/20
[1m2721/2721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 116ms/step - accuracy: 0.8115 - loss: 0.4920 - val_accuracy: 0.8177 - val_loss: 0.4756
Epoch 5/20
[1m2721/2721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - accuracy: 0.8158 - loss: 0.4809 - val_accuracy: 0.8225 - val_loss: 0.4700
Epoch 6/20
[1m2721/2721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - accuracy: 0.8208 - loss: 0.4724 - val_accuracy: 0.8248 - val_loss: 0.4653
Epoch 7/20
[

<keras.src.callbacks.history.History at 0x2c28950d0>

In [17]:
from tensorflow.keras.callbacks import LambdaCallback

# カスタムクラスを定義してKerasモデルをscikit-learnと統合
class KerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        log_callback = LambdaCallback(on_epoch_end=lambda epoch, logs: print(f"Epoch {epoch+1}: loss = {logs['loss']}, val_loss = {logs['val_loss']}"))
        self.model.fit(X, y, epochs=20, batch_size=32, verbose=1, validation_data=(X_val, y_val), callbacks=[early_stopping, log_callback])
        return self

    def predict(self, X):
        y_pred = self.model.predict(X)
        return np.argmax(y_pred, axis=1)

    def predict_proba(self, X):
        return self.model.predict(X)

keras_clf = KerasClassifier(model=create_nn_model(X_train_smote.shape[1]))
keras_clf.fit(X_train_smote, y_train_smote)

Epoch 1/20
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7704 - loss: 0.5804Epoch 1: loss = 0.5197274088859558, val_loss = 1.0454291105270386
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 16ms/step - accuracy: 0.7704 - loss: 0.5804 - val_accuracy: 0.5764 - val_loss: 1.0454
Epoch 2/20
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8235 - loss: 0.4663Epoch 2: loss = 0.4715453088283539, val_loss = 1.0312035083770752
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 16ms/step - accuracy: 0.8235 - loss: 0.4663 - val_accuracy: 0.5460 - val_loss: 1.0312
Epoch 3/20
[1m2175/2177[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.8229 - loss: 0.4640Epoch 3: loss = 0.46162864565849304, val_loss = 0.9958347678184509
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 15ms/step - accuracy: 0.8229 - loss: 0.4640 - val_accura

In [21]:
# アンサンブルモデルの定義
ensemble_model = VotingClassifier(estimators=[
    ('lgb', lgb_model),
    ('nn', keras_clf)
], voting='soft')

# 検証データでの予測
ensemble_model.fit(X_train_smote, y_train_smote)
y_pred_ensemble = ensemble_model.predict(X_val)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 44281
[LightGBM] [Info] Number of data points in the train set: 87069, number of used features: 350
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Epoch 1/20
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8501 - loss: 0.3969Epoch 1: loss = 0.4001912474632263, val_loss = 0.8521767854690552
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 16ms/step - accuracy: 0.8501 - loss: 0.3969 - val_accuracy: 0.6357 - val_loss: 0.8522
Epoch 2/20
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8498 - loss: 0.3945Epoch 2: loss = 0.

In [22]:
# 評価
accuracy_ensemble = accuracy_score(y_val, y_pred_ensemble)
print(f'Ensemble Validation Accuracy: {accuracy_ensemble}')
print(classification_report(y_val, y_pred_ensemble))

Ensemble Validation Accuracy: 0.8312859383167799
              precision    recall  f1-score   support

           0       0.89      0.84      0.87      5028
           1       0.65      0.60      0.62      3017
           2       0.86      0.92      0.89      7259

    accuracy                           0.83     15304
   macro avg       0.80      0.79      0.79     15304
weighted avg       0.83      0.83      0.83     15304



In [25]:
# テストデータに対する予測
test_preprocessed = test_preprocessed.drop('id', axis=1)
test_pred_ensemble = ensemble_model.predict(test_preprocessed)

# ターゲット値に変換
target_mapping = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
final_predictions = [target_mapping[pred] for pred in test_pred_ensemble]

# 結果をデータフレームに格納
results_df = pd.DataFrame(data={"id": test_df['id'], "Target": final_predictions})

# 結果を保存
results_df.to_csv('submission_ensemble.csv', index=False)
print("Your submission was successfully saved!")

[1m1595/1595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Your submission was successfully saved!
