In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import MDS, TSNE

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

import seaborn as sns
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
train_df = pd.read_csv("../train.csv")
test_df = pd.read_csv("../test.csv")

sub_df = pd.read_csv("../sample_submission.csv")

In [14]:
train_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,76518.0,,,,38258.5,22088.988286,0.0,19129.25,38258.5,57387.75,76517.0
Marital status,76518.0,,,,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,,,,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,,,,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,,,,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,,,,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,,,,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,,,,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,,,,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,,,,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [16]:
train_df.nunique().sort_values()

Gender                                                2
International                                         2
Scholarship holder                                    2
Tuition fees up to date                               2
Debtor                                                2
Daytime/evening attendance                            2
Educational special needs                             2
Displaced                                             2
Target                                                3
Marital status                                        6
Application order                                     8
Unemployment rate                                    11
Curricular units 2nd sem (without evaluations)       11
GDP                                                  11
Curricular units 1st sem (without evaluations)       12
Inflation rate                                       13
Nacionality                                          18
Course                                          

In [17]:
feature_list = [feature for feature in train_df.columns if  feature  != "Target"]

target = "Target"

binary_features = ['Scholarship holder','International','Gender','Tuition fees up to date',
                   'Daytime/evening attendance','Debtor','Educational special needs','Displaced']

#from data set description
categorical_features = ['Marital status', 'Unemployment rate', 'Curricular units 2nd sem (without evaluations)', 'GDP', 
                        'Curricular units 1st sem (without evaluations)', 'Inflation rate', 'Nacionality', 'Course',
                        'Curricular units 2nd sem (credited)', 'Previous qualification', 'Curricular units 2nd sem (approved)',
                        'Curricular units 1st sem (credited)', 'Application mode', 'Curricular units 2nd sem (enrolled)', 
                        'Curricular units 1st sem (approved)', 'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
                        "Mother's qualification", 'Curricular units 1st sem (evaluations)', "Father's qualification", "Mother's occupation",
                        'Age at enrollment', "Father's occupation"]

train_df[categorical_features] = train_df[categorical_features].astype('category')
test_df[categorical_features] = test_df[categorical_features].astype('category')


numeric_features = list(set(feature_list) - set(binary_features)- set(categorical_features))

assert sorted(feature_list) == sorted(numeric_features + binary_features + categorical_features)

In [18]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

sc = StandardScaler()
oe = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', sc, numeric_features),
        ('cat', oe, categorical_features),
        ('bin', 'passthrough', binary_features)
    ])

In [19]:
# 前処理の実行
data_preprocessed = preprocessor.fit_transform(train_df)
test_preprocessed = preprocessor.transform(test_df)

# 疎行列を密行列に変換し、書き込み可能なコピーを作成
if hasattr(data_preprocessed, 'toarray'):
    data_preprocessed = data_preprocessed.toarray()
else:
    data_preprocessed = np.array(data_preprocessed, copy=True)

if hasattr(test_preprocessed, 'toarray'):
    test_preprocessed = test_preprocessed.toarray()
else:
    test_preprocessed = np.array(test_preprocessed, copy=True)



In [20]:
# OneHotEncoderによって生成された特徴量名を取得
encoded_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

# 全ての特徴量名を結合
all_features = numeric_features + list(encoded_features) + binary_features

# 前処理されたデータをDataFrameに変換
df_preprocessed = pd.DataFrame(data_preprocessed, columns=all_features)
test_preprocessed = pd.DataFrame(test_preprocessed, columns=all_features)

In [21]:
X = df_preprocessed.drop('id', axis=1)
y = train_df.loc[:, 'Target']

label_mapping = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
encoded_y = y.map(label_mapping)

In [29]:
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np


# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, encoded_y, test_size=0.2, random_state=42)

# SMOTEによるオーバーサンプリング
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# LightGBMのモデル定義
lgb_params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'device': 'gpu',  # GPUを使用する設定
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

train_data = lgb.Dataset(X_train_smote, label=y_train_smote)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
callbacks = [lgb.early_stopping(stopping_rounds=10)]

lgb_model = lgb.train(lgb_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, callbacks=callbacks)

# Kerasのニューラルネットワークモデル定義
def create_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

nn_model = create_nn_model(X_train_smote.shape[1])
nn_model.fit(X_train_smote, y_train_smote, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# カスタムクラスを定義してKerasモデルをscikit-learnと統合
class KerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        self.model.fit(X, y, epochs=50, batch_size=32, verbose=0)
        return self

    def predict(self, X):
        y_pred = self.model.predict(X)
        return np.argmax(y_pred, axis=1)

    def predict_proba(self, X):
        return self.model.predict(X)

keras_clf = KerasClassifier(model=create_nn_model(X_train_smote.shape[1]))
keras_clf.fit(X_train_smote, y_train_smote)

# アンサンブルモデルの定義
ensemble_model = VotingClassifier(estimators=[
    ('lgb', lgb_model),
    ('nn', keras_clf)
], voting='soft')

# 検証データでの予測
ensemble_model.fit(X_train_smote, y_train_smote)
y_pred_ensemble = ensemble_model.predict(X_val)

# 評価
accuracy_ensemble = accuracy_score(y_val, y_pred_ensemble)
print(f'Ensemble Validation Accuracy: {accuracy_ensemble}')
print(classification_report(y_val, y_pred_ensemble))

# テストデータに対する予測
test_preprocessed = test_preprocessed.drop('id', axis=1)
test_pred_ensemble = ensemble_model.predict(test_preprocessed)

# ターゲット値に変換
target_mapping = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
final_predictions = [target_mapping[pred] for pred in test_pred_ensemble]

# 結果をデータフレームに格納
results_df = pd.DataFrame(data={"id": test_df['id'], "Target": final_predictions})

# 結果を保存
results_df.to_csv('/mnt/data/submission_ensemble.csv', index=False)
print("Your submission was successfully saved!")



[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


LightGBMError: GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1

In [None]:
# テストデータに対する予測
test_preprocessed = test_preprocessed.drop('id', axis=1)
test_pred = ensemble_model.predict(test_preprocessed)

In [None]:
# ターゲット値に変換
target_mapping = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
final_predictions = [target_mapping[np.argmax(pred)] for pred in test_pred]

In [None]:
# 結果をデータフレームに格納
results_df = pd.DataFrame(data={"id": test_df['id'], "Target": final_predictions})

print(results_df.head())

In [None]:
# 結果を保存
results_df.to_csv('submission10.csv', index=False)
print("Your submission was successfully saved!")