In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import MDS, TSNE

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

import seaborn as sns

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, Callback

In [2]:
# Load the training table, the test table and the sample submission (in that order).
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../train.csv", "../test.csv", "../sample_submission.csv")
)

In [3]:
# Summary statistics for every column of train_df, transposed so each row describes one column.
train_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,76518.0,,,,38258.5,22088.988286,0.0,19129.25,38258.5,57387.75,76517.0
Marital status,76518.0,,,,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,,,,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,,,,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,,,,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,,,,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,,,,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,,,,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,,,,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,,,,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0


In [4]:
# Print the column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [5]:
# Number of distinct values per training column, sorted ascending.
train_df.nunique().sort_values()

Gender                                                2
International                                         2
Scholarship holder                                    2
Tuition fees up to date                               2
Debtor                                                2
Daytime/evening attendance                            2
Educational special needs                             2
Displaced                                             2
Target                                                3
Marital status                                        6
Application order                                     8
Unemployment rate                                    11
Curricular units 2nd sem (without evaluations)       11
GDP                                                  11
Curricular units 1st sem (without evaluations)       12
Inflation rate                                       13
Nacionality                                          18
Course                                          

In [6]:
# Number of distinct values per test column, sorted ascending (compare with the training counts).
test_df.nunique().sort_values()

Gender                                                2
International                                         2
Daytime/evening attendance                            2
Scholarship holder                                    2
Tuition fees up to date                               2
Debtor                                                2
Educational special needs                             2
Displaced                                             2
Marital status                                        6
Application order                                     8
Curricular units 2nd sem (without evaluations)       10
GDP                                                  10
Curricular units 1st sem (without evaluations)       11
Unemployment rate                                    12
Inflation rate                                       12
Nacionality                                          18
Curricular units 2nd sem (credited)                  19
Application mode                                

In [7]:
target = "Target"

# Every column except the label; this still contains 'id' at this point.
feature_list = [feature for feature in train_df.columns if feature != target]

# Two-valued flag columns.
binary_features = ['Scholarship holder','International','Gender','Tuition fees up to date',
                   'Daytime/evening attendance','Debtor','Educational special needs','Displaced']

# Columns treated as categorical, following the data set description.
categorical_features = ['Marital status', 'Unemployment rate', 'Curricular units 2nd sem (without evaluations)', 'GDP', 
                        'Curricular units 1st sem (without evaluations)', 'Inflation rate', 'Nacionality', 'Course',
                        'Curricular units 2nd sem (credited)', 'Previous qualification', 'Curricular units 2nd sem (approved)',
                        'Curricular units 1st sem (credited)', 'Application mode', 'Curricular units 2nd sem (enrolled)', 
                        'Curricular units 1st sem (approved)', 'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
                        "Mother's qualification", 'Curricular units 1st sem (evaluations)', "Father's qualification", "Mother's occupation",
                        'Age at enrollment', "Father's occupation"]

train_df[categorical_features] = train_df[categorical_features].astype('category')
test_df[categorical_features] = test_df[categorical_features].astype('category')

# Everything that is neither binary nor categorical is numeric.
# Filter feature_list instead of using a set difference: set ordering of strings
# changes between Python processes, which would reorder the model's input columns
# on every kernel restart.
non_numeric_features = set(binary_features) | set(categorical_features)
numeric_features = [feature for feature in feature_list if feature not in non_numeric_features]

# The three groups must cover every feature exactly once.
assert sorted(feature_list) == sorted(numeric_features + binary_features + categorical_features)

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Standardise the numeric columns.
sc = StandardScaler()

# One-hot encode the categorical columns, keeping every level.
# `drop='first'` is intentionally not used: together with handle_unknown='ignore'
# a category that only appears in the test data is encoded as all zeros, which is
# exactly the encoding of the dropped first category, so unseen levels would be
# silently treated as that category.
oe = OneHotEncoder(handle_unknown='ignore')

# Output column order follows the transformer order: numeric, one-hot, binary.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', sc, numeric_features),
        ('cat', oe, categorical_features),
        ('bin', 'passthrough', binary_features)
    ])

In [9]:
def _to_dense(matrix):
    """Return `matrix.toarray()` when the object offers it, otherwise the object unchanged."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Fit on the training frame only, then apply the fitted transform to the test frame.
# Sparse results are converted to dense arrays.
data_preprocessed = _to_dense(preprocessor.fit_transform(train_df))
test_preprocessed = _to_dense(preprocessor.transform(test_df))



In [10]:
# Names of the columns produced by the fitted one-hot encoder
cat_encoder = preprocessor.named_transformers_['cat']
encoded_features = cat_encoder.get_feature_names_out(categorical_features)

# Full column list in the same order the ColumnTransformer emits them
all_features = [*numeric_features, *encoded_features, *binary_features]

# Wrap the preprocessed training matrix in a labelled DataFrame
df_preprocessed = pd.DataFrame(data_preprocessed, columns=all_features)

In [11]:
# Drop the (scaled) 'id' column so it is not fed to the model
X = df_preprocessed.drop(columns='id')
y = train_df['Target']

# Map the three target labels to integer class ids 0, 1, 2
label_mapping = dict(zip(('Dropout', 'Enrolled', 'Graduate'), range(3)))
encoded_y = y.map(label_mapping)

In [12]:
# Sanity check: one encoded label per training row.
encoded_y.shape

(76518,)

In [13]:
# Build the model: four identical hidden stages of decreasing width,
# each Dense -> BatchNormalization -> ReLU -> Dropout(0.3),
# followed by a 3-way softmax output layer.
hidden_units = (512, 256, 128, 64)

model = tf.keras.models.Sequential()
for units in hidden_units:
    model.add(layers.Dense(units))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))
    model.add(layers.Dropout(0.3))
model.add(layers.Dense(3, activation='softmax'))

2024-06-26 14:41:43.612450: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-06-26 14:41:43.612474: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-06-26 14:41:43.612491: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-06-26 14:41:43.612897: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-26 14:41:43.612922: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [14]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Callback for inspecting the loss values while training progresses
class LossHistory(Callback):
    """Record the training and validation loss reported at the end of each epoch.

    Attributes:
        losses: list of `logs['loss']` values, one entry per finished epoch.
        val_losses: list of `logs['val_loss']` values, one entry per finished
            epoch (None when the key is absent from `logs`).
    """

    def on_train_begin(self, logs=None):
        """Reset both loss lists at the start of a training run."""
        self.losses = []
        self.val_losses = []

    def on_epoch_end(self, batch, logs=None):
        """Append this epoch's 'loss' and 'val_loss' from `logs`.

        Args:
            batch: value passed as the first argument of the epoch-end hook;
                the name is kept for backward compatibility and it is not used here.
            logs: metrics dict for the epoch; None is treated as empty.
        """
        # Avoid a shared mutable default argument; fall back to an empty dict.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))

# Callback configuration
# restore_best_weights=True: without it the model keeps the weights of the last
# epoch run, i.e. up to `patience` epochs after the best validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto',
                               restore_best_weights=True)
# Kept under its own name so it is not overwritten by the object model.fit returns.
loss_history = LossHistory()

# Train the model
# NOTE: validation_split takes the validation rows from the end of X / encoded_y
# (before shuffling), so the split is neither random nor stratified.
epochs = 10
history = model.fit(X, encoded_y, epochs=epochs, validation_split=0.25,
                    callbacks=[early_stopping, loss_history])

Epoch 1/10


2024-06-26 14:41:44.738729: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m1794/1794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 38ms/step - accuracy: 0.7684 - loss: 0.5970 - val_accuracy: 0.8210 - val_loss: 0.4606
Epoch 2/10
[1m1794/1794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 38ms/step - accuracy: 0.8185 - loss: 0.4848 - val_accuracy: 0.8229 - val_loss: 0.4592
Epoch 3/10
[1m1794/1794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 38ms/step - accuracy: 0.8233 - loss: 0.4719 - val_accuracy: 0.8241 - val_loss: 0.4576
Epoch 4/10
[1m1794/1794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 38ms/step - accuracy: 0.8273 - loss: 0.4592 - val_accuracy: 0.8233 - val_loss: 0.4597
Epoch 5/10
[1m1794/1794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 38ms/step - accuracy: 0.8303 - loss: 0.4479 - val_accuracy: 0.8255 - val_loss: 0.4548
Epoch 6/10
[1m1794/1794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 38ms/step - accuracy: 0.8331 - loss: 0.4455 - val_accuracy: 0.8244 - val_loss: 0.4587
Epoch 7/10
[1m

In [15]:
# Wrap the preprocessed test matrix in a labelled DataFrame and drop the 'id' column
test_preprocessed = (
    pd.DataFrame(test_preprocessed, columns=all_features)
    .drop(columns='id')
)

In [16]:
# Model output for every test row
test_pred = model.predict(test_preprocessed)

# Pick the class with the highest softmax output
predicted_classes = test_pred.argmax(axis=1)

[1m1595/1595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step


In [17]:
# Convert predicted class ids back to target labels
target_mapping = dict(enumerate(('Dropout', 'Enrolled', 'Graduate')))
inverse_target_mapping = dict(zip(target_mapping.values(), target_mapping.keys()))
final_predictions = [target_mapping[class_id] for class_id in predicted_classes]

In [18]:
# Pair every test id with its predicted label
submission_columns = {"id": test_df["id"], "Target": final_predictions}
results_df = pd.DataFrame(submission_columns)

print(results_df.head())

      id    Target
0  76518   Dropout
1  76519  Graduate
2  76520  Graduate
3  76521  Graduate
4  76522   Dropout


In [19]:
# Sanity check: (rows, columns) of the submission frame.
results_df.shape

(51012, 2)

In [20]:
# Save the results without the index column
submission_path = 'submission.csv'
results_df.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
