In [None]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np

# Load the raw dataset, separate features from labels, and integer-encode the labels.
path = 'archive/dataset.csv'
df = pd.read_csv(path)

# Feature matrix: every column except 'Target', as a NumPy array.
X = df.drop(columns='Target').values

# Label vector: LabelEncoder is fitted on the raw 'Target' values and returns integer codes.
le = LabelEncoder()
y = le.fit_transform(df['Target'].values)

# NOTE(review): the previous trailing comment listed three shapes, but this call
# prints the encoded label array itself, not any shape.
print(y)

In [None]:
# Show the first rows of the frame currently bound to `df`.
df.head()

In [None]:

# One-hot encode the binary columns of the dataset and write the result to disk.
# `path` is whatever an earlier cell last assigned to it.
df = pd.read_csv(path)

# NOTE(review): a later cell spells the first entry 'Daytime_evening_attendance';
# confirm which spelling matches the CSV header.
binary_cols = [
    'Daytime/evening attendance',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# drop_first=False keeps one indicator column per distinct value of each listed column.
# (The former `df[binary_cols] = df[binary_cols]` self-assignment was a no-op and is gone.)
df = pd.get_dummies(df, columns=binary_cols, drop_first=False)
df.to_csv('archive/dataset_binary_encoded.csv', index=False)
df.head()

In [None]:
# Load the full dataset together with the train and test CSVs.
# `path` is rebound on every iteration, so it ends up naming the test-set file.
frames = []
for path in ('archive/dataset.csv', 'train_set.csv', 'test_set.csv'):
    frames.append(pd.read_csv(path))
df, train_df, test_df = frames

# (rows, columns) of each frame, in the order: full, train, test.
print(df.shape, train_df.shape, test_df.shape)

In [None]:
# Count how often each class occurs in y_test and express the counts as percentages.
# NOTE(review): y_test is not assigned anywhere in the visible cells; it must be
# defined before this cell can run on a fresh kernel.
unique, counts = np.unique(y_test, return_counts=True)
class_distribution = {label: count for label, count in zip(unique, counts)}
print("Class distribution in y_test:", class_distribution)

# Share of each class, in percent of all samples.
total = sum(counts)
class_percentage = dict(
    (k, v / total * 100) for k, v in class_distribution.items()
)
print("Class percentage in y_test:", class_percentage)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Basic inspection of the raw dataset: shape, column info and missing-value counts.

# Select a sans-serif font for the Chinese titles/labels used in the plots.
# NOTE(review): assumes 'Microsoft JhengHei' is installed on the machine running this.
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei']
df = pd.read_csv('archive/dataset.csv')

print("資料筆數與欄位數：", df.shape)
print("\n欄位資訊：")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
df.info()
print("\n缺失值統計：")
print(df.isnull().sum())

In [None]:
# Bar chart of the 'Target' column, bars ordered by descending frequency.
fig, ax = plt.subplots(figsize=(10, 6))
target_order = df['Target'].value_counts().index
sns.countplot(data=df, x='Target', order=target_order, ax=ax)
ax.set_title('Target欄位的類別分佈')
ax.set_xlabel('類別')
ax.set_ylabel('數量')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Descriptive statistics for every numeric-dtype column, one row per column.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
print("\n數值欄位描述統計：")
numeric_summary = df[num_cols].describe().transpose()
print(numeric_summary)

In [None]:
# Declare the three column groups used by the analysis cells and sanity-check
# them against the columns actually present in the dataset.
df = pd.read_csv('archive/dataset.csv')

# NOTE(review): an earlier cell spells the second entry 'Daytime/evening attendance';
# the check at the bottom of this cell reports whichever names the CSV lacks.
binary_cols = [
    'Gender',
    'Daytime_evening_attendance',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Scholarship holder',
    'International',
]

categorical_cols = [
    'Marital status',
    'Application mode',
    'Course',
    'Previous qualification',
    'Nacionality',
    'Mother qualification',
    'Father qualification',
    'Mother occupation',
    'Father occupation',
]

numeric_cols = [
    'Application order',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# If the two counts below differ, some name is listed more than once.
all_cols = [*binary_cols, *categorical_cols, *numeric_cols]
print("欄位總數：", len(all_cols))
print("欄位不重複數量：", len(set(all_cols)))

# Size of each group.
print(f"Binary columns: {len(binary_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")
print(f"Numeric columns: {len(numeric_cols)}")

# Report every listed name that is not a column of df.
known_columns = set(df.columns)
for col in all_cols:
    if col in known_columns:
        continue
    print(f"欄位 {col} 不在資料集中")

In [None]:
import os
import matplotlib.pyplot as plt

# Save one bar chart of value counts per binary column into the data_analyze folder.
base = 'data_analyze'
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, replacing the separate exists()-then-makedirs() check.
os.makedirs(base, exist_ok=True)

for col in binary_cols:
    fig, ax = plt.subplots()
    df[col].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(f"{col}的類別分佈")
    fig.savefig(os.path.join(base, f"count_{col}.png"))
    # Close each figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)


In [None]:
# Save one count plot per categorical column into the data_analyze folder,
# with the bars ordered by descending frequency.
for col in categorical_cols:
    fig, ax = plt.subplots()
    order = df[col].value_counts().index
    sns.countplot(data=df, x=col, order=order, ax=ax)
    ax.set_title(f"{col}的類別分佈")
    fig.savefig(os.path.join(base, f"count_{col}.png"))
    plt.close(fig)

In [None]:
# Save one 30-bin histogram (with KDE overlay) per numeric column into the data_analyze folder.
df = pd.read_csv('archive/dataset.csv')

for col in numeric_cols:
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=col, bins=30, kde=True, ax=ax)
    ax.set_title(f"{col}的分佈")
    fig.savefig(os.path.join(base, f"hist_{col}.png"))
    plt.close(fig)

In [None]:
# Cross-tabulate 'Tuition fees up to date' against 'Target', convert each row to
# percentages (each row sums to 100), print the table, then draw and save it as a heatmap.
cross_tab = pd.crosstab(
    df['Tuition fees up to date'], df['Target'], normalize='index'
) * 100
print(cross_tab)

# Heatmap of the row percentages, annotated with one decimal place.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cross_tab, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
ax.set_title("Tuition fees up to date vs Target")
ax.set_xlabel("Target")
ax.set_ylabel("Tuition fees up to date")
fig.savefig(os.path.join(base, "tuition_fees_vs_target.png"))
plt.show()

In [None]:
# Count plot of 'Target' split by 'Debtor'. Each bar is labelled with its share
# of that Target class (bar height / number of rows with that Target value).
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='Target', hue='Debtor',
                   order=df['Target'].value_counts().index)
plt.title("Target Count by Debtor Status")
plt.xlabel("Target")
plt.ylabel("Count")
plt.legend(title="Debtor", loc="upper right")

# Rows per Target value, keyed by str so they can be matched to the x tick labels.
total = df['Target'].value_counts()
total_str = {str(k): int(v) for k, v in total.items()}

xticks = np.array(ax.get_xticks())
xtick_labels = [tl.get_text() for tl in ax.get_xticklabels()]

for p in ax.patches:
    height = p.get_height()
    # Patches with no count (empty Target/hue combinations, or zero-height
    # rectangles seaborn may add) would only produce "0.0%"/"nan%" labels.
    if not np.isfinite(height) or height <= 0:
        continue
    x_center = p.get_x() + p.get_width() / 2.0
    # A bar belongs to the Target class whose tick is closest to the bar's centre.
    idx = int(np.argmin(np.abs(xticks - x_center)))
    target_label = xtick_labels[idx]
    denom = total_str.get(target_label, 1)
    pct = (height / denom) if denom > 0 else 0
    ax.annotate(f"{pct:.1%}", (x_center, height),
                ha='center', va='bottom', fontsize=9, xytext=(0, 2), textcoords='offset points')

# Apply the layout before saving so the PNG matches what is displayed
# (previously the file was written before tight_layout() ran).
plt.tight_layout()
plt.savefig(os.path.join(base, "Target Count by Debtor Status.png"))
plt.show()


In [None]:
# Count plot of 'Target' split by 'Scholarship holder'. Each bar is labelled with
# its share of that Target class (bar height / number of rows with that Target value).
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='Target', hue='Scholarship holder',
                   order=df['Target'].value_counts().index)
plt.title("Target Count by Scholarship Holder Status")
plt.xlabel("Target")
plt.ylabel("Count")
plt.legend(title="Scholarship Holder", loc="upper right")

# Rows per Target value, keyed by str so they can be matched to the x tick labels.
total = df['Target'].value_counts()
total_str = {str(k): int(v) for k, v in total.items()}

xticks = np.array(ax.get_xticks())
xtick_labels = [tl.get_text() for tl in ax.get_xticklabels()]

for p in ax.patches:
    height = p.get_height()
    # Patches with no count (empty Target/hue combinations, or zero-height
    # rectangles seaborn may add) would only produce "0.0%"/"nan%" labels.
    if not np.isfinite(height) or height <= 0:
        continue
    x_center = p.get_x() + p.get_width() / 2.0
    # A bar belongs to the Target class whose tick is closest to the bar's centre.
    idx = int(np.argmin(np.abs(xticks - x_center)))
    target_label = xtick_labels[idx]
    denom = total_str.get(target_label, 1)
    pct = (height / denom) if denom > 0 else 0
    ax.annotate(f"{pct:.1%}", (x_center, height),
                ha='center', va='bottom', fontsize=9, xytext=(0, 2), textcoords='offset points')

# Apply the layout before saving so the PNG matches what is displayed
# (previously the file was written before tight_layout() ran).
plt.tight_layout()
plt.savefig(os.path.join(base, "Target Count by Scholarship Holder Status.png"))
plt.show()


In [None]:
# Pearson correlation between the numeric columns, drawn as a masked heatmap
# and saved into the data_analyze folder.
corr = df[numeric_cols].corr(method='pearson')

# mask=True hides a cell, so this hides the upper triangle including the diagonal
# and leaves the lower triangle visible.
# NOTE(review): the original comment asked to keep the upper triangle; confirm
# which half is intended (np.tril would show the upper one).
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 8))
heatmap_style = dict(annot=True, fmt=".2f", cmap="coolwarm", square=True,
                     cbar_kws={"shrink": .8})
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_style)
ax.set_title("Pearson Correlation Coefficient")
fig.savefig(os.path.join(base, "Pearson Correlation Coefficient on numeric_cols.png"))
plt.show()

In [None]:
# Spearman rank correlation between the numeric columns, drawn as a masked heatmap
# and saved into the data_analyze folder.
corr = df[numeric_cols].corr(method='spearman')

# mask=True hides a cell, so this hides the upper triangle including the diagonal
# and leaves the lower triangle visible.
# NOTE(review): the original comment asked to keep the upper triangle; confirm
# which half is intended (np.tril would show the upper one).
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 8))
heatmap_style = dict(annot=True, fmt=".2f", cmap="coolwarm", square=True,
                     cbar_kws={"shrink": .8})
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_style)
ax.set_title("Spearman Correlation Coefficient")
fig.savefig(os.path.join(base, "Spearman Correlation Coefficient on numeric_cols.png"))
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Preprocess the raw dataset: one-hot encode the binary and categorical columns,
# then min-max scale the numeric columns.

# Relative path, matching how the other cells locate the dataset; the previous
# absolute path only resolved on one specific user's machine.
path = 'archive/dataset.csv'

binary_cols =['Gender','Daytime_evening_attendance','Displaced','Educational special needs','Debtor'
              ,'Tuition fees up to date','Scholarship holder','International']

categorical_cols = ['Marital status','Application mode','Course'
                    ,'Previous qualification','Nacionality','Mother qualification'
                    ,'Father qualification','Mother occupation'
                    ,'Father occupation']

numeric_cols =['Application order','Age at enrollment','Curricular units 1st sem (credited)'
               ,'Curricular units 1st sem (enrolled)','Curricular units 1st sem (evaluations)',
               'Curricular units 1st sem (approved)','Curricular units 1st sem (grade)',
               'Curricular units 1st sem (without evaluations)',
               'Curricular units 2nd sem (credited)','Curricular units 2nd sem (enrolled)',
               'Curricular units 2nd sem (evaluations)','Curricular units 2nd sem (approved)',
               'Curricular units 2nd sem (grade)','Curricular units 2nd sem (without evaluations)',
               'Unemployment rate','Inflation rate','GDP']

df = pd.read_csv(path)
print(f"Original DataFrame shape: {df.shape}")

# (1) One-hot encode the binary columns. They are cast to str first, and
#     drop_first=True drops the first level of each column.
df[binary_cols] = df[binary_cols].astype(str)
df = pd.get_dummies(df, columns=binary_cols, drop_first=True)

# (2) One-hot encode the multi-valued categorical columns, again dropping the first level.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# (3) Min-max scale the numeric columns.
# NOTE(review): the scaler is fitted on the whole dataset; if this frame is split
# into train/test afterwards, fit on the training part only to avoid leakage.
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print(f"Transformed DataFrame shape: {df.shape}")
df.head()

In [None]:
# Reload the dataset and show the 'Target' labels before and after integer encoding.
df = pd.read_csv('archive/dataset.csv')

# Raw label values as they appear in the CSV.
raw_labels = df['Target'].to_numpy()
print(raw_labels)

# Integer codes produced by LabelEncoder for the same labels.
le = LabelEncoder()
y = le.fit_transform(raw_labels)
print(y)

In [1]:
from Modle import MLP

# Instantiate the project's MLP for 3 output classes and print its layer summary.
# `intput_dim` is the keyword spelling this call uses for the input width.
# NOTE(review): 236 presumably equals the number of feature columns after
# preprocessing; confirm against the transformed frame.
model = MLP(num_classes=3, intput_dim=236)
print(model)

MLP(
  (fc1): Linear(in_features=236, out_features=256, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (act1): ReLU()
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (act2): ReLU()
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (dropout3): Dropout(p=0.3, inplace=False)
  (act3): ReLU()
  (fc4): Linear(in_features=64, out_features=3, bias=True)
)


In [None]:
# Build the path of the loss-curve image saved for the best fold of an experiment.
task = 'different_optim'
# Resolve the experiment folder against the working directory instead of a
# hard-coded absolute user path, so the cell works on any machine. This assumes
# the notebook runs from the project folder, as the relative 'archive/...' reads do.
# NOTE(review): this rebinds `base`, which earlier cells use for the figure folder.
base = os.path.join(os.getcwd(), task)
print(base)
best_fold = 2
path = os.path.join(base, f"loss_curve_best_fold_{best_fold}.png")
print(path)

In [None]:
# Print the model's layer summary again; relies on `model` from an earlier cell.
print(model)