In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv', sep=';')
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [5]:
df.dtypes

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance\t                        int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender        

# processing data

In [6]:
def encode_target(value:str) -> int:
    if value=='Dropout':
        return [1, 0, 0]
    elif value=='Enrolled':
        return [0, 1, 0]
    else:
        return [0, 0, 1]

In [7]:
df.isna().sum(axis=0)

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance\t                      0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

In [8]:
df['Target']= df['Target'].apply(encode_target)

In [9]:
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,"[0, 0, 1]"
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,"[0, 0, 1]"
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,"[0, 0, 1]"


normalization

In [10]:
normalized_data = df.drop(columns=['Target'])
normalized_data = (normalized_data - normalized_data.mean())/normalized_data.std()

In [11]:
normalized_data['Target'] = df['Target']

In [12]:
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,"[0, 0, 1]"
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,"[0, 0, 1]"
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,"[0, 0, 1]"


In [13]:
normalized_data.dtypes

Marital status                                    float64
Application mode                                  float64
Application order                                 float64
Course                                            float64
Daytime/evening attendance\t                      float64
Previous qualification                            float64
Previous qualification (grade)                    float64
Nacionality                                       float64
Mother's qualification                            float64
Father's qualification                            float64
Mother's occupation                               float64
Father's occupation                               float64
Admission grade                                   float64
Displaced                                         float64
Educational special needs                         float64
Debtor                                            float64
Tuition fees up to date                           float64
Gender        

Constructing input & output variable 

In [14]:
X = df.iloc[:, :-1]
X.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0,0.0,0,10.8,1.4,1.74
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,0,6,0,0,0.0,0,10.8,1.4,1.74
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,0,6,6,6,13.0,0,13.9,-0.3,0.79


In [15]:
y = normalized_data.iloc[:, -1]
y.value_counts()

Target
[0, 0, 1]    2209
[1, 0, 0]    1421
[0, 1, 0]     794
Name: count, dtype: int64

In [16]:
X = X.to_numpy()
X.shape

(4424, 36)

In [17]:
y = np.array(y.tolist())
ch_y = np.zeros((y.shape[0], 3))
for i in range(y.shape[0]):
    ch_y[i, y[i]] = 1
y.shape

(4424, 3)

# feature engineeringengineering

In [18]:
# Tính ma trận tương quan
num_features = len(df.columns) - 1
cov = np.zeros((num_features, num_features))
cov = df.iloc[:, :-1].corr()

In [19]:
# tim các tập tương quan mạnh
correlation_pairs = cov.unstack()
strong_correlations = correlation_pairs[(correlation_pairs >= 0.8) & (correlation_pairs < 1)]

In [20]:
strong_correlations = strong_correlations.reset_index()
strong_correlations.columns = ["Feature 1", "Feature 2", "Correlation"]
strong_correlations = strong_correlations[strong_correlations["Feature 1"] < strong_correlations["Feature 2"]]

In [21]:
strong_correlations.head()

Unnamed: 0,Feature 1,Feature 2,Correlation
1,Father's occupation,Mother's occupation,0.910472
2,Curricular units 1st sem (credited),Curricular units 2nd sem (credited),0.944811
3,Curricular units 1st sem (enrolled),Curricular units 2nd sem (enrolled),0.942627
4,Curricular units 1st sem (approved),Curricular units 2nd sem (approved),0.904002
5,Curricular units 1st sem (grade),Curricular units 2nd sem (grade),0.83717


In [22]:
columns_to_drop = ["Father's occupation",
                    "Curricular units 1st sem (credited)",
                   "Curricular units 1st sem (enrolled)", 
                   "Curricular units 1st sem (approved)",
                   "Curricular units 1st sem (grade)"]
df_reduced = df.drop(columns=columns_to_drop)
df_reduced.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,"[0, 0, 1]"
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,"[0, 0, 1]"
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,"[0, 0, 1]"


In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_reduced.iloc[:, :-1])  # Chuẩn hóa đặc trưng

In [24]:
# Mã hóa cột phân loại 
course_unique = sorted(df_reduced["Course"].unique())  # Nếu có cột "Course"
course_mapping = {course: i for i, course in enumerate(course_unique)}
df_reduced["Course"] = df_reduced["Course"].map(course_mapping)

In [25]:
df_reduced.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,1,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
1,1,15,1,10,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,"[0, 0, 1]"
2,1,1,5,4,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
3,1,17,2,14,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,"[0, 0, 1]"
4,2,39,1,2,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,"[0, 0, 1]"


# Softmax Regression

In [26]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

class SoftmaxRegression:
    def __init__(self, epoch: int, lr: float) -> None:
        self.epoch = epoch
        self.lr = lr
        self.losses = []
        self.metrics = []
    
    def softmax(self, z: np.ndarray) -> np.ndarray:
        exp_scores = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
    def loss_fn(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        eps = 1e-9
        return -(y * np.log(y_hat + eps)).sum(axis=1).mean()
    
    def accuracy(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        return (np.argmax(y, axis=1) == np.argmax(y_hat, axis=1)).mean()
    
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        n, d = X.shape
        n_classes = y.shape[1]

        X = np.hstack([np.ones((n, 1)), X])  # Them bias vao X
        self.theta = np.zeros((d + 1, n_classes))

        with tqdm(range(self.epoch)) as pb:
            for e in pb:
                pb.set_description(f"Epoch {e+1}")
                s = self.predict(X)
                s = np.array(s)
                if s.shape != y.shape:
                    raise ValueError(f"Shape mismatch: s.shape = {s.shape}, y.shape = {y.shape}")
                # Tinh gradient va cap nhat theta
                gradient = (1/n) * np.matmul(X.T, (s - y))
                self.theta -= self.lr * gradient
                loss = self.loss_fn(y, s)
                acc = self.accuracy(y, s)
                self.losses.append(loss)
                self.metrics.append(acc)
                pb.set_description(f"Epoch {e+1} - Loss: {loss:.4f}, Acc: {acc:.4f}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        # Du doan xac suat
        if X.shape[1] == self.theta.shape[0] - 1:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        p = self.softmax(np.dot(X, self.theta))
        return np.asarray(p)

    def predict_classes(self, X: np.ndarray) -> np.ndarray:
        # Du doan nhan
        if X.shape[1] == self.theta.shape[0] - 1:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        return np.argmax(self.predict(X), axis=1)


Chia thành 2 tập Train, Test tỉ lệ 8:2

In [27]:
sm_reg = SoftmaxRegression(epoch=1000, lr=0.2)

In [28]:
def split(X, y, train_size = 0.8):
    X = np.array(X)
    y = np.array(y)

    num_samples = X.shape[0]
    train_size = int(train_size * num_samples)

    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    return X_train, X_test, y_train, y_test

In [29]:
X_train, X_test, y_train, y_test = split(X, y, train_size=0.8)

In [30]:
sm_reg.fit(X_train, y_train)

Epoch 1000 - Loss: 14.1942, Acc: 0.3151: 100%|██████████| 1000/1000 [00:05<00:00, 166.86it/s]


In [31]:
# Dự đoán và tính độ chính xác cho tập huấn luyện
y_train_hat = sm_reg.predict(X_train)
acc_train = sm_reg.accuracy(y_train, y_train_hat)
print(f'Training Accuracy: {acc_train:.4f}')

Training Accuracy: 0.5044


In [32]:
# Dự đoán và tính độ chính xác cho tập kiểm tra
y_test_hat = sm_reg.predict(X_test)
acc_test = sm_reg.accuracy(y_test, y_test_hat)
print(f'Test Accuracy: {acc_test:.4f}')

Test Accuracy: 0.4791


# sklearn (3)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [34]:
X = normalized_data.iloc[:, :-1].to_numpy()
y = normalized_data.iloc[:, -1].to_numpy()

In [35]:
y = np.vstack(y) 
print(y.shape)
print(y[:5])

(4424, 3)
[[1 0 0]
 [0 0 1]
 [1 0 0]
 [0 0 1]
 [0 0 1]]


In [36]:
y = np.argmax(y, axis=1) # Chuyển đổi nhãn từ one-hot encoding về nhãn dạng số nguyên

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
softmax_reg = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')


In [39]:
softmax_reg.fit(X_train, y_train)



In [40]:
# Dự đoán trên tập huấn luyện và kiểm tra
y_train_pred = softmax_reg.predict(X_train)
y_test_pred = softmax_reg.predict(X_test)

In [41]:
# Đánh giá độ chính xác
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

In [42]:
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.7785
Test Accuracy: 0.7525
