In [127]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import sparse

In [128]:
# Locations of the feature table and the label table (relative to the working directory).
data_path, labels_path = "train_values.csv", "train_labels.csv"

In [129]:
# Read the feature CSV and then the label CSV into DataFrames.
data_df, labels_df = (pd.read_csv(csv_path) for csv_path in (data_path, labels_path))

In [130]:
# Show the feature table exactly as it was loaded from the CSV file.
data_df

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,n,r,...,0,0,0,0,0,0,0,0,0,0
260597,669485,17,715,2060,2,0,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
260598,602512,17,51,8163,3,55,6,7,t,r,...,0,0,0,0,0,0,0,0,0,0
260599,151409,26,39,1851,2,10,14,6,t,r,...,0,0,0,0,0,0,0,0,0,0


In [131]:
# Names of all object-typed columns; these are the ones that get one-hot encoded later.
categorical_variables = data_df.select_dtypes(include="object").columns
for column_name in categorical_variables:
    print(column_name)

land_surface_condition
foundation_type
roof_type
ground_floor_type
other_floor_type
position
plan_configuration
legal_ownership_status


In [132]:
# One-hot encode every categorical column found above.
data_df = pd.get_dummies(data_df, columns=categorical_variables)

# Turn boolean columns (pandas may emit the dummies as bool) into 0/1 integers in a single cast.
bool_columns = [name for name in data_df.columns if data_df[name].dtype == 'bool']
data_df = data_df.astype({name: int for name in bool_columns})
data_df

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,802906,6,487,12198,2,30,6,5,1,1,...,0,0,0,0,0,0,0,0,1,0
1,28830,8,900,2812,2,10,8,7,0,1,...,0,0,0,0,0,0,0,0,1,0
2,94947,21,363,8973,2,10,5,5,0,1,...,0,0,0,0,0,0,0,0,1,0
3,590882,22,418,10694,2,10,6,5,0,1,...,0,0,0,0,0,0,0,0,1,0
4,201944,11,131,1488,3,30,8,9,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,1,...,0,0,0,1,0,0,0,0,1,0
260597,669485,17,715,2060,2,0,6,5,0,1,...,0,0,0,0,0,0,0,0,1,0
260598,602512,17,51,8163,3,55,6,7,0,1,...,0,0,0,0,0,0,0,0,1,0
260599,151409,26,39,1851,2,10,14,6,0,0,...,0,0,0,0,0,0,0,0,1,0


### Áp dụng hồi quy logistic đa lớp (softmax regression) – cài đặt thủ công

In [133]:
# From here on features and labels are paired purely by row position, so verify that both
# tables list the same buildings in the same order before the join key is thrown away.
if 'building_id' in data_df.columns:
    if not np.array_equal(data_df['building_id'].to_numpy(), labels_df['building_id'].to_numpy()):
        raise ValueError("train_values and train_labels are not aligned on building_id")

# errors='ignore' keeps this cell re-runnable once building_id has already been removed.
data_df = data_df.drop(columns=['building_id'], errors='ignore')
train_labels = labels_df.drop(columns=['building_id'])

In [134]:
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, before the train/test split performed
# further down, so statistics of the held-out rows influence the training features.
scaler = StandardScaler()
scaler.fit(data_df)
data_df_scaled = scaler.transform(data_df)

In [135]:
# Project the standardised features onto the first 10 principal components.
# random_state pins the randomized SVD solver (when scikit-learn selects it) so reruns agree.
pca = PCA(n_components=10, random_state=42)
print(data_df.shape)
data_df_pca = pca.fit_transform(data_df_scaled)
print(data_df_pca.shape)

# Share of the total variance captured by the retained components
explained_variance = pca.explained_variance_ratio_.sum()

# Share of the variance discarded by the projection
lost_variance = 1 - explained_variance

# Report both shares as percentages
print(f"Tỷ lệ phương sai được giải thích: {explained_variance * 100:.2f}%")
print(f"Tỷ lệ phương sai bị mất: {lost_variance * 100:.2f}%")

(260601, 68)
(260601, 10)
Tỷ lệ phương sai được giải thích: 36.45%
Tỷ lệ phương sai bị mất: 63.55%


In [136]:
# Hold out 20% of the rows for evaluation; the label frame is flattened to a 1-D array first.
X_train, X_test, y_train, y_test = train_test_split(
    data_df_pca,
    train_labels.values.flatten(),
    test_size=0.2,
    random_state=42,
)

In [137]:
def convert_labels(y, C):
    """
    One-hot encode a 1-D sequence of class indices, one column per sample.

    Parameters:
        y: class index of every sample (values in 0..C-1).
        C: number of classes, i.e. the number of rows of the result.

    Returns:
        A dense (C, len(y)) array whose i-th column is all zeros except
        for a single 1 in row y[i]. For y = [0, 2, 1, 0] and C = 3:

            [[1, 0, 0, 1],
             [0, 0, 1, 0],
             [0, 1, 0, 0]]
    """
    n_samples = len(y)
    sample_idx = np.arange(n_samples)
    ones = np.ones_like(y)
    # Build the matrix sparsely from (value, (row, column)) triplets, then densify it.
    one_hot = sparse.coo_matrix((ones, (y, sample_idx)), shape=(C, n_samples))
    return one_hot.toarray()

def softmax_stable(Z):
    """
    Column-wise softmax of the score matrix Z.

    Each column of Z holds the scores of one sample. The column maximum is
    subtracted before exponentiating, which leaves the result unchanged
    mathematically but keeps np.exp from overflowing on large scores.

    Returns an array of the same shape as Z whose columns each sum to 1.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    e_Z = np.exp(shifted)
    return e_Z / e_Z.sum(axis=0, keepdims=True)

def softmax(Z):
    """
    Column-wise softmax of the score matrix Z.

    Kept as a separate name for existing callers; it is the same computation
    as softmax_stable, so it delegates instead of duplicating the code.
    """
    return softmax_stable(Z)

# Cost or loss function
def cost(X, Y, W):
    """
    Cross-entropy loss of a softmax model, summed over all samples.

    Parameters:
        X: data matrix with one sample per column (W.T.dot(X) gives the scores).
        Y: one-hot label matrix with one column per sample.
        W: weight matrix; W.T.dot(X) has one row per class.

    Returns:
        -sum(Y * log(softmax(W^T X))) as a scalar.

    The log-probabilities are computed directly as z - logsumexp(z) instead of
    log(softmax(z)): when a probability underflows to exactly 0 the latter gives
    -inf, and 0 * -inf in the masked sum would turn the whole cost into NaN.
    """
    Z = W.T.dot(X)
    # Shifting by the column maximum keeps np.exp from overflowing.
    Z = Z - np.max(Z, axis=0, keepdims=True)
    log_A = Z - np.log(np.sum(np.exp(Z), axis=0, keepdims=True))
    return -np.sum(Y * log_A)

def softmax_regression(X, y, W_init, eta, tol=1e-4, max_count=1000):
    """
    Fit a softmax regression model with stochastic gradient descent.

    Parameters:
        X: data matrix with one sample per column (shape d x N).
        y: class index of every sample, in 0..C-1.
        W_init: initial weight matrix of shape (d, C); C is taken from its second axis.
        eta: learning rate.
        tol: every 20 updates the newest weights are compared with the weights from
            20 updates earlier; training stops when the norm of the difference is below tol.
        max_count: update budget. It is only checked between passes over the data,
            so a pass that has started always runs to its end (unless tol stops it).

    Returns:
        The list of all weight matrices visited, starting with W_init; the last
        element is the final model.
    """
    W = [W_init]
    C = W_init.shape[1]
    N = X.shape[1]
    d = X.shape[0]
    if N == 0:
        # No samples: the pass below would never advance `count` and the outer loop would spin forever.
        return W
    Y = convert_labels(y, C)
    count = 0
    check_w_after = 20

    while count < max_count:
        # Visit the samples in a fresh random order on every pass
        mix_id = np.random.permutation(N)
        for i in mix_id:
            xi = X[:, i].reshape(d, 1)
            yi = Y[:, i].reshape(C, 1)
            ai = softmax(np.dot(W[-1].T, xi))
            # Single-sample gradient step on the cross-entropy loss
            W.append(W[-1] + eta * xi.dot((yi - ai).T))
            count += 1
            # Stopping criterion; the newest iterate is already in W, so it is part of the result
            if count % check_w_after == 0:
                if np.linalg.norm(W[-1] - W[-1 - check_w_after]) < tol:
                    return W

    return W

# Predict the class of every sample in X (classes 1..C are indexed as 0..C-1)
def pred(W, X):
    """
    Return the predicted class index for each sample.

    X holds one sample per ROW (it is transposed before being multiplied by
    W.T). The predicted class of a sample is the position of its largest
    softmax probability, so the result contains values in [0, 1, ..., C-1].
    """
    scores = W.T.dot(X.T)
    probabilities = softmax_stable(scores)
    return np.argmax(probabilities, axis=0)


In [138]:
# Report the size of each split produced by train_test_split.
for label, split in (("Shape of train:", X_train), ("Shape of test:", X_test)):
    print(label, split.shape)

Shape of train: (208480, 10)
Shape of test: (52121, 10)


In [139]:
# Inspect the training features (rows of the PCA-projected data selected by the split).
X_train

array([[-0.1935455 , -0.31859422,  1.43385323, ...,  1.08064615,
         1.1141886 ,  0.21459776],
       [ 0.38847884,  2.75006236, -0.70648176, ...,  0.96451564,
        -1.20083696,  0.9916702 ],
       [-1.26555416, -0.64521633,  1.86717247, ...,  5.82492117,
         2.02938391, -0.59635421],
       ...,
       [ 0.36117954,  1.81559602,  0.13514058, ..., -0.91113967,
        -0.66140412, -1.67592326],
       [-0.86503812,  1.4201035 , -0.04972241, ...,  0.4580674 ,
        -0.38708912, -0.68250442],
       [-1.46176161,  0.23858224, -1.14867359, ...,  0.48329544,
         0.82518892,  0.32175066]])

In [140]:
# Initialize parameters
W_init = np.random.randn(X_train.shape[1], 3)  # 3 classes
eta = 0.01

# Train softmax regression model
W = softmax_regression(X_train.T, y_train-1, W_init, eta)[-1]

# Predict labels for the test set
y_pred = pred(W, X_test)

# Print predictions and true labels
print("Dự đoán:", y_pred)
print("Thực tế:", y_test)

Dự đoán: [2 2 1 ... 1 1 2]
Thực tế: [2 2 2 ... 3 3 2]


In [141]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the hand-written model; the true labels are shifted
# to 0-based indices so that they match the coding of y_pred.
report = classification_report(y_true=y_test - 1, y_pred=y_pred)
print("Classification Report:", report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.67      0.41      5170
           1       0.60      0.25      0.36     29487
           2       0.40      0.64      0.49     17464

    accuracy                           0.42     52121
   macro avg       0.43      0.52      0.42     52121
weighted avg       0.51      0.42      0.41     52121



In [142]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes (both as 0-based indices).
print(confusion_matrix(y_true=y_test - 1, y_pred=y_pred))

[[ 3480   510  1180]
 [ 6562  7455 15470]
 [ 1852  4398 11214]]


### So sánh với mô hình LogisticRegression của thư viện scikit-learn


In [143]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression model (multinomial / softmax, fitted with the SAGA solver).
# - `multi_class` is no longer passed: the argument is deprecated in recent scikit-learn, and
#   for a multiclass target with a non-liblinear solver the default already fits a multinomial model.
# - `random_state` pins SAGA's data shuffling so that the fit is repeatable.
logistic_regression = LogisticRegression(solver='saga', max_iter=1000, random_state=42)

logistic_regression.fit(X_train, y_train)



In [144]:
from sklearn.metrics import classification_report

# Score the scikit-learn model on the held-out split; labels keep their original coding here.
y_pred_test = logistic_regression.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred_test)

print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           1       0.55      0.24      0.33      5170
           2       0.57      0.94      0.71     29487
           3       0.47      0.04      0.08     17464

    accuracy                           0.57     52121
   macro avg       0.53      0.41      0.38     52121
weighted avg       0.54      0.57      0.46     52121

