# MODELING


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold

In [4]:
df = pd.read_csv('/data/xuli.csv')

In [5]:
data = df.drop(columns='course_id')

In [6]:
# Tách dữ liệu đã gán nhãn và chưa gán nhãn
labeled_data = data[data['label'].notna()]
labeled_data

Unnamed: 0,attempts,score_rate,teacher_rank,school_rank,user's_school_rank,sentiment,num_of_user,label
0,1.275709,0.0,4.0,5.000000,3.000000,0.000000,3,2.0
1,2.564507,0.0,5.0,5.000000,2.833333,0.000000,6,2.0
2,2.242843,0.0,4.0,5.000000,2.200000,0.000000,5,1.0
3,1.842890,0.0,4.0,5.000000,3.000000,0.000000,10,2.0
4,2.988910,0.0,4.0,3.000000,2.750000,0.000000,8,2.0
...,...,...,...,...,...,...,...,...
376,1.084794,0.0,4.0,2.000000,1.857590,4.142857,1902,2.0
377,1.611571,0.0,4.0,2.000000,1.535141,0.000000,3602,2.0
378,1.401856,0.0,4.0,2.000000,1.000000,0.000000,1,1.0
379,2.396355,0.0,3.0,2.000000,1.856505,3.731707,3628,2.0


In [7]:
unlabeled_data = data[data['label'].isnull()]
unlabeled_data

Unnamed: 0,attempts,score_rate,teacher_rank,school_rank,user's_school_rank,sentiment,num_of_user,label
324,2.300332,0.825000,3.0,2.000000,1.843256,0.0,820,
326,0.000000,-1.000000,4.0,5.000000,1.879075,0.0,945,
327,2.518933,0.000000,3.0,2.000000,1.710000,4.0,4591,
329,1.000000,0.735557,4.0,2.548461,1.813931,0.0,2067,
330,1.349917,0.553846,4.0,2.000000,2.009379,0.0,825,
...,...,...,...,...,...,...,...,...
3776,0.000000,-1.000000,5.0,5.000000,1.000000,0.0,1,
3777,0.000000,-1.000000,4.0,5.000000,1.000000,0.0,1,
3778,0.000000,-1.000000,5.0,5.000000,1.000000,0.0,1,
3779,1.000000,0.028672,5.0,2.677882,1.673913,0.0,184,


In [8]:
# Tách đặc trưng và nhãn từ dữ liệu đã gán nhãn
X_labeled = labeled_data.drop('label', axis=1)
y_labeled = labeled_data['label']

In [9]:
# Chia dữ liệu đã gán nhãn thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.3, random_state=42, stratify=y_labeled)

In [10]:
X_test

Unnamed: 0,attempts,score_rate,teacher_rank,school_rank,user's_school_rank,sentiment,num_of_user
224,0.000000,-1.000000,4.0,3.921929,1.802632,0.0,76
211,2.409624,0.000000,4.0,2.000000,1.861223,0.0,2716
283,1.000000,0.693878,4.0,1.234272,1.977404,0.0,1693
322,0.000000,-1.000000,3.0,2.000000,1.000000,0.0,1
132,2.226238,0.100000,3.0,2.530853,1.808767,0.0,2072
...,...,...,...,...,...,...,...
120,1.344404,0.000000,5.0,5.000000,1.000000,0.0,1
356,0.000000,-1.000000,4.0,4.910388,1.750000,0.0,4
60,1.075292,0.000000,4.0,5.000000,2.464494,0.0,9012
50,2.825250,0.000000,4.0,5.000000,2.000000,0.0,2


In [11]:
# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Thiết lập các giá trị tham số để tìm kiếm
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear', 'poly']
}

In [13]:
# Tạo mô hình SVM
svm = SVC(random_state=42)

In [14]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
grid_search = GridSearchCV(svm, param_grid, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [16]:
# In ra các siêu tham số tốt nhất
print(f'Best parameters found: {grid_search.best_params_}')
best_model = grid_search.best_estimator_

Best parameters found: {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}


In [17]:
# Đánh giá mô hình sử dụng K-Fold Cross-Validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=kf)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Average Cross-Validation Score: {cv_scores.mean()}')

Cross-Validation Scores: [0.8490566  0.88461538 0.82692308 0.92307692 0.88461538]
Average Cross-Validation Score: 0.873657474600871


In [18]:
# Pseudo-labeling
X_unlabeled = unlabeled_data.drop('label', axis=1)
X_unlabeled_scaled = scaler.transform(X_unlabeled)

In [19]:
iterations = 5
for i in range(iterations):
    # Dự đoán nhãn giả định
    pseudo_labels = best_model.predict(X_unlabeled_scaled)
    confidence_scores = best_model.decision_function(X_unlabeled_scaled)

    # Lọc nhãn giả định có độ tin cậy cao
    high_confidence_mask = np.max(confidence_scores, axis=1) > 1.0
    X_pseudo_labeled = X_unlabeled_scaled[high_confidence_mask]
    y_pseudo_labeled = pseudo_labels[high_confidence_mask]

    # Kết hợp dữ liệu đã gán nhãn và dữ liệu nhãn giả định có độ tin cậy cao
    X_combined = np.vstack((X_train_scaled, X_pseudo_labeled))
    y_combined = np.hstack((y_train, y_pseudo_labeled))

    # Huấn luyện lại mô hình tốt nhất trên dữ liệu kết hợp
    best_model.fit(X_combined, y_combined)

    # Đánh giá mô hình sử dụng K-Fold Cross-Validation sau mỗi lần lặp
    cv_scores = cross_val_score(best_model, X_combined, y_combined, cv=kf)
    print(f'Iteration {i+1}, Cross-Validation Scores: {cv_scores}')
    print(f'Iteration {i+1}, Average Cross-Validation Score: {cv_scores.mean()}')

Iteration 1, Cross-Validation Scores: [0.98228883 0.99182561 0.99318801 0.99863574 0.98908595]
Iteration 1, Average Cross-Validation Score: 0.9910048287988223
Iteration 2, Cross-Validation Scores: [0.98637602 0.98910082 0.99455041 1.         0.98635744]
Iteration 2, Average Cross-Validation Score: 0.991276936630844
Iteration 3, Cross-Validation Scores: [0.98773842 0.98910082 0.99455041 0.99863574 0.99181446]
Iteration 3, Average Cross-Validation Score: 0.9923679700830078
Iteration 4, Cross-Validation Scores: [0.98773842 0.99318801 0.99182561 1.         0.99181446]
Iteration 4, Average Cross-Validation Score: 0.992913300943084
Iteration 5, Cross-Validation Scores: [0.98637602 0.99318801 0.99318801 0.99727149 0.98772169]
Iteration 5, Average Cross-Validation Score: 0.9915490444628657


In [20]:
# Đánh giá cuối cùng trên tập kiểm tra
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Final Test Accuracy: {test_accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_test_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_test_pred))

Final Test Accuracy: 0.911504424778761
Classification Report:
              precision    recall  f1-score   support

         1.0       0.84      0.94      0.89        34
         2.0       0.89      0.85      0.87        39
         3.0       1.00      0.95      0.97        40

    accuracy                           0.91       113
   macro avg       0.91      0.91      0.91       113
weighted avg       0.92      0.91      0.91       113

Confusion Matrix:
[[32  2  0]
 [ 6 33  0]
 [ 0  2 38]]


In [27]:
import joblib

# Lưu xuống mô hình tốt nhất
joblib.dump(best_model, 'best_model.joblib')

['best_model.joblib']

In [22]:
X_test['label'] = y_test_pred
X_test.shape

(113, 8)