In [128]:
# !python -m pip uninstall scikit-learn --yes

In [129]:
# pip install scikit-learn==1.2.2 --user

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [131]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [132]:
# import sklearn
# print(sklearn.__version__)

In [133]:
from imblearn.over_sampling import SMOTENC
from sklearn.utils import resample

In [134]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [135]:
file_path = './data/data_iu_proccessed_p3.csv'
# file_path = './data/data_iu_proccessed_p3_v2.csv'

# Read the CSV file with specified data types
df = pd.read_csv(file_path)

In [136]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,TenMH,MaMH,Major,SoTinChi,SiSoTKB,ClassifyAs,TKB_LT2,TKB_LT3,TKB_LT4,...,TKB_LT7,TKB_TH2,TKB_TH3,TKB_TH4,TKB_TH5,TKB_TH6,TKB_TH7,HK,GVLT,GVTT
0,0,Kinh tế hành vi và chính sách công,PM5110,BA BA,2,50.0,4,0,0,0,...,0,0,0,0,0,0,0,2,Nguyen Van Phuong,unassigned
1,1,Kinh tế hành vi và chính sách công,PM5110,BA BA,2,61.0,2,0,0,0,...,0,0,0,0,0,0,0,2,Nguyen Van Phuong,unassigned
2,2,Kinh tế hành vi và chính sách công,PM5110,BA BA,2,14.0,1,0,0,0,...,7,0,0,0,0,0,0,3,Nguyen Van Phuong,unassigned
3,3,Kinh tế lượng và phân tích chính sách,PM5214,BA BA,2,50.0,4,0,0,0,...,0,0,0,0,0,0,0,2,Nguyen Phuong Anh,unassigned
4,4,Kỹ năng lãnh đạo trong khu vực công,PM5212,BA BA,2,41.0,2,0,0,0,...,7,0,0,0,0,0,0,1,Trinh Viet Dung,unassigned


In [137]:
# Remove the specified columns
columns_to_remove = ['Unnamed: 0', 'TenMH']  # Columns to be removed
df.drop(columns=columns_to_remove, inplace=True)
df.dtypes

MaMH           object
Major          object
SoTinChi        int64
SiSoTKB       float64
ClassifyAs      int64
TKB_LT2         int64
TKB_LT3         int64
TKB_LT4         int64
TKB_LT5         int64
TKB_LT6         int64
TKB_LT7         int64
TKB_TH2         int64
TKB_TH3         int64
TKB_TH4         int64
TKB_TH5         int64
TKB_TH6         int64
TKB_TH7         int64
HK              int64
GVLT           object
GVTT           object
dtype: object

In [138]:
df.shape

(8332, 20)

In [139]:
#-----------4 Classes--------------
# #Create a dictionary to map values
# class_mapping = {1: 'Class1', 2: 'Class2', 3: 'Class3', 4: 'Class4'}

# # Map the values in the 'ClassifyAs' column
# df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

# #-----------2 Classes (Over90)--------------
# # Create a dictionary to map values
# class_mapping = {1: 'Class1', 2: 'Class1', 3: 'Class2', 4: 'Class2'}

# # Map the values in the 'ClassifyAs' column
# df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

#---------- 2 Classes (TR89-Spread)-------------
# Create a dictionary to map values
class_mapping = {1: 'Class1', 2: 'Class2', 3: 'Class1', 4: 'Class2'}

# Map the values in the 'ClassifyAs' column
df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

In [140]:
# Count the occurrences of each class
class_counts = df['ClassifyAs'].value_counts()

# Print the counts
print(class_counts)

ClassifyAs
Class2    6553
Class1    1779
Name: count, dtype: int64


In [141]:
#===========================================================================================================
#-------------------------Normal / Class Weight-------------------------------------------------------------
#===========================================================================================================

# Split the data into features (X) and target (y)
X = df.drop(columns=['ClassifyAs'])  # Features
y = df['ClassifyAs']  # Target

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define custom class weights
# Total number of samples
# total_samples = 1050 + 3437 + 751 + 3749

# # Calculate class weights
# weight_class_1 = total_samples / 1050
# weight_class_2 = total_samples / 3437
# weight_class_3 = total_samples / 751
# weight_class_4 = total_samples / 3749

# for classifier use Logloss, CrossEntropy, MultiClass, MultiClassOneVsAll
# Create CatBoost classifier
catboost_clf = CatBoostClassifier(iterations=1000,
                                   depth=4, 
                                   learning_rate=0.05,
#                                    l2_leaf_reg=1,
                                   loss_function='MultiClass',
#                                    loss_function='Logloss',
                                   eval_metric='Accuracy',
                                   cat_features=['MaMH', 'Major', 'GVLT', 'GVTT'],  # Specify high cardinality categorical features
                                   auto_class_weights='Balanced',  # Automatically balance class weights
#                                    class_weights=[weight_class_1, weight_class_2, weight_class_3, weight_class_4],  # Manually adjust class weights
                                   random_seed=42)

# Fit the model
catboost_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = catboost_clf.predict(X_test)

accuracy = catboost_clf.score(X_test, y_test)
print("Model Accuracy:", accuracy)

#-------------Cross Validate-------------------------------------------------------------

# # Create CatBoost classifier
# catboost_clf = CatBoostClassifier(iterations=500,
#                                    depth=6, 
#                                    learning_rate=0.2,
#                                    loss_function='Logloss', 
#                                    eval_metric='Accuracy',
#                                    cat_features=['MaMH', 'Major', 'GVLT', 'GVTT'],  
#                                    random_seed=42)

# # Perform cross-validation
# cv_scores = cross_val_score(catboost_clf, X_train, y_train, cv=5, scoring='accuracy')  # You can adjust cv (number of folds) as needed

# # Print cross-validation scores
# print("Cross-Validation Scores:", cv_scores)
# print("Mean Accuracy:", np.mean(cv_scores))

#===========================================================================================================
#------------------SMOTE-N----------------------------------------------------------------------------------
#===========================================================================================================

# # Step 1: Split the dataset into features (X) and target variable (y)
# X = df.drop(columns=['ClassifyAs'])  # Features
# y = df['ClassifyAs']  # Target

# # Step 2: Split the features and target into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Step 3: Apply SMOTE-N to balance the training data
# categorical_indices = [index for index, column in enumerate(X_train.columns) if column in ['MaMH', 'Major', 'GVLT', 'GVTT']]
# smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
# X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

# # Step 4: Train the model on the balanced training data
# catboost_clf = CatBoostClassifier(iterations=500,
#                                    depth=6, 
#                                    learning_rate=0.2,
# #                                    loss_function='MultiClass',
#                                    loss_function='Logloss',
#                                    grow_policy='Lossguide',
#                                    eval_metric='Accuracy',
#                                    cat_features=['MaMH', 'Major', 'GVLT', 'GVTT'],  
#                                    random_seed=42)
# catboost_clf.fit(X_resampled, y_resampled)

# # Step 5: Predict on the testing data
# y_pred = catboost_clf.predict(X_test)

# # Step 6: Evaluate the model
# accuracy = catboost_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

#===========================================================================================================
#------------------Oversampling Minor Class (2 Classes)-----------------------------------------------------
#===========================================================================================================

# # Split the data into features (X) and target (y)
# X = df.drop(columns=['ClassifyAs'])  # Features
# y = df['ClassifyAs']  # Target

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Combine the features and target into a single DataFrame for the training data
# train_data = pd.concat([X_train, y_train], axis=1)

# # Separate majority and minority classes
# majority_class = train_data[train_data['ClassifyAs'] == 'Class2'] 
# minority_class = train_data[train_data['ClassifyAs'] == 'Class1']

# # Upsample minority class
# minority_upsampled = resample(minority_class,
#                               replace=True,     # Sample with replacement
#                               n_samples=len(majority_class),    # Match number of samples in majority class
#                               random_state=42)  # Set random state for reproducibility

# # Combine majority class and upsampled minority class
# train_upsampled = pd.concat([majority_class, minority_upsampled])

# # Separate features and target after oversampling
# X_train_resampled = train_upsampled.drop(columns=['ClassifyAs'])
# y_train_resampled = train_upsampled['ClassifyAs']

# # Create CatBoost classifier with balanced class weights
# catboost_clf = CatBoostClassifier(iterations=5000,
#                                    depth=6, 
#                                    learning_rate=0.1,
#                                    loss_function='Logloss',
#                                    eval_metric='Accuracy',
#                                    cat_features=['MaMH', 'Major', 'GVLT', 'GVTT'],  # Specify high cardinality categorical features
#                                    auto_class_weights='Balanced',  # Automatically balance class weights
#                                    random_seed=42)

# # Fit the model on the resampled data
# catboost_clf.fit(X_train_resampled, y_train_resampled)

# # Predict on the test set
# y_pred = catboost_clf.predict(X_test)

# # Evaluate model accuracy
# accuracy = catboost_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

#===========================================================================================================
#------------------Oversampling Minor Class (4 Classes)-----------------------------------------------------
#===========================================================================================================

# # Split the data into features (X) and target (y)
# X = df.drop(columns=['ClassifyAs'])  # Features
# y = df['ClassifyAs']  # Target

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=29)

# # Combine the features and target into a single DataFrame for the training data
# train_data = pd.concat([X_train, y_train], axis=1)

# # Separate majority and minority classes
# majority_class = train_data[train_data['ClassifyAs'] == 'Class4']  # Assuming 'Class4' is the majority class
# minority_classes = [train_data[train_data['ClassifyAs'] == cls] for cls in ['Class1', 'Class2', 'Class3']]

# # Upsample each minority class individually
# minority_upsampled = pd.concat([resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)
#                                 for minority_class in minority_classes])

# # Combine majority class and upsampled minority classes
# train_upsampled = pd.concat([majority_class] + [minority_upsampled])

# # Separate features and target after oversampling
# X_train_resampled = train_upsampled.drop(columns=['ClassifyAs'])
# y_train_resampled = train_upsampled['ClassifyAs']

# # Create CatBoost classifier with balanced class weights
# catboost_clf = CatBoostClassifier(iterations=5000,
#                                   depth=6,
#                                   learning_rate=0.1,
#                                   loss_function='MultiClass',
#                                   eval_metric='Accuracy',
#                                   cat_features=['MaMH', 'Major', 'GVLT', 'GVTT'],  # Specify high cardinality categorical features
#                                   auto_class_weights='Balanced',  # Automatically balance class weights
#                                   random_seed=42)

# # Fit the model on the resampled data
# catboost_clf.fit(X_train_resampled, y_train_resampled)

# # Predict on the test set
# y_pred = catboost_clf.predict(X_test)

# # Evaluate model accuracy
# accuracy = catboost_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

0:	learn: 0.6687136	total: 12.1ms	remaining: 12.1s
1:	learn: 0.6685017	total: 25.8ms	remaining: 12.9s
2:	learn: 0.6680610	total: 38.4ms	remaining: 12.8s
3:	learn: 0.6671392	total: 52.5ms	remaining: 13.1s
4:	learn: 0.6716640	total: 65ms	remaining: 12.9s
5:	learn: 0.6718170	total: 78.2ms	remaining: 12.9s
6:	learn: 0.6724092	total: 93.2ms	remaining: 13.2s
7:	learn: 0.6725050	total: 109ms	remaining: 13.5s
8:	learn: 0.6734823	total: 119ms	remaining: 13.1s
9:	learn: 0.6746514	total: 132ms	remaining: 13s
10:	learn: 0.6750534	total: 145ms	remaining: 13s
11:	learn: 0.6739802	total: 158ms	remaining: 13s
12:	learn: 0.6739601	total: 171ms	remaining: 13s
13:	learn: 0.6734436	total: 184ms	remaining: 12.9s
14:	learn: 0.6743049	total: 197ms	remaining: 12.9s
15:	learn: 0.6740559	total: 210ms	remaining: 12.9s
16:	learn: 0.6736152	total: 223ms	remaining: 12.9s
17:	learn: 0.6740559	total: 238ms	remaining: 13s
18:	learn: 0.6745925	total: 253ms	remaining: 13.1s
19:	learn: 0.6751106	total: 267ms	remaining: 1

164:	learn: 0.7113353	total: 2.43s	remaining: 12.3s
165:	learn: 0.7112395	total: 2.44s	remaining: 12.3s
166:	learn: 0.7102050	total: 2.46s	remaining: 12.3s
167:	learn: 0.7105885	total: 2.47s	remaining: 12.2s
168:	learn: 0.7104926	total: 2.49s	remaining: 12.2s
169:	learn: 0.7107802	total: 2.5s	remaining: 12.2s
170:	learn: 0.7105885	total: 2.51s	remaining: 12.2s
171:	learn: 0.7105885	total: 2.53s	remaining: 12.2s
172:	learn: 0.7116045	total: 2.54s	remaining: 12.1s
173:	learn: 0.7119493	total: 2.55s	remaining: 12.1s
174:	learn: 0.7122369	total: 2.56s	remaining: 12.1s
175:	learn: 0.7125817	total: 2.58s	remaining: 12.1s
176:	learn: 0.7125817	total: 2.59s	remaining: 12s
177:	learn: 0.7124859	total: 2.6s	remaining: 12s
178:	learn: 0.7126776	total: 2.62s	remaining: 12s
179:	learn: 0.7121410	total: 2.63s	remaining: 12s
180:	learn: 0.7124287	total: 2.64s	remaining: 12s
181:	learn: 0.7125245	total: 2.66s	remaining: 11.9s
182:	learn: 0.7124859	total: 2.67s	remaining: 11.9s
183:	learn: 0.7123143	to

333:	learn: 0.7271114	total: 4.73s	remaining: 9.43s
334:	learn: 0.7280887	total: 4.75s	remaining: 9.42s
335:	learn: 0.7288743	total: 4.76s	remaining: 9.4s
336:	learn: 0.7290088	total: 4.77s	remaining: 9.39s
337:	learn: 0.7289903	total: 4.79s	remaining: 9.37s
338:	learn: 0.7291821	total: 4.8s	remaining: 9.36s
339:	learn: 0.7298717	total: 4.81s	remaining: 9.34s
340:	learn: 0.7295269	total: 4.83s	remaining: 9.33s
341:	learn: 0.7301594	total: 4.84s	remaining: 9.31s
342:	learn: 0.7300635	total: 4.85s	remaining: 9.3s
343:	learn: 0.7302552	total: 4.87s	remaining: 9.28s
344:	learn: 0.7305429	total: 4.88s	remaining: 9.27s
345:	learn: 0.7302939	total: 4.89s	remaining: 9.25s
346:	learn: 0.7307346	total: 4.91s	remaining: 9.23s
347:	learn: 0.7305429	total: 4.92s	remaining: 9.21s
348:	learn: 0.7305243	total: 4.93s	remaining: 9.2s
349:	learn: 0.7310609	total: 4.94s	remaining: 9.18s
350:	learn: 0.7307733	total: 4.96s	remaining: 9.17s
351:	learn: 0.7296429	total: 4.97s	remaining: 9.16s
352:	learn: 0.72

492:	learn: 0.7446520	total: 6.89s	remaining: 7.09s
493:	learn: 0.7439624	total: 6.92s	remaining: 7.08s
494:	learn: 0.7443072	total: 6.93s	remaining: 7.07s
495:	learn: 0.7443072	total: 6.95s	remaining: 7.06s
496:	learn: 0.7443072	total: 6.96s	remaining: 7.04s
497:	learn: 0.7451314	total: 6.97s	remaining: 7.03s
498:	learn: 0.7449397	total: 6.99s	remaining: 7.01s
499:	learn: 0.7447866	total: 7s	remaining: 7s
500:	learn: 0.7446907	total: 7.01s	remaining: 6.99s
501:	learn: 0.7446907	total: 7.03s	remaining: 6.97s
502:	learn: 0.7449397	total: 7.04s	remaining: 6.96s
503:	learn: 0.7445948	total: 7.06s	remaining: 6.94s
504:	learn: 0.7453232	total: 7.07s	remaining: 6.93s
505:	learn: 0.7455721	total: 7.08s	remaining: 6.92s
506:	learn: 0.7449397	total: 7.1s	remaining: 6.9s
507:	learn: 0.7448438	total: 7.11s	remaining: 6.89s
508:	learn: 0.7458026	total: 7.13s	remaining: 6.87s
509:	learn: 0.7456108	total: 7.14s	remaining: 6.86s
510:	learn: 0.7451701	total: 7.15s	remaining: 6.84s
511:	learn: 0.745802

655:	learn: 0.7563256	total: 9.04s	remaining: 4.74s
656:	learn: 0.7567663	total: 9.06s	remaining: 4.73s
657:	learn: 0.7572070	total: 9.07s	remaining: 4.71s
658:	learn: 0.7572070	total: 9.08s	remaining: 4.7s
659:	learn: 0.7573029	total: 9.09s	remaining: 4.68s
660:	learn: 0.7577436	total: 9.11s	remaining: 4.67s
661:	learn: 0.7577436	total: 9.12s	remaining: 4.66s
662:	learn: 0.7572070	total: 9.13s	remaining: 4.64s
663:	learn: 0.7573987	total: 9.14s	remaining: 4.63s
664:	learn: 0.7572457	total: 9.16s	remaining: 4.61s
665:	learn: 0.7578395	total: 9.17s	remaining: 4.6s
666:	learn: 0.7575905	total: 9.18s	remaining: 4.58s
667:	learn: 0.7570539	total: 9.2s	remaining: 4.57s
668:	learn: 0.7570539	total: 9.21s	remaining: 4.55s
669:	learn: 0.7571498	total: 9.22s	remaining: 4.54s
670:	learn: 0.7574374	total: 9.23s	remaining: 4.53s
671:	learn: 0.7571498	total: 9.25s	remaining: 4.51s
672:	learn: 0.7572457	total: 9.26s	remaining: 4.5s
673:	learn: 0.7573416	total: 9.27s	remaining: 4.49s
674:	learn: 0.75

820:	learn: 0.7660816	total: 11.2s	remaining: 2.44s
821:	learn: 0.7664264	total: 11.2s	remaining: 2.42s
822:	learn: 0.7665223	total: 11.2s	remaining: 2.41s
823:	learn: 0.7660816	total: 11.2s	remaining: 2.4s
824:	learn: 0.7661775	total: 11.2s	remaining: 2.38s
825:	learn: 0.7660816	total: 11.2s	remaining: 2.37s
826:	learn: 0.7665223	total: 11.3s	remaining: 2.35s
827:	learn: 0.7661775	total: 11.3s	remaining: 2.34s
828:	learn: 0.7660816	total: 11.3s	remaining: 2.33s
829:	learn: 0.7666182	total: 11.3s	remaining: 2.31s
830:	learn: 0.7667141	total: 11.3s	remaining: 2.3s
831:	learn: 0.7668671	total: 11.3s	remaining: 2.29s
832:	learn: 0.7669630	total: 11.3s	remaining: 2.27s
833:	learn: 0.7670589	total: 11.3s	remaining: 2.26s
834:	learn: 0.7669630	total: 11.4s	remaining: 2.25s
835:	learn: 0.7669630	total: 11.4s	remaining: 2.23s
836:	learn: 0.7680362	total: 11.4s	remaining: 2.22s
837:	learn: 0.7676913	total: 11.4s	remaining: 2.2s
838:	learn: 0.7677872	total: 11.4s	remaining: 2.19s
839:	learn: 0.7

989:	learn: 0.7718326	total: 13.5s	remaining: 136ms
990:	learn: 0.7725222	total: 13.5s	remaining: 123ms
991:	learn: 0.7725222	total: 13.5s	remaining: 109ms
992:	learn: 0.7719857	total: 13.6s	remaining: 95.5ms
993:	learn: 0.7727140	total: 13.6s	remaining: 81.9ms
994:	learn: 0.7727140	total: 13.6s	remaining: 68.2ms
995:	learn: 0.7727140	total: 13.6s	remaining: 54.6ms
996:	learn: 0.7728099	total: 13.6s	remaining: 40.9ms
997:	learn: 0.7728099	total: 13.6s	remaining: 27.3ms
998:	learn: 0.7728099	total: 13.6s	remaining: 13.6ms
999:	learn: 0.7729057	total: 13.6s	remaining: 0us
Model Accuracy: 0.6868626274745051


In [142]:
X_test.head(5)

Unnamed: 0,MaMH,Major,SoTinChi,SiSoTKB,TKB_LT2,TKB_LT3,TKB_LT4,TKB_LT5,TKB_LT6,TKB_LT7,TKB_TH2,TKB_TH3,TKB_TH4,TKB_TH5,TKB_TH6,TKB_TH7,HK,GVLT,GVTT
7366,EN012IU,IU EN,2,35.0,9,0,0,0,0,0,0,0,0,0,0,0,1,Vu Tien Thinh,unassigned
4051,IS086IU,IEMIEM,3,60.0,7,0,0,0,0,0,0,0,0,0,0,0,1,Ton That Long,unassigned
3049,ENTP02-1,IU IAC,13,35.0,7,7,0,0,7,1,0,0,0,0,0,0,1,multi,unassigned
4914,BA169IU,BA BA,3,60.0,1,0,1,0,0,0,0,0,0,0,0,0,3,Lai Vinh Phuc,unassigned
5353,BA130IU,BA BA,3,60.0,0,0,4,0,0,0,0,0,0,0,0,0,2,Pham Thanh Huyen,unassigned


In [143]:
X_test.dtypes

MaMH         object
Major        object
SoTinChi      int64
SiSoTKB     float64
TKB_LT2       int64
TKB_LT3       int64
TKB_LT4       int64
TKB_LT5       int64
TKB_LT6       int64
TKB_LT7       int64
TKB_TH2       int64
TKB_TH3       int64
TKB_TH4       int64
TKB_TH5       int64
TKB_TH6       int64
TKB_TH7       int64
HK            int64
GVLT         object
GVTT         object
dtype: object

In [144]:
X_test.shape

(1667, 19)

In [145]:
# # Define the parameter grid
# param_grid = {
#     'iterations': [500, 1000],  # Number of boosting iterations
#     'depth': [4, 6, 8],          # Maximum depth of each tree
#     'learning_rate': [0.05, 0.1],# Learning rate
#     # Add more parameters to tune if needed
# }

# # Create CatBoost classifier
# catboost_clf = CatBoostClassifier(loss_function='Logloss',
#                                   eval_metric='Accuracy',
#                                   cat_features=['MaMH', 'Major', 'GVLT', 'GVTT'],
#                                   auto_class_weights='Balanced',
#                                   random_seed=42)

# # Perform grid search
# grid_search = GridSearchCV(estimator=catboost_clf, param_grid=param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train, y_train)

# # Best parameters found
# print("Best parameters:", grid_search.best_params_)

# # Best score
# print("Best accuracy:", grid_search.best_score_)

# # Predict on the test set using the best model
# best_clf = grid_search.best_estimator_
# y_pred = best_clf.predict(X_test)

# # Calculate accuracy
# accuracy = best_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

In [146]:
# # Convert y_resampled to a pandas Series
y_train_series = pd.Series(y_train)

# Count the occurrences of each class
class_counts2 = y_train_series.value_counts()

# Print the class counts
print("Class Counts in y_train_series:")
print(class_counts2)

Class Counts in y_train_series:
ClassifyAs
Class2    5215
Class1    1450
Name: count, dtype: int64


In [147]:
# # Convert y_resampled to a pandas Series
# # y_resampled_series = pd.Series(y_resampled)
# y_resampled_series = pd.Series(y_train_resampled)

# # Count the occurrences of each class
# class_counts = y_resampled_series.value_counts()

# # Print the class counts
# print("Class Counts in y_resampled:")
# print(class_counts)

In [148]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Create a DataFrame for actual vs. predicted values
results_df = pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten()})
results_df

Accuracy: 0.6868626274745051


Unnamed: 0,Actual,Predicted
0,Class2,Class2
1,Class2,Class1
2,Class2,Class2
3,Class2,Class1
4,Class2,Class1
5,Class1,Class2
6,Class2,Class1
7,Class2,Class1
8,Class2,Class2
9,Class2,Class1


In [149]:
# Create a DataFrame to count total correct and incorrect predictions for each class
class_counts = {}

# Update counts based on actual vs. predicted values
for actual, predicted in zip(y_test, y_pred):
    if actual not in class_counts:
        class_counts[actual] = {'Correct': 0, 'Incorrect': 0}
    if actual == predicted:
        class_counts[actual]['Correct'] += 1
    else:
        class_counts[actual]['Incorrect'] += 1

# Convert dictionary to DataFrame
class_count_df = pd.DataFrame(class_counts).transpose().reset_index()
class_count_df.rename(columns={'index': 'Class'}, inplace=True)

# Calculate total correct and incorrect predictions across all classes
total_correct_all_classes = class_count_df['Correct'].sum()
total_incorrect_all_classes = class_count_df['Incorrect'].sum()

# Add a row for the total counts across all classes
total_row = pd.DataFrame({'Class': 'Total', 'Correct': total_correct_all_classes, 'Incorrect': total_incorrect_all_classes}, index=[0])
class_count_df = pd.concat([class_count_df, total_row], ignore_index=True)

# Display the DataFrame
print("\nClass-wise Correct and Incorrect Predictions:")
print(class_count_df)


Class-wise Correct and Incorrect Predictions:
    Class  Correct  Incorrect
0  Class2      887        451
1  Class1      258         71
2   Total     1145        522


In [150]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# # Classification Report
# print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[258  71]
 [451 887]]
              precision    recall  f1-score   support

      Class1       0.36      0.78      0.50       329
      Class2       0.93      0.66      0.77      1338

    accuracy                           0.69      1667
   macro avg       0.64      0.72      0.63      1667
weighted avg       0.81      0.69      0.72      1667



In [151]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.6868626274745051
Precision: 0.8149717421589252
Recall: 0.6868626274745051
F1-score: 0.7182677077603352


In [152]:
# Define the filename to save your model
# model_filename = 'dhqt_90slots_classifier_model.cbm'
model_filename = 'dhqt_tr89_classifier_model.cbm'

# Save the model
catboost_clf.save_model(model_filename)

print("Model saved successfully as", model_filename)

Model saved successfully as dhqt_tr89_classifier_model.cbm
