In [1]:
# !python -m pip uninstall scikit-learn --yes

In [2]:
# pip install scikit-learn==1.2.2 --user

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [4]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [5]:
# import sklearn
# print(sklearn.__version__)

In [6]:
from imblearn.over_sampling import SMOTENC
from sklearn.utils import resample

In [7]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [8]:
file_path = './data/data_iu_proccessed_p3.csv'
# file_path = './data/data_iu_proccessed_p3_v2.csv'

# Read the CSV file with specified data types
df = pd.read_csv(file_path)

In [9]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,TenMH,MaMH,Major,SoTinChi,SiSoTKB,ClassifyAs,TKB_LT2,TKB_LT3,TKB_LT4,...,TKB_LT7,TKB_TH2,TKB_TH3,TKB_TH4,TKB_TH5,TKB_TH6,TKB_TH7,HK,GVLT,GVTH
0,0,Kinh tế hành vi và chính sách công,PM5110,BA BA,2,50.0,4,0,0,0,...,0,0,0,0,0,0,0,2,Nguyen Van Phuong,unassigned
1,1,Kinh tế hành vi và chính sách công,PM5110,BA BA,2,61.0,2,0,0,0,...,0,0,0,0,0,0,0,2,Nguyen Van Phuong,unassigned
2,2,Kinh tế hành vi và chính sách công,PM5110,BA BA,2,14.0,1,0,0,0,...,7,0,0,0,0,0,0,3,Nguyen Van Phuong,unassigned
3,3,Kinh tế lượng và phân tích chính sách,PM5214,BA BA,2,50.0,4,0,0,0,...,0,0,0,0,0,0,0,2,Nguyen Phuong Anh,unassigned
4,4,Kỹ năng lãnh đạo trong khu vực công,PM5212,BA BA,2,41.0,2,0,0,0,...,7,0,0,0,0,0,0,1,Trinh Viet Dung,unassigned


In [10]:
# Remove the specified columns
columns_to_remove = ['Unnamed: 0', 'TenMH']  # Columns to be removed
df.drop(columns=columns_to_remove, inplace=True)
df.dtypes

MaMH           object
Major          object
SoTinChi        int64
SiSoTKB       float64
ClassifyAs      int64
TKB_LT2         int64
TKB_LT3         int64
TKB_LT4         int64
TKB_LT5         int64
TKB_LT6         int64
TKB_LT7         int64
TKB_TH2         int64
TKB_TH3         int64
TKB_TH4         int64
TKB_TH5         int64
TKB_TH6         int64
TKB_TH7         int64
HK              int64
GVLT           object
GVTH           object
dtype: object

In [11]:
df.shape

(8341, 20)

In [12]:
#-----------4 Classes--------------
# #Create a dictionary to map values
# class_mapping = {1: 'Class1', 2: 'Class2', 3: 'Class3', 4: 'Class4'}

# # Map the values in the 'ClassifyAs' column
# df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

# # #-----------2 Classes (Over90)--------------
# # Create a dictionary to map values
# class_mapping = {1: 'Class1', 2: 'Class1', 3: 'Class2', 4: 'Class2'}

# # Map the values in the 'ClassifyAs' column
# df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

#---------- 2 Classes (TR89-Spread)-------------
# # Create a dictionary to map values
# class_mapping = {1: 'Class1', 2: 'Class2', 3: 'Class1', 4: 'Class2'}

# # Map the values in the 'ClassifyAs' column
# df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

# #-----------2 Classes (High vs Low priority course)--------------
# Create a dictionary to map values
class_mapping = {1: 'Class1', 2: 'Class2', 3: 'Class2', 4: 'Class2'}

# Map the values in the 'ClassifyAs' column
df['ClassifyAs'] = df['ClassifyAs'].map(class_mapping)

In [13]:
# Count the occurrences of each class
class_counts = df['ClassifyAs'].value_counts()

# Print the counts
print(class_counts)

ClassifyAs
Class2    7297
Class1    1044
Name: count, dtype: int64


In [14]:
#===========================================================================================================
#-------------------------Normal / Class Weight-------------------------------------------------------------
#===========================================================================================================

# Split the data into features (X) and target (y)
X = df.drop(columns=['ClassifyAs'])  # Features
y = df['ClassifyAs']  # Target

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define custom class weights
# Total number of samples
# total_samples = 1050 + 3437 + 751 + 3749

# # Calculate class weights
# weight_class_1 = total_samples / 1050
# weight_class_2 = total_samples / 3437
# weight_class_3 = total_samples / 751
# weight_class_4 = total_samples / 3749

# for classifier use Logloss, CrossEntropy, MultiClass, MultiClassOneVsAll
# Create CatBoost classifier
catboost_clf = CatBoostClassifier(iterations=1000,
                                   depth=4, 
                                   learning_rate=0.2,
#                                    l2_leaf_reg=1,
                                   loss_function='MultiClass',
#                                    loss_function='Logloss',
                                   eval_metric='Accuracy',
                                   cat_features=['MaMH', 'Major', 'GVLT', 'GVTH'],  # Specify high cardinality categorical features
                                   auto_class_weights='Balanced',  # Automatically balance class weights
#                                    class_weights=[weight_class_1, weight_class_2, weight_class_3, weight_class_4],  # Manually adjust class weights
                                   random_seed=42)

# Fit the model
catboost_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = catboost_clf.predict(X_test)

accuracy = catboost_clf.score(X_test, y_test)
print("Model Accuracy:", accuracy)

# #-------------Cross Validate-------------------------------------------------------------

# # Create CatBoost classifier
# catboost_clf = CatBoostClassifier(iterations=500,
#                                    depth=6, 
#                                    learning_rate=0.2,
#                                    loss_function='Logloss', 
#                                    eval_metric='Accuracy',
#                                    cat_features=['MaMH', 'Major', 'GVLT', 'GVTH'],  
#                                    random_seed=42)

# # Perform cross-validation
# cv_scores = cross_val_score(catboost_clf, X_train, y_train, cv=5, scoring='accuracy')  # You can adjust cv (number of folds) as needed

# # Print cross-validation scores
# print("Cross-Validation Scores:", cv_scores)
# print("Mean Accuracy:", np.mean(cv_scores))

#===========================================================================================================
#------------------SMOTE-N----------------------------------------------------------------------------------
#===========================================================================================================

# # Step 1: Split the dataset into features (X) and target variable (y)
# X = df.drop(columns=['ClassifyAs'])  # Features
# y = df['ClassifyAs']  # Target

# # Step 2: Split the features and target into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Step 3: Apply SMOTE-N to balance the training data
# categorical_indices = [index for index, column in enumerate(X_train.columns) if column in ['MaMH', 'Major', 'GVLT', 'GVTH']]
# smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
# X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

# # Step 4: Train the model on the balanced training data
# catboost_clf = CatBoostClassifier(iterations=500,
#                                    depth=6, 
#                                    learning_rate=0.2,
# #                                    loss_function='MultiClass',
#                                    loss_function='Logloss',
#                                    grow_policy='Lossguide',
#                                    eval_metric='Accuracy',
#                                    cat_features=['MaMH', 'Major', 'GVLT', 'GVTH'],  
#                                    random_seed=42)
# catboost_clf.fit(X_resampled, y_resampled)

# # Step 5: Predict on the testing data
# y_pred = catboost_clf.predict(X_test)

# # Step 6: Evaluate the model
# accuracy = catboost_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

#===========================================================================================================
#------------------Oversampling Minor Class (2 Classes)-----------------------------------------------------
#===========================================================================================================

# # Split the data into features (X) and target (y)
# X = df.drop(columns=['ClassifyAs'])  # Features
# y = df['ClassifyAs']  # Target

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Combine the features and target into a single DataFrame for the training data
# train_data = pd.concat([X_train, y_train], axis=1)

# # Separate majority and minority classes
# majority_class = train_data[train_data['ClassifyAs'] == 'Class2'] 
# minority_class = train_data[train_data['ClassifyAs'] == 'Class1']

# # Upsample minority class
# minority_upsampled = resample(minority_class,
#                               replace=True,     # Sample with replacement
#                               n_samples=len(majority_class),    # Match number of samples in majority class
#                               random_state=42)  # Set random state for reproducibility

# # Combine majority class and upsampled minority class
# train_upsampled = pd.concat([majority_class, minority_upsampled])

# # Separate features and target after oversampling
# X_train_resampled = train_upsampled.drop(columns=['ClassifyAs'])
# y_train_resampled = train_upsampled['ClassifyAs']

# # Create CatBoost classifier with balanced class weights
# catboost_clf = CatBoostClassifier(iterations=5000,
#                                    depth=6, 
#                                    learning_rate=0.1,
#                                    loss_function='Logloss',
#                                    eval_metric='Accuracy',
#                                    cat_features=['MaMH', 'Major', 'GVLT', 'GVTH'],  # Specify high cardinality categorical features
#                                    auto_class_weights='Balanced',  # Automatically balance class weights
#                                    random_seed=42)

# # Fit the model on the resampled data
# catboost_clf.fit(X_train_resampled, y_train_resampled)

# # Predict on the test set
# y_pred = catboost_clf.predict(X_test)

# # Evaluate model accuracy
# accuracy = catboost_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

#===========================================================================================================
#------------------Oversampling Minor Class (4 Classes)-----------------------------------------------------
#===========================================================================================================

# # Split the data into features (X) and target (y)
# X = df.drop(columns=['ClassifyAs'])  # Features
# y = df['ClassifyAs']  # Target

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=29)

# # Combine the features and target into a single DataFrame for the training data
# train_data = pd.concat([X_train, y_train], axis=1)

# # Separate majority and minority classes
# majority_class = train_data[train_data['ClassifyAs'] == 'Class4']  # Assuming 'Class4' is the majority class
# minority_classes = [train_data[train_data['ClassifyAs'] == cls] for cls in ['Class1', 'Class2', 'Class3']]

# # Upsample each minority class individually
# minority_upsampled = pd.concat([resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)
#                                 for minority_class in minority_classes])

# # Combine majority class and upsampled minority classes
# train_upsampled = pd.concat([majority_class] + [minority_upsampled])

# # Separate features and target after oversampling
# X_train_resampled = train_upsampled.drop(columns=['ClassifyAs'])
# y_train_resampled = train_upsampled['ClassifyAs']

# # Create CatBoost classifier with balanced class weights
# catboost_clf = CatBoostClassifier(iterations=5000,
#                                   depth=6,
#                                   learning_rate=0.1,
#                                   loss_function='MultiClass',
#                                   eval_metric='Accuracy',
#                                   cat_features=['MaMH', 'Major', 'GVLT', 'GVTH'],  # Specify high cardinality categorical features
#                                   auto_class_weights='Balanced',  # Automatically balance class weights
#                                   random_seed=42)

# # Fit the model on the resampled data
# catboost_clf.fit(X_train_resampled, y_train_resampled)

# # Predict on the test set
# y_pred = catboost_clf.predict(X_test)

# # Evaluate model accuracy
# accuracy = catboost_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

0:	learn: 0.6278638	total: 156ms	remaining: 2m 36s
1:	learn: 0.6359332	total: 170ms	remaining: 1m 24s
2:	learn: 0.6872352	total: 182ms	remaining: 1m
3:	learn: 0.6872656	total: 195ms	remaining: 48.6s
4:	learn: 0.6836099	total: 209ms	remaining: 41.5s
5:	learn: 0.6876017	total: 221ms	remaining: 36.6s
6:	learn: 0.6889746	total: 232ms	remaining: 32.9s
7:	learn: 0.6935903	total: 244ms	remaining: 30.2s
8:	learn: 0.6926911	total: 256ms	remaining: 28.2s
9:	learn: 0.6966651	total: 268ms	remaining: 26.6s
10:	learn: 0.6960734	total: 280ms	remaining: 25.1s
11:	learn: 0.6990677	total: 292ms	remaining: 24.1s
12:	learn: 0.7004174	total: 305ms	remaining: 23.1s
13:	learn: 0.7003673	total: 316ms	remaining: 22.3s
14:	learn: 0.7016902	total: 332ms	remaining: 21.8s
15:	learn: 0.6999096	total: 361ms	remaining: 22.2s
16:	learn: 0.7012736	total: 389ms	remaining: 22.5s
17:	learn: 0.7022175	total: 427ms	remaining: 23.3s
18:	learn: 0.7052351	total: 453ms	remaining: 23.4s
19:	learn: 0.7055282	total: 489ms	remainin

164:	learn: 0.7851076	total: 2.8s	remaining: 14.2s
165:	learn: 0.7859567	total: 2.83s	remaining: 14.2s
166:	learn: 0.7862141	total: 2.86s	remaining: 14.3s
167:	learn: 0.7862141	total: 2.88s	remaining: 14.2s
168:	learn: 0.7868148	total: 2.89s	remaining: 14.2s
169:	learn: 0.7873207	total: 2.91s	remaining: 14.2s
170:	learn: 0.7879214	total: 2.93s	remaining: 14.2s
171:	learn: 0.7863178	total: 2.94s	remaining: 14.2s
172:	learn: 0.7856403	total: 2.96s	remaining: 14.1s
173:	learn: 0.7866611	total: 2.97s	remaining: 14.1s
174:	learn: 0.7891137	total: 2.99s	remaining: 14.1s
175:	learn: 0.7891995	total: 3s	remaining: 14.1s
176:	learn: 0.7905546	total: 3.02s	remaining: 14s
177:	learn: 0.7893711	total: 3.03s	remaining: 14s
178:	learn: 0.7894748	total: 3.05s	remaining: 14s
179:	learn: 0.7897322	total: 3.07s	remaining: 14s
180:	learn: 0.7906583	total: 3.09s	remaining: 14s
181:	learn: 0.7907441	total: 3.11s	remaining: 14s
182:	learn: 0.7904098	total: 3.13s	remaining: 14s
183:	learn: 0.7908388	total: 3

328:	learn: 0.8257643	total: 5.44s	remaining: 11.1s
329:	learn: 0.8244950	total: 5.45s	remaining: 11.1s
330:	learn: 0.8247435	total: 5.47s	remaining: 11.1s
331:	learn: 0.8256016	total: 5.49s	remaining: 11.1s
332:	learn: 0.8250099	total: 5.51s	remaining: 11s
333:	learn: 0.8268708	total: 5.54s	remaining: 11s
334:	learn: 0.8269566	total: 5.55s	remaining: 11s
335:	learn: 0.8280632	total: 5.57s	remaining: 11s
336:	learn: 0.8276431	total: 5.59s	remaining: 11s
337:	learn: 0.8280632	total: 5.62s	remaining: 11s
338:	learn: 0.8285691	total: 5.65s	remaining: 11s
339:	learn: 0.8278916	total: 5.67s	remaining: 11s
340:	learn: 0.8289981	total: 5.68s	remaining: 11s
341:	learn: 0.8283206	total: 5.7s	remaining: 11s
342:	learn: 0.8282348	total: 5.71s	remaining: 10.9s
343:	learn: 0.8275573	total: 5.72s	remaining: 10.9s
344:	learn: 0.8277200	total: 5.74s	remaining: 10.9s
345:	learn: 0.8267082	total: 5.75s	remaining: 10.9s
346:	learn: 0.8278916	total: 5.76s	remaining: 10.8s
347:	learn: 0.8272141	total: 5.78

495:	learn: 0.8566032	total: 8.46s	remaining: 8.59s
496:	learn: 0.8560973	total: 8.48s	remaining: 8.58s
497:	learn: 0.8561831	total: 8.49s	remaining: 8.56s
498:	learn: 0.8566979	total: 8.52s	remaining: 8.55s
499:	learn: 0.8568695	total: 8.53s	remaining: 8.53s
500:	learn: 0.8565352	total: 8.55s	remaining: 8.52s
501:	learn: 0.8570411	total: 8.57s	remaining: 8.5s
502:	learn: 0.8572127	total: 8.59s	remaining: 8.48s
503:	learn: 0.8559435	total: 8.6s	remaining: 8.46s
504:	learn: 0.8558577	total: 8.62s	remaining: 8.45s
505:	learn: 0.8564584	total: 8.63s	remaining: 8.43s
506:	learn: 0.8585767	total: 8.65s	remaining: 8.41s
507:	learn: 0.8590826	total: 8.66s	remaining: 8.39s
508:	learn: 0.8582335	total: 8.68s	remaining: 8.37s
509:	learn: 0.8575560	total: 8.69s	remaining: 8.35s
510:	learn: 0.8578992	total: 8.71s	remaining: 8.33s
511:	learn: 0.8574702	total: 8.72s	remaining: 8.31s
512:	learn: 0.8589968	total: 8.74s	remaining: 8.29s
513:	learn: 0.8590826	total: 8.75s	remaining: 8.28s
514:	learn: 0.

658:	learn: 0.8752556	total: 11.1s	remaining: 5.74s
659:	learn: 0.8744064	total: 11.1s	remaining: 5.72s
660:	learn: 0.8743206	total: 11.1s	remaining: 5.71s
661:	learn: 0.8742348	total: 11.1s	remaining: 5.69s
662:	learn: 0.8744064	total: 11.2s	remaining: 5.67s
663:	learn: 0.8744064	total: 11.2s	remaining: 5.65s
664:	learn: 0.8749123	total: 11.2s	remaining: 5.63s
665:	learn: 0.8753414	total: 11.2s	remaining: 5.61s
666:	learn: 0.8755130	total: 11.2s	remaining: 5.59s
667:	learn: 0.8762763	total: 11.2s	remaining: 5.58s
668:	learn: 0.8764479	total: 11.2s	remaining: 5.56s
669:	learn: 0.8751787	total: 11.2s	remaining: 5.54s
670:	learn: 0.8766196	total: 11.3s	remaining: 5.52s
671:	learn: 0.8768770	total: 11.3s	remaining: 5.5s
672:	learn: 0.8771344	total: 11.3s	remaining: 5.49s
673:	learn: 0.8773060	total: 11.3s	remaining: 5.47s
674:	learn: 0.8769717	total: 11.3s	remaining: 5.45s
675:	learn: 0.8776492	total: 11.3s	remaining: 5.43s
676:	learn: 0.8782410	total: 11.3s	remaining: 5.41s
677:	learn: 0

821:	learn: 0.8923366	total: 13.3s	remaining: 2.87s
822:	learn: 0.8925082	total: 13.3s	remaining: 2.85s
823:	learn: 0.8924135	total: 13.3s	remaining: 2.84s
824:	learn: 0.8918218	total: 13.3s	remaining: 2.82s
825:	learn: 0.8931768	total: 13.3s	remaining: 2.81s
826:	learn: 0.8939402	total: 13.3s	remaining: 2.79s
827:	learn: 0.8939402	total: 13.3s	remaining: 2.77s
828:	learn: 0.8942745	total: 13.4s	remaining: 2.76s
829:	learn: 0.8940170	total: 13.4s	remaining: 2.74s
830:	learn: 0.8942745	total: 13.4s	remaining: 2.72s
831:	learn: 0.8927567	total: 13.4s	remaining: 2.71s
832:	learn: 0.8926709	total: 13.4s	remaining: 2.69s
833:	learn: 0.8936059	total: 13.4s	remaining: 2.67s
834:	learn: 0.8931768	total: 13.4s	remaining: 2.66s
835:	learn: 0.8924224	total: 13.5s	remaining: 2.64s
836:	learn: 0.8924224	total: 13.5s	remaining: 2.62s
837:	learn: 0.8927657	total: 13.5s	remaining: 2.61s
838:	learn: 0.8933574	total: 13.5s	remaining: 2.59s
839:	learn: 0.8925172	total: 13.5s	remaining: 2.57s
840:	learn: 

982:	learn: 0.9048967	total: 15.8s	remaining: 274ms
983:	learn: 0.9048967	total: 15.8s	remaining: 258ms
984:	learn: 0.9050683	total: 15.9s	remaining: 241ms
985:	learn: 0.9050683	total: 15.9s	remaining: 225ms
986:	learn: 0.9050683	total: 15.9s	remaining: 209ms
987:	learn: 0.9052399	total: 15.9s	remaining: 193ms
988:	learn: 0.9047340	total: 15.9s	remaining: 177ms
989:	learn: 0.9047340	total: 15.9s	remaining: 161ms
990:	learn: 0.9046482	total: 15.9s	remaining: 145ms
991:	learn: 0.9047340	total: 15.9s	remaining: 129ms
992:	learn: 0.9046482	total: 15.9s	remaining: 112ms
993:	learn: 0.9045624	total: 16s	remaining: 96.3ms
994:	learn: 0.9048198	total: 16s	remaining: 80.3ms
995:	learn: 0.9043908	total: 16s	remaining: 64.2ms
996:	learn: 0.9047340	total: 16s	remaining: 48.1ms
997:	learn: 0.9049825	total: 16s	remaining: 32.1ms
998:	learn: 0.9053257	total: 16s	remaining: 16ms
999:	learn: 0.9054974	total: 16s	remaining: 0us
Model Accuracy: 0.762732174955063


In [15]:
X_test.head(5)

Unnamed: 0,MaMH,Major,SoTinChi,SiSoTKB,TKB_LT2,TKB_LT3,TKB_LT4,TKB_LT5,TKB_LT6,TKB_LT7,TKB_TH2,TKB_TH3,TKB_TH4,TKB_TH5,TKB_TH6,TKB_TH7,HK,GVLT,GVTH
7023,PE017IU,IU PE,2,90.0,0,0,0,0,0,3,0,0,0,0,0,0,1,Pham Dinh Huan,unassigned
5201,MAFE208IU,MA MA,4,50.0,7,0,0,0,0,0,0,0,0,0,0,0,2,Mai Duc Thanh,unassigned
4280,IS056IU,IEMIEM,1,25.0,0,0,0,0,0,0,0,0,0,0,7,0,2,unassigned,Nguyen Hoang An
940,CH012IU,BT BT,1,20.0,7,0,0,0,0,0,0,0,0,0,0,0,3,Le Nguyen Thien Phuc,unassigned
5364,BA130IU,BA BA,3,60.0,0,0,0,0,0,1,0,0,0,0,0,0,1,Nguyen Tran Nguyen Khai,unassigned


In [16]:
X_test.dtypes

MaMH         object
Major        object
SoTinChi      int64
SiSoTKB     float64
TKB_LT2       int64
TKB_LT3       int64
TKB_LT4       int64
TKB_LT5       int64
TKB_LT6       int64
TKB_LT7       int64
TKB_TH2       int64
TKB_TH3       int64
TKB_TH4       int64
TKB_TH5       int64
TKB_TH6       int64
TKB_TH7       int64
HK            int64
GVLT         object
GVTH         object
dtype: object

In [17]:
X_test.shape

(1669, 19)

In [18]:
# # Define the parameter grid
# param_grid = {
#     'iterations': [500, 1000],  # Number of boosting iterations
#     'depth': [4, 6, 8],          # Maximum depth of each tree
#     'learning_rate': [0.05, 0.1],# Learning rate
#     # Add more parameters to tune if needed
# }

# # Create CatBoost classifier
# catboost_clf = CatBoostClassifier(loss_function='Logloss',
#                                   eval_metric='Accuracy',
#                                   cat_features=['MaMH', 'Major', 'GVLT', 'GVTH'],
#                                   auto_class_weights='Balanced',
#                                   random_seed=42)

# # Perform grid search
# grid_search = GridSearchCV(estimator=catboost_clf, param_grid=param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train, y_train)

# # Best parameters found
# print("Best parameters:", grid_search.best_params_)

# # Best score
# print("Best accuracy:", grid_search.best_score_)

# # Predict on the test set using the best model
# best_clf = grid_search.best_estimator_
# y_pred = best_clf.predict(X_test)

# # Calculate accuracy
# accuracy = best_clf.score(X_test, y_test)
# print("Model Accuracy:", accuracy)

In [19]:
# # Convert y_resampled to a pandas Series
y_train_series = pd.Series(y_train)

# Count the occurrences of each class
class_counts2 = y_train_series.value_counts()

# Print the class counts
print("Class Counts in y_train_series:")
print(class_counts2)

Class Counts in y_train_series:
ClassifyAs
Class2    5827
Class1     845
Name: count, dtype: int64


In [20]:
# # Convert y_resampled to a pandas Series
# # y_resampled_series = pd.Series(y_resampled)
# y_resampled_series = pd.Series(y_train_resampled)

# # Count the occurrences of each class
# class_counts = y_resampled_series.value_counts()

# # Print the class counts
# print("Class Counts in y_resampled:")
# print(class_counts)

In [21]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Create a DataFrame for actual vs. predicted values
results_df = pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten()})
results_df

Accuracy: 0.762732174955063


Unnamed: 0,Actual,Predicted
0,Class2,Class2
1,Class2,Class2
2,Class2,Class2
3,Class2,Class1
4,Class2,Class2
5,Class2,Class2
6,Class2,Class2
7,Class2,Class1
8,Class2,Class2
9,Class1,Class1


In [22]:
# Create a DataFrame to count total correct and incorrect predictions for each class
class_counts = {}

# Update counts based on actual vs. predicted values
for actual, predicted in zip(y_test, y_pred):
    if actual not in class_counts:
        class_counts[actual] = {'Correct': 0, 'Incorrect': 0}
    if actual == predicted:
        class_counts[actual]['Correct'] += 1
    else:
        class_counts[actual]['Incorrect'] += 1

# Convert dictionary to DataFrame
class_count_df = pd.DataFrame(class_counts).transpose().reset_index()
class_count_df.rename(columns={'index': 'Class'}, inplace=True)

# Calculate total correct and incorrect predictions across all classes
total_correct_all_classes = class_count_df['Correct'].sum()
total_incorrect_all_classes = class_count_df['Incorrect'].sum()

# Add a row for the total counts across all classes
total_row = pd.DataFrame({'Class': 'Total', 'Correct': total_correct_all_classes, 'Incorrect': total_incorrect_all_classes}, index=[0])
class_count_df = pd.concat([class_count_df, total_row], ignore_index=True)

# Display the DataFrame
print("\nClass-wise Correct and Incorrect Predictions:")
print(class_count_df)


Class-wise Correct and Incorrect Predictions:
    Class  Correct  Incorrect
0  Class2     1157        313
1  Class1      116         83
2   Total     1273        396


In [23]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# # Classification Report
# print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[ 116   83]
 [ 313 1157]]
              precision    recall  f1-score   support

      Class1       0.27      0.58      0.37       199
      Class2       0.93      0.79      0.85      1470

    accuracy                           0.76      1669
   macro avg       0.60      0.68      0.61      1669
weighted avg       0.85      0.76      0.80      1669



In [24]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.762732174955063
Precision: 0.8540525443490325
Recall: 0.762732174955063
F1-score: 0.7961123400375065


In [32]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Generate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Convert the classification report to a DataFrame
report_df = pd.DataFrame(report).transpose()

# Display the DataFrame with borders using IPython display
def display_with_borders(df):
    style = df.style.set_table_styles(
        [{'selector': 'th', 'props': [('border', '1px solid black')]},
         {'selector': 'td', 'props': [('border', '1px solid black')]}]
    )
    display(style)

display_with_borders(report_df)

Confusion Matrix:
[[ 116   83]
 [ 313 1157]]


Unnamed: 0,precision,recall,f1-score,support
Class1,0.270396,0.582915,0.369427,199.0
Class2,0.933065,0.787075,0.853875,1470.0
accuracy,0.762732,0.762732,0.762732,0.762732
macro avg,0.60173,0.684995,0.611651,1669.0
weighted avg,0.854053,0.762732,0.796112,1669.0


In [26]:
# # Define the filename to save your model
# model_filename = 'dhqt_90slots_classifier_model.cbm'
# # model_filename = 'dhqt_tr89_classifier_model.cbm'

# # Save the model
# catboost_clf.save_model(model_filename)

# print("Model saved successfully as", model_filename)