In [None]:
%%capture
from collections import Counter

import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.combine import SMOTETomek
from joblib import dump
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score,accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
# import statistics


# Đọc, chia dữ liệu

In [None]:
# Load the prepared dataset and drop its first column by position
# (presumably an index column written when the CSV was saved - TODO confirm).
df_Final = pd.read_csv('df_nhanh13141516.csv').iloc[:, 1:]
df_Final

In [None]:
# Fixed seed so the split and every seeded estimator below are reproducible.
seed = 1

# Stratified hold-out split (25% validation) on the target column "nhan_xuhuong".
df_train, df_val = train_test_split(
    df_Final,
    test_size=0.25,
    shuffle=True,
    random_state=seed,
    stratify=df_Final["nhan_xuhuong"],
)

# Separate the features from the target for each partition.
X_train = df_train.drop(columns="nhan_xuhuong")
y_train = df_train["nhan_xuhuong"]
print(y_train.value_counts())

X_val = df_val.drop(columns="nhan_xuhuong")
y_val = df_val["nhan_xuhuong"]
print(y_val.value_counts())

# Author's note: LabelBinarizer could not be balanced directly and scored lower
# than LabelEncoder, so LabelEncoder is used.
# The encoder is fitted on the training labels only, then applied to both sets.
le = preprocessing.LabelEncoder().fit(y_train)

y_train = le.transform(y_train)
y_val = le.transform(y_val)
print(y_train)
print(y_val)

In [None]:
# Display the training features (the target column has already been popped off).
X_train

In [None]:
# Display the label-encoded training targets.
y_train

# Tiền xử lý và mã hóa

In [None]:
# Number of per-course column groups to use (columns suffixed _1 .. _19);
# it bounds the loops that build the categorical and numeric column lists.
Somon_canlay = 19

In [None]:
# OneHotEncoder is left at its default sparse output, which keeps memory usage down.

# Student-level categorical columns.
student_cat_cols = [
    'gioitinh', 'noisinh', 'cmnd_noicap',
    'lopsh', 'khoa', 'hedt_sv', 'chuyennganh', 'chuyennganh2', 'nganh_tt', 'dantoc', 'tongiao',
    'xuatthan', 'doituong', 'khuvuc',
]

# Per-course categorical column prefixes; each is suffixed with the course
# number 1..Somon_canlay, grouped course by course.
course_cat_prefixes = [
    'mamh_', 'trangthaimon_',
    'hinhthucthmon_', 'ngonngumon_',
    'khoaqlmon_', 'hedt_monhoc_', 'hocky_mon_',
]
course_cat_cols = [
    prefix + str(course_no)
    for course_no in range(1, Somon_canlay + 1)
    for prefix in course_cat_prefixes
]

# Disabled: these columns are currently not included.
# ['loaihocluc_hk1','loaihocluc_hk2','diff_loaihocluc_hk','loaihoclucnam']

cat_cols = np.array(student_cat_cols + course_cat_cols)
cat_cols

In [None]:
# Numeric columns: two age features, one score column per course
# (diemmon_1 .. diemmon_<Somon_canlay>), then the semester/year aggregates.
course_score_cols = ['diemmon_' + str(course_no) for course_no in range(1, Somon_canlay + 1)]

num_cols = np.array(
    ['tuoivaodh', 'tuoicapcmnd']
    + course_score_cols
    + ['dtb_hk1', 'sotc_hk1', 'dtb_hk2', 'sotc_hk2',
       'tong_sotc', 'dtbnam']
)
num_cols

In [None]:
# Categorical branch: one-hot encode. With handle_unknown="ignore", categories
# not seen during fit do not raise at transform time.
onehot_step = ("onehot", OneHotEncoder(handle_unknown="ignore"))
cat_transformer = Pipeline(steps=[onehot_step])


In [None]:
# Numeric branch: standardise each column with a default StandardScaler.
scaler_step = ("scaler", StandardScaler())
num_transformer = Pipeline(steps=[scaler_step])

In [None]:
# Route the numeric columns through the scaler and the categorical columns
# through the one-hot encoder; no other ColumnTransformer option is overridden.
column_branches = [
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Fit the preprocessor on the training features, then apply the already-fitted
# transform to the validation features (nothing is fitted on X_val).
# NOTE(review): the original note said the output is a NumPy array; because the
# one-hot step emits sparse output, ColumnTransformer may return a SciPy sparse
# matrix instead - confirm with type(X_train).
# NOTE(review): X_train / X_val are rebound here, so re-running this cell on the
# already-transformed data is not expected to work; re-run from the split cell.
X_train=preprocessor.fit_transform(X_train, y_train)
X_val=preprocessor.transform(X_val)

In [None]:
# Persist the fitted preprocessor so the same column transformation can be
# reloaded later. `dump` comes from joblib, imported in the notebook's import cell.
# The path is defined once so the saved file and the log line cannot drift apart.
preprocessor_path = "preprocessor_Final_huong13_Randomforest.dat"
dump(preprocessor, preprocessor_path)
print(f"Saved preprocessor to: {preprocessor_path}")

# Cân bằng dữ liệu: chỉ cân bằng dữ liệu huấn luyện, KHÔNG cân bằng dữ liệu test

In [None]:
# Resample the training set with SMOTETomek (seeded via `seed`); only
# X_train / y_train are passed in, so the validation set is left untouched.
# NOTE(review): X_train / y_train are rebound in place, so re-running this cell
# resamples data that has already been resampled.
smote_tomek = SMOTETomek(random_state=seed)

X_train, y_train = smote_tomek.fit_resample(X_train,y_train)

# Per-class sample counts after resampling, sorted by class label.
print(sorted(Counter(y_train).items()))

# DecisionTree

In [None]:

# Baseline: a single decision tree with default hyper-parameters (seeded).
model_DT = DecisionTreeClassifier(random_state=seed)
model_DT.fit(X_train, y_train)

# Accuracy on the (resampled) training data, printed to two decimals.
y_train_pred_DT = model_DT.predict(X_train)
train_accuracy_DT = accuracy_score(y_train, y_train_pred_DT)
print(f"Accuracy score on train data: {train_accuracy_DT:.2f}")

# Predictions on the validation set, scored in the next cell.
y_pred_DT = model_DT.predict(X_val)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the validation set.
print(classification_report(y_true=y_val, y_pred=y_pred_DT))

# RF

In [None]:
# Author's note: min_impurity_decrease is a growth-stopping threshold (early
# stopping to reduce overfitting). It is not set here, so the default applies.
model_RF = RandomForestClassifier(random_state=seed)
model_RF.fit(X_train, y_train)

# Accuracy on the (resampled) training data, printed to two decimals.
y_train_pred_RF = model_RF.predict(X_train)
train_accuracy_RF = accuracy_score(y_train, y_train_pred_RF)
print(f"Accuracy score on train data: {train_accuracy_RF:.2f}")

# Predictions on the validation set, scored in the following cells.
y_pred_RF = model_RF.predict(X_val)


In [None]:
# Per-class precision / recall / F1 of the random forest on the validation set.
print(classification_report(y_true=y_val, y_pred=y_pred_RF))

In [None]:
# Validation accuracy of the random forest (printed unrounded).
val_accuracy_RF = accuracy_score(y_val, y_pred_RF)
print(f"Accuracy score on validation data: {val_accuracy_RF}")

In [None]:
# Random forest F1 on the validation set, one value per class (no averaging).
f1_score(y_true=y_val, y_pred=y_pred_RF, average=None)

In [None]:
# Random forest macro-averaged F1 on the validation set.
f1_score(y_true=y_val, y_pred=y_pred_RF, average='macro')

In [None]:
# Random forest micro-averaged F1 on the validation set.
f1_score(y_true=y_val, y_pred=y_pred_RF, average='micro')

# XGBoost

In [None]:

# Gradient-boosted trees with default hyper-parameters (seeded).
model_XGB = xgb.XGBClassifier(random_state=seed)
model_XGB.fit(X_train, y_train)

# Accuracy on the (resampled) training data, printed to two decimals.
y_train_pred_XGB = model_XGB.predict(X_train)
train_accuracy_XGB = accuracy_score(y_train, y_train_pred_XGB)
print(f"Accuracy score on train data: {train_accuracy_XGB:.2f}")

# Predictions on the validation set, scored in the next cell.
y_pred_XGB = model_XGB.predict(X_val)


In [None]:
# Per-class precision / recall / F1 of XGBoost on the validation set.
print(classification_report(y_true=y_val, y_pred=y_pred_XGB))

# Feature selection

In [None]:
# Candidate selection thresholds: the random forest's feature importances,
# sorted ascending.
# (A disabled earlier variant kept only the unique importances >= their mean.)
importances_RF = model_RF.feature_importances_
thresholds = np.sort(importances_RF)
thresholds

In [None]:
# Discard the zero importances and keep the remaining ones in ascending order.
nonzero_thresholds = np.sort(thresholds[thresholds != 0])
nonzero_thresholds

In [None]:
# Smallest non-zero importance (first element of the ascending array).
nonzero_thresholds[0]

In [None]:

# Feature selector built on the already-fitted random forest (prefit=True):
# features whose importance is below the smallest non-zero importance are dropped.
selection = SelectFromModel(model_RF, threshold=nonzero_thresholds[0], prefit=True)

# Apply the same column mask to both partitions.
select_X_train = selection.transform(X_train)
select_X_val = selection.transform(X_val)

# Retrain a fresh random forest on the reduced feature set and predict on validation.
selection_model = RandomForestClassifier(random_state=seed)
selection_model.fit(select_X_train, y_train)
select_y_pred = selection_model.predict(select_X_val)


In [None]:
# Per-class precision / recall / F1 of the reduced-feature forest on the validation set.
print(classification_report(y_true=y_val, y_pred=select_y_pred))

In [None]:
# Persist the full-feature random forest (model_RF, not selection_model).
model_path = "M_Final_huong13_Randomforest.dat"
dump(model_RF, model_path)
print(f"Saved model to: {model_path}")

In [None]:
# # RandomOverSample
# ros = RandomOverSampler(random_state=seed)

# X_train, y_train = ros.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

In [None]:
# # RandomUnderSampler
# rus = RandomUnderSampler(random_state=seed)

# X_train, y_train = rus.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))


In [None]:
# svmsmote = SVMSMOTE(random_state=seed)

# X_train, y_train = svmsmote.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

In [None]:

# blsmote = BorderlineSMOTE(random_state=seed)

# X_train, y_train = blsmote.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

In [None]:
# # smote phải là số đã
# smote = SMOTE(random_state=seed)

# X_train, y_train = smote.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

In [None]:

# kmsmote = KMeansSMOTE(random_state=seed)

# X_train, y_train = kmsmote.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

In [None]:

# adasyn = ADASYN(random_state=seed)

# X_train, y_train = adasyn.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))


In [None]:

# cc = ClusterCentroids(random_state=seed)

# X_train, y_train = cc.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

In [None]:
# smote_enn = SMOTEENN(random_state=seed)

# X_train, y_train = smote_enn.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))

# Thuật toán có khả năng cân bằng trước khi mã hóa

In [None]:
# categorical_var = np.where(X_train.dtypes != float)[0]  # np.float was removed in NumPy 1.24; builtin float is equivalent
# # smote phải là số đã
# smotenc = SMOTENC(categorical_var,random_state=seed)

# X_train, y_train = smotenc.fit_resample(X_train,y_train)

# print(sorted(Counter(y_train).items()))