In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import TargetEncoder
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train_path = "cityu10c_train_dataset.csv"
test_path = "cityu10c_test_dataset.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

label_col = "LoanApproved"

columns_to_drop = ["ID", "ApplicationDate"]
df_train.drop(columns=columns_to_drop, errors="ignore", inplace=True)
df_test.drop(columns=columns_to_drop, errors="ignore", inplace=True)

categorical_features = df_train.select_dtypes(include=["object"]).columns.tolist()
numerical_features = df_train.select_dtypes(include=["int64", "float64"]).columns.drop(label_col, errors="ignore").tolist()

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # Điền giá trị trung vị
    ("scaler", StandardScaler())  # Chuẩn hóa dữ liệu
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Điền giá trị phổ biến nhất
    ("encoder", TargetEncoder())  # Mã hóa danh mục bằng Target Encoding
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="drop"
)

pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),  
    ("smote", SMOTE(random_state=42)),  
    ("classifier", XGBClassifier(colsample_bytree=0.8, learning_rate=0.1, max_depth=3,
                                 n_estimators=500, subsample=0.8, random_state=42))  # Mô hình XGBoost
])

X_train, X_val, y_train, y_val = train_test_split(df_train.drop(columns=[label_col]), df_train[label_col], test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("XGBoost Accuracy:", accuracy)
import pickle

# Lưu pipeline vào file
with open("xgboost_pipeline.pkl", "wb") as file:
    pickle.dump(pipeline, file)
print("Lưu xong")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

XGBoost Accuracy: 0.956875
Lưu xong


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [1]:
import imblearn
import sklearn
print("scikit-learn version:", sklearn.__version__)
print("imbalanced-learn version:", imblearn.__version__)

scikit-learn version: 1.3.0
imbalanced-learn version: 0.11.0


In [5]:
import category_encoders

print(category_encoders.__version__)


2.6.1
