In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import read_csv
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import lightgbm as lgb
from sklearn import svm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTEN
import re

In [None]:
# Read the whitespace-delimited data file. It carries no header row, so
# pandas assigns default integer column labels.
df = pd.read_csv(
    "german.data-numeric",
    header=None,   # the file has no header line
    sep=r"\s+",    # regex separator: one or more whitespace characters
)

# Preview the first rows, then the (rows, columns) shape.
print(df.head(), df.shape, sep="\n")

In [None]:
# Label the first 24 columns "1".."24" and the last column 'target'.
columns = list(map(str, range(1, 25)))
columns.append('target')
df.columns = columns
print(df.head())

In [None]:
# Recode the label column from {1, 2} to {0, 1} (1 -> 0, 2 -> 1).
# The recode is guarded so that re-running this cell is a no-op: once it has
# been applied no 2s remain, whereas an unguarded second pass would map the
# new 1s to 0 and silently collapse every label to 0.
# Assumes the raw labels contain at least one 2 -- TODO confirm against the data.
if df["target"].eq(2).any():
    df["target"] = df["target"].replace({1: 0, 2: 1})

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()

In [None]:
# Keep an independent snapshot of `df` as it stands at this point, so later
# changes to `df` cannot affect it.
original_df = df.copy(deep=True)

In [None]:
# Separate the label column from the feature columns; `df` itself is not modified.
y = df['target']
x = df.drop(columns='target')

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardise the features: the scaling statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# Benchmark LazyClassifier's built-in set of models on the current split.
# fit() returns two objects; the first one is printed below.
clf = LazyClassifier(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
print(models)

In [None]:
# Pairwise correlation between all columns of df (assumes every column is numeric).
corr_matrix = df.corr()

# For each column, report the other column it is most strongly correlated
# with. The winner is chosen by absolute value; the printed value keeps its sign.
for col, col_corr in corr_matrix.items():
    others = col_corr.drop(col)  # exclude the column's correlation with itself
    max_corr_feature = others.abs().idxmax()
    max_corr_value = others[max_corr_feature]
    print(f"Feature '{col}' tương quan cao nhất với '{max_corr_feature}' = {max_corr_value:.3f}")

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifierCV, RidgeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# LightGBM only applies row subsampling when the bagging frequency is > 0
# (the default subsample_freq=0 disables it). Without subsample_freq=1 the
# "subsample" values searched below would all train identical models.
# With the default subsample=1.0 this setting changes nothing, so fitting
# `model` directly still gives the default configuration.
model = LGBMClassifier(random_state=42, subsample_freq=1)

# Candidate hyper-parameter values for the search.
param_grid = {
    "num_leaves": [15, 31, 63, 127],          # number of leaves per tree
    "max_depth": [-1, 5, 10, 20, 30],        # maximum tree depth (-1 means no limit)
    "learning_rate": [0.01, 0.05, 0.1, 0.2], # learning rate
    "n_estimators": [50, 100, 200, 500],     # number of boosted trees
    "subsample": [0.6, 0.8, 1.0],            # fraction of rows used for each tree (bagging fraction)
    "colsample_bytree": [0.6, 0.8, 1.0],     # fraction of columns used for each tree (feature fraction)
    "reg_alpha": [0, 0.01, 0.1, 1],          # L1 regularization
    "reg_lambda": [0, 0.01, 0.1, 1],         # L2 regularization
    "min_child_samples": [5, 10, 20, 50],    # minimum number of samples in a leaf
    "boosting_type": ["gbdt", "dart"],       # boosting type: gbdt (standard), dart (dropout boosting)
    "objective": ["binary"],                 # binary classification objective
}

In [None]:
# The parameter grid defined for this search spans several hundred thousand
# combinations, so an exhaustive GridSearchCV (times 4 folds) is not
# practical. Sample a fixed number of candidates from the same value lists
# instead; the seed makes the sampled candidates reproducible.
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,          # search budget: number of sampled parameter settings
    scoring='accuracy',
    cv=4,
    random_state=42,
    verbose=1,
)
grid.fit(x_train, y_train)

# predict / predict_proba go through the best estimator, refit on all of x_train.
y_predict = grid.predict(x_test)
y_proba = grid.predict_proba(x_test)[:, 1]  # probability column for the second class

print(grid.best_params_)
print(grid.best_score_)   # mean cross-validated accuracy of the best candidate
print(classification_report(y_test, y_predict, digits = 4))
auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))

In [None]:
# Baseline for comparison: fit `model` itself, with the parameters it was
# constructed with, and score it on the held-out split.
y_predict = model.fit(x_train, y_train).predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]  # probability column for the second class
print(classification_report(y_test, y_predict, digits=4))

auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))

In [None]:
# Rebuild the split from the untouched x / y, because x_train / x_test were
# overwritten with scaled arrays earlier. Same settings and seed as before,
# so the same rows land in each part.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Class counts in the training labels before resampling.
counts_before = Counter(y_train)
print("Trước khi oversampling:", counts_before)

# Randomly oversample the training split only; the test split is left untouched.
# NOTE(review): the resampled rows stay in x_train, so any later
# cross-validation on it can see copies of the same row in both the fitting
# and the validation part of a fold unless the folds are grouped.
ros = RandomOverSampler(random_state=42)
x_train, y_train = ros.fit_resample(x_train, y_train)

# Class counts after resampling.
counts_after = Counter(y_train)
print("Sau khi oversampling:", counts_after)

In [None]:
# Standardise the features again: statistics come from the current training
# split only and are then applied unchanged to the test split.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# LightGBM only applies row subsampling when the bagging frequency is > 0
# (the default subsample_freq=0 disables it). Without subsample_freq=1 the
# "subsample" values searched below would all train identical models.
# With the default subsample=1.0 this setting changes nothing, so fitting
# `model` directly still gives the default configuration.
model = LGBMClassifier(random_state=42, subsample_freq=1)

# Reduced candidate grid for the exhaustive search.
param_grid = {
    "n_estimators": [100, 200, 500],      # number of boosted trees
    "learning_rate": [0.01, 0.05, 0.1],   # learning rate
    "num_leaves": [31, 63],               # number of leaves per tree
    "max_depth": [-1, 5, 10],             # maximum tree depth (-1 means no limit)
    "min_child_samples": [5, 10],         # minimum number of samples in a leaf
    "subsample": [0.7, 0.8, 1.0],         # fraction of rows used for each tree
    "colsample_bytree": [0.7, 0.8],       # fraction of columns used for each tree
    "reg_alpha": [0, 0.1],                # L1 regularization
    "reg_lambda": [0, 0.1]                # L2 regularization
}

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# x_train contains exact duplicate rows created by random oversampling. With
# plain k-fold splitting, copies of one row can land in both the fitting and
# the validation part of a fold, which inflates best_score_. Give identical
# rows the same group id and split by group so every copy stays in one fold.
_, row_groups = np.unique(np.asarray(x_train), axis=0, return_inverse=True)
row_groups = row_groups.reshape(-1)  # make sure the group vector is 1-D

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=StratifiedGroupKFold(n_splits=4),  # still 4 stratified folds, now group-aware
    verbose=1,
)
grid.fit(x_train, y_train, groups=row_groups)

# predict / predict_proba go through the best estimator, refit on all of x_train.
y_predict = grid.predict(x_test)
y_proba = grid.predict_proba(x_test)[:, 1]  # probability column for the second class

print(grid.best_params_)
print(grid.best_score_)   # mean cross-validated accuracy of the best candidate
print(classification_report(y_test, y_predict, digits = 4))
auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))

In [None]:
# Baseline for comparison: fit `model` itself, with the parameters it was
# constructed with, on the current training data and score it on the test split.
y_predict = model.fit(x_train, y_train).predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]  # probability column for the second class
print(classification_report(y_test, y_predict, digits=4))

auc = roc_auc_score(y_test, y_proba)
print("AUC =", round(auc, 4))