In [1]:
!pip install imbalanced-learn lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 429.7 kB/s eta 0:00:03
   -------------- ------------------------- 0.5/1.5 MB 429.7 kB/s eta 0:00:03
   --------------------- ------------------ 0.8/1.5 MB 441.3 kB/s eta 0:00:02
   --------------------- ------------------ 0.8/1.5 MB 441.3 kB/s eta 0:00:02
   --------------------- ------------------ 0.8/1.5 MB 44

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import lightgbm as lgb



In [4]:
from sklearn.datasets import make_classification

# Synthetic binary classification problem: 1000 rows, 20 features
# (15 informative + 5 redundant), class weights 0.9 / 0.1.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.9, 0.1],
    random_state=42,
)

# Wrap the feature matrix in a DataFrame and append the labels as 'target'.
df = pd.DataFrame(
    X, columns=[f'feature_{idx}' for idx in range(X.shape[1])]
).assign(target=y)

# Blank out the first ten entries of the first column to simulate missing data.
df.iloc[:10, 0] = np.nan

# Mean-impute the gaps. The column means are taken over every row of the frame.
df = df.fillna(df.mean())

# From here on X is the feature DataFrame and y the label Series.
X = df.drop(columns='target')
y = df['target']


In [7]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
# RBF-kernel SVM trained on the standardized features. fit() returns the
# estimator itself, so construction and training are chained.
svm_model = SVC(kernel='rbf', probability=True, random_state=42).fit(
    X_train_scaled, y_train
)

# Hard class predictions for the (standardized) test features.
svm_preds = svm_model.predict(X_test_scaled)


In [13]:
# LightGBM classifier with default hyper-parameters, trained on the unscaled
# training features. fit() returns the estimator, so it is chained here.
lgb_model = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the unscaled test features.
lgb_preds = lgb_model.predict(X_test)


[LightGBM] [Info] Number of positive: 716, number of negative: 720
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 1436, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498607 -> initscore=-0.005571
[LightGBM] [Info] Start training from score -0.005571


In [15]:
# Per-class precision / recall / F1 for each model against the test labels.
# sep="\n" puts the heading and the report on separate lines, matching two
# consecutive print() calls.
print("SVM Classification Report:", classification_report(y_test, svm_preds), sep="\n")
print("LightGBM Classification Report:", classification_report(y_test, lgb_preds), sep="\n")


SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       178
           1       0.96      1.00      0.98       182

    accuracy                           0.98       360
   macro avg       0.98      0.98      0.98       360
weighted avg       0.98      0.98      0.98       360

LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       178
           1       0.97      0.99      0.98       182

    accuracy                           0.98       360
   macro avg       0.98      0.98      0.98       360
weighted avg       0.98      0.98      0.98       360

