In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("dataset.csv")

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [7]:
x = df['text']
y = df['label']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
tfid = TfidfVectorizer(ngram_range=(1,1), max_features=8000)

In [10]:
x_train_tfid = tfid.fit_transform(x_train)
x_test_tfid = tfid.fit_transform(x_test)

In [11]:
lgbm = LGBMClassifier(
    class_weight="balanced",
    reg_alpha=0.1,
    reg_lambda=0.2,
    n_estimators=367,
    max_depth=20
)

log = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs', multi_class='auto')

knn = KNeighborsClassifier(n_neighbors=5)

stacking = StackingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('log', log),
        ('nb', MultinomialNB())
    ],
    final_estimator=knn,
    cv=5
)

stacking.fit(x_train_tfid, y_train)

y_pred = stacking.predict(x_test_tfid)

print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 15857, number of negative: 15802
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.388163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 554033
[LightGBM] [Info] Number of data points in the train set: 31659, number of used features: 7970
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 12686, number of negative: 12641
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.396029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 478691
[LightGBM] [Info] Number of data points in the train set: 25327, number of used features: 7910
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 12686, number of negative: 12641
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.309199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475899
[LightGBM] [Info] Number of data points in the train set: 25327, number of used features: 7913
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 12685, number of negative: 12642
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.272778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 475926
[LightGBM] [Info] Number of data points in the train set: 25327, number of used features: 7909
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 12685, number of negative: 12642
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.315619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 477578
[LightGBM] [Info] Number of data points in the train set: 25327, number of used features: 7910
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 12686, number of negative: 12642
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.345372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 477984
[LightGBM] [Info] Number of data points in the train set: 25328, number of used features: 7919
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




              precision    recall  f1-score   support

           0       0.52      0.41      0.46      3950
           1       0.51      0.61      0.56      3965

    accuracy                           0.51      7915
   macro avg       0.51      0.51      0.51      7915
weighted avg       0.51      0.51      0.51      7915





In [12]:
import mlflow 
import mlflow.sklearn

In [13]:
mlflow.set_tracking_uri("http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/")

In [14]:
mlflow.set_experiment("Stacking")

<Experiment: artifact_location='s3://my-mlflow-bucket-123/166160491319823637', creation_time=1755854632886, experiment_id='166160491319823637', last_update_time=1755854632886, lifecycle_stage='active', name='Stacking', tags={}>

In [16]:
with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "Stacking_Models")
    mlflow.set_tag("experiment_type", "stacking")

    mlflow.log_param("estimator1", "LGBMClassifier")
    mlflow.log_param("estimator2", "LogisticRegression")
    mlflow.log_param("estimator3", "MultinomialNB")
    mlflow.log_param("estimator4", "KNearestNeighbour")

    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

    class_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in class_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    mlflow.sklearn.log_model(stacking, "StackingClassifier")



🏃 View run Stacking_Models at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/166160491319823637/runs/d2c8a45ca9ee4b5ca76f491f76fd365d
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/166160491319823637
