In [1]:
import os

import mlflow

# The tracking server lives on an EC2 public hostname; allow it to be
# overridden through the MLFLOW_TRACKING_URI environment variable so the
# notebook does not need editing when the server address changes. When the
# variable is unset the original server address is used.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "http://ec2-54-234-131-60.compute-1.amazonaws.com:5000/",
    )
)

In [2]:
# Every run started later in this notebook is filed under this experiment.
EXPERIMENT_NAME="Exp-5 Lightgbm with HP Tuning"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://naman-mlflow-bucket/631830083224454248', creation_time=1751182714040, experiment_id='631830083224454248', last_update_time=1751182714040, lifecycle_stage='active', name='Exp-5 Lightgbm with HP Tuning', tags={}>

In [3]:
import pandas as pd

# Read the raw CSV, then discard rows whose `clean_comment` is missing.
reddit_csv_path='F:\\new_downloads\\archive - 2025-06-28T120017.952\\Reddit_Data.csv'
df=pd.read_csv(reddit_csv_path)
df=df.dropna(subset=['clean_comment'])
df.shape

(37149, 2)

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from imblearn.over_sampling import SMOTE
import mlflow 
import mlflow.sklearn
import optuna
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt

In [5]:
# Relabel -1 as 2 (0 and 1 are kept); any value outside the mapping becomes
# NaN and its row is dropped.
# NOTE: this cell is not idempotent - run a second time, the already
# remapped label 2 is no longer a key of the mapping, turns into NaN and
# those rows are dropped. Re-run the load cell first if you need to repeat it.
label_mapping={-1:2,0:0,1:1}
df=(
  df
  .assign(category=df['category'].map(label_mapping))
  .dropna(subset=['category'])
)



In [6]:
ngram_range=(1,3)
max_features=1000

y=df['category']

# Split the raw text BEFORE any fitting or resampling so the test set stays
# untouched: it then contains only real comments, in their original class
# proportions (stratify keeps those proportions equal in both partitions).
comments_train,comments_test,y_train,y_test=train_test_split(
  df['clean_comment'],y,test_size=0.2,random_state=42,stratify=y
)

# Learn the vocabulary / IDF weights from the training comments only, then
# apply the fitted vectorizer to the held-out comments.
vectorizer=TfidfVectorizer(ngram_range=ngram_range,max_features=max_features)
X_train=vectorizer.fit_transform(comments_train)
X_test=vectorizer.transform(comments_test)

# Oversample the training partition only. Resampling before the split would
# place synthetic points, interpolated from training rows, in the test set
# and inflate every reported metric.
smote=SMOTE(random_state=42)
X_train,y_train=smote.fit_resample(X_train,y_train)


In [9]:
def log_mlflow(model_name,model,X_train,X_test,y_train,y_test,params,trial_number):
  """Fit ``model`` inside a new MLflow run, log its results and return accuracy.

  Parameters
  ----------
  model_name : str
      Label used in the run name, the ``algo_name`` param and the model
      artifact path.
  model : estimator
      Object exposing ``fit(X, y)`` and ``predict(X)``.
  X_train, y_train
      Features / labels passed to ``model.fit``.
  X_test, y_test
      Features / labels every logged metric is computed on.
  params : dict or None
      Hyperparameters describing ``model``; each entry is logged as an
      MLflow param. Nothing is logged when empty or ``None``.
  trial_number : int or str
      Identifier of the trial that produced ``model`` (callers also pass a
      label such as ``"Best"``); stored as the ``trial_number`` run tag.

  Returns
  -------
  float
      Accuracy of ``model`` on ``(X_test, y_test)``.
  """
  with mlflow.start_run():
    mlflow.set_tag("mlflow.runName",f"{model_name}_SMOTE_lightgbm_TRIGRAMS")
    mlflow.set_tag("experiment_type","algorithm comparison")
    # All runs share one run name, so the trial tag is what tells them apart.
    mlflow.set_tag("trial_number",str(trial_number))
    mlflow.log_param("algo_name",model_name)
    for param_name,param_value in (params or {}).items():
      mlflow.log_param(param_name,param_value)

    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)

    accuracy=accuracy_score(y_test,y_pred)
    mlflow.log_metric("accuracy",accuracy)

    classification_rep=classification_report(y_test,y_pred,output_dict=True)
    for label,metrics in classification_rep.items():
      # Only dict-valued entries are expanded; scalar entries are skipped
      # (accuracy is already logged above).
      if isinstance(metrics,dict):
        for metric,value in metrics.items():
          mlflow.log_metric(f"{label}_{metric}",value)

    mlflow.sklearn.log_model(model,f"{model_name}_model")

  return accuracy


    

In [None]:
def objective_lightgbm(trial):
  """Optuna objective: sample LightGBM hyperparameters, train, log, score.

  Parameters
  ----------
  trial : optuna.trial.Trial
      Trial used to draw one hyperparameter configuration.

  Returns
  -------
  The value returned by ``log_mlflow`` for the trained model, which the
  study maximises.
  """
  # One dict is the single source of truth: it is both passed to the
  # estimator and handed to log_mlflow, so the two cannot drift apart.
  params={
    'n_estimators':trial.suggest_int('n_estimators',100,1000),
    'learning_rate':trial.suggest_float('learning_rate',1e-4,1e-1,log=True),
    'max_depth':trial.suggest_int('max_depth',3,15),
    'num_leaves':trial.suggest_int('num_leaves',20,150),
    'min_child_samples':trial.suggest_int('min_child_samples',10,100),
    'colsample_bytree':trial.suggest_float('colsample_bytree',0.5,1.0),
    'subsample':trial.suggest_float('subsample',0.5,1.0),
    # LightGBM only applies `subsample` (row bagging) when `subsample_freq`
    # is > 0; with the default of 0 the sampled `subsample` value is ignored.
    'subsample_freq':trial.suggest_int('subsample_freq',1,7),
    'reg_alpha':trial.suggest_float('reg_alpha',1e-4,10.0,log=True),
    'reg_lambda':trial.suggest_float('reg_lambda',1e-4,10.0,log=True),
  }

  model=LGBMClassifier(**params,random_state=42)

  accuracy=log_mlflow("LightGBM",model,X_train,X_test,y_train,y_test,params,trial.number)
  return accuracy

In [None]:
def run_optuna_experiment(n_trials=10):
  """Run the LightGBM hyperparameter search and log the best configuration.

  Creates an in-memory Optuna study that maximises the value returned by
  ``objective_lightgbm``, rebuilds a classifier from the best parameters,
  logs it through ``log_mlflow`` under the trial label ``"Best"`` and shows
  the parameter-importance and optimisation-history plots.

  Parameters
  ----------
  n_trials : int, default 10
      Number of Optuna trials to run.
  """
  study=optuna.create_study(direction='maximize')
  study.optimize(objective_lightgbm,n_trials=n_trials)

  # Rebuild the estimator from exactly the parameters the study sampled, so
  # the final model cannot drift from the search space.
  best_params=study.best_params
  best_model=LGBMClassifier(**best_params,random_state=42)

  log_mlflow("LightGBM",best_model,X_train,X_test,y_train,y_test,best_params,"Best")

  # The Optuna function is `plot_param_importances` (plural); the singular
  # spelling does not exist and raises AttributeError.
  optuna.visualization.plot_param_importances(study).show()
  optuna.visualization.plot_optimization_history(study).show()


In [13]:
# Start the hyperparameter search; progress and any trial failures are
# reported in this cell's output.
run_optuna_experiment()

[I 2025-06-29 14:33:10,927] A new study created in memory with name: no-name-28d32baa-5837-4a88-a166-53dbff564ebc
[W 2025-06-29 14:33:10,933] Trial 0 failed with parameters: {'n_estimators': 174} because of the following error: ValueError('The `low` value must be equal to or greater than 1 for a log distribution (low=0.0001, high=0.1).').
Traceback (most recent call last):
  File "c:\Users\hi\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\hi\AppData\Local\Temp\ipykernel_4812\2716921195.py", line 3, in objective_lightgbm
    learning_rate=trial.suggest_int('learning_rate',1e-4,1e-1,log=True)
  File "c:\Users\hi\anaconda3\lib\site-packages\optuna\_convert_positional_args.py", line 134, in converter_wrapper
    return func(**kwargs)  # type: ignore[call-arg]
  File "c:\Users\hi\anaconda3\lib\site-packages\optuna\trial\_trial.py", line 323, in suggest_int
    distribution = IntDistribution(low=low, high=hig

ValueError: The `low` value must be equal to or greater than 1 for a log distribution (low=0.0001, high=0.1).