# In [None]:
import pandas as pd
import mlflow
import mlflow.sklearn
from mlxtend.evaluate import bias_variance_decomp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Read the three per-class record sets (the first CSV column is used as the
# row index) and tag every row with its class in 'Degree_of_Plagiarism'.
low = pd.read_csv(
    '../../data/processed/to_ML_models/LowRecords.csv', index_col=0
).assign(Degree_of_Plagiarism='Low')
med = pd.read_csv(
    '../../data/processed/to_ML_models/MediumRecords.csv', index_col=0
).assign(Degree_of_Plagiarism='Medium')
high = pd.read_csv(
    '../../data/processed/to_ML_models/HighRecords.csv', index_col=0
).assign(Degree_of_Plagiarism='High')

# Stack the labelled frames row-wise into one modelling table. The original
# per-file index values are kept as-is (no re-indexing is requested).
labelled_frames = [low, med, high]
Final_data = pd.concat(labelled_frames)
def bias_variance_error(model,X_train,y_train,X_test, y_test):
    """Estimate the average bias and variance of ``model`` under 0-1 loss.

    Thin wrapper around ``mlxtend.evaluate.bias_variance_decomp`` called with
    ``loss='0-1_loss'`` and ``random_seed=123``.

    Args:
        model: Estimator exposing ``fit``/``predict``.
            NOTE(review): the decomposition presumably refits this object on
            bootstrap samples, so do not rely on its fitted state afterwards
            -- confirm against the mlxtend documentation.
        X_train: Training features (DataFrame or array-like).
        y_train: Training labels; DataFrame, Series or array-like. A
            one-column frame / ``(n, 1)`` array is flattened to 1-D.
        X_test: Test features (DataFrame or array-like).
        y_test: Test labels, same conventions as ``y_train``.

    Returns:
        list: ``[avg_bias, avg_variance]`` as returned by the decomposition.
    """
    import numpy as np  # local import: numpy is not imported at file level

    # np.asarray yields the same ndarray as ``.values`` for pandas objects but
    # also lets callers pass plain arrays or lists.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    # Targets must be 1-D: an (n, 1) column vector would broadcast against the
    # 1-D prediction vectors when the two are compared element-wise.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # The 0-1 loss decomposition works on integer class labels, so string
    # labels such as 'Low'/'Medium'/'High' are mapped to 0..k-1. The mapping
    # is built from train and test together so both share the same codes, and
    # np.unique sorts the classes, so already-integer labels keep their order.
    _, codes = np.unique(np.concatenate([y_train, y_test]), return_inverse=True)
    y_train_codes = codes[:y_train.size]
    y_test_codes = codes[y_train.size:]

    # The first returned value (average expected loss) is not used here.
    _, avg_bias, avg_variance = bias_variance_decomp(
        model, X_train, y_train_codes, X_test, y_test_codes,
        loss='0-1_loss',
        random_seed=123)
    return [avg_bias, avg_variance]

# Columns used as model inputs, and the column holding the class label.
feature_columns = ['Hyperbole', 'Text_similarity', 'Readability',
                   'Style_Consistency']
target_columns = ['Degree_of_Plagiarism']

# Both selections use a list of column names, so X and y are DataFrames
# (y is a one-column frame, not a Series).
X = Final_data[feature_columns]
y = Final_data[target_columns]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between executions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Every run started below is recorded under this MLflow experiment.
mlflow.set_experiment("Plagiarism_Detection_Evaluation")


# Grid over LogisticRegression hyperparameters. The outer run groups the
# sweep; each (C, max_iter) combination gets its own nested child run so its
# metrics are stored separately instead of piling onto the same metric keys
# of a single run, where the configurations could not be told apart.
with mlflow.start_run(run_name="LogisticRegression"):
    lr_param_grid = {
        'C': [0.1, 1, 10],
        'max_iter': [100, 500, 1000],
    }

    # scikit-learn expects 1-D targets; y_train / y_test are one-column
    # frames, which would otherwise trigger a DataConversionWarning on fit.
    y_train_1d = y_train.values.ravel()
    y_test_1d = y_test.values.ravel()

    for c_value in lr_param_grid['C']:
        for max_iter in lr_param_grid['max_iter']:
            child_name = f"LogisticRegression_C={c_value}_max_iter={max_iter}"
            with mlflow.start_run(run_name=child_name, nested=True):
                # Hyperparameters belong in params, not metrics.
                mlflow.log_params({"C": c_value, "max_iter": max_iter})

                lr_model = LogisticRegression(C=c_value, max_iter=max_iter)
                lr_model.fit(X_train, y_train_1d)
                y_pred_lr = lr_model.predict(X_test)

                # One call yields both values. The decomposition is seeded,
                # so a second call with the same arguments would only repeat
                # the same bootstrap fits for the same numbers.
                bias_error, variance_error = bias_variance_error(
                    lr_model, X_train, y_train, X_test, y_test
                )

                mlflow.log_metrics({
                    "accuracy": accuracy_score(y_test_1d, y_pred_lr),
                    "precision": precision_score(
                        y_test_1d, y_pred_lr, average="weighted"),
                    "recall": recall_score(
                        y_test_1d, y_pred_lr, average="weighted"),
                    "f1": f1_score(y_test_1d, y_pred_lr, average="weighted"),
                    # Kept as metrics as well so existing queries/dashboards
                    # that read these keys continue to work.
                    "C": c_value,
                    "Max_iterations": max_iter,
                    "Bias_Error": bias_error,
                    "Variance_Error": variance_error,
                })


# Train and evaluate a single RandomForest configuration in its own run.
with mlflow.start_run(run_name="RandomForest"):
    # Record the configuration so the run can be reproduced from MLflow alone.
    rf_params = {"n_estimators": 100, "random_state": 42}
    mlflow.log_params(rf_params)

    rf_model = RandomForestClassifier(**rf_params)
    # y_train is a one-column frame; flatten it to the 1-D shape scikit-learn
    # expects to avoid the column-vector DataConversionWarning.
    rf_model.fit(X_train, y_train.values.ravel())
    y_pred_rf = rf_model.predict(X_test)

    y_true_rf = y_test.values.ravel()
    # Weighted averaging: per-class scores weighted by class support.
    mlflow.log_metrics({
        "accuracy": accuracy_score(y_true_rf, y_pred_rf),
        "precision": precision_score(y_true_rf, y_pred_rf, average="weighted"),
        "recall": recall_score(y_true_rf, y_pred_rf, average="weighted"),
        "f1": f1_score(y_true_rf, y_pred_rf, average="weighted"),
    })


# Explicitly end whatever MLflow run is still active. The runs above are
# opened with `with mlflow.start_run(...)`, which closes them on exit, so this
# is presumably a safety net for interactive sessions -- confirm before
# removing.
mlflow.end_run()

print("Models trained and metrics logged successfully!")
