## Importing libraries and previewing the dataset

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn

In [5]:
# Read the parsed attack dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='dataset/attack_parsed_dataset.csv')

# Bare trailing expression so the notebook renders a preview of the frame.
df

Unnamed: 0,index,Text,ed_label_0,ed_label_1,oh_label
0,0,`- This is not ``creative``. Those are the di...,1.000000,0.000000,0
1,1,` :: the term ``standard model`` is itself le...,1.000000,0.000000,0
2,2,"True or false, the situation as of March 200...",1.000000,0.000000,0
3,3,"Next, maybe you could work on being less cond...",0.555556,0.444444,0
4,4,This page will need disambiguation.,1.000000,0.000000,0
...,...,...,...,...,...
115859,115859,` These sources don't exactly exude a sense ...,0.888889,0.111111,0
115860,115860,The Institute for Historical Review is a pee...,0.900000,0.100000,0
115861,115861,:The way you're trying to describe it in this...,1.000000,0.000000,0
115862,115862,== Warning == There is clearly a protection...,0.800000,0.200000,0


**MLflow**

In [23]:
# Select the MLflow experiment that runs started later in this notebook are recorded under.
mlflow.set_experiment(experiment_name="Cyberbullying Detection Experiment")

<Experiment: artifact_location='file:///C:/College/Matkul%20Kuliah%20dan%20Tugas/MLOPS/final_project_jupyter/mlruns/232707165053857446', creation_time=1731280035151, experiment_id='232707165053857446', last_update_time=1731280035151, lifecycle_stage='active', name='Cyberbullying Detection Experiment', tags={}>

In [24]:
# Experiment configuration: one source of truth for every value that is both
# used to build the pipeline and logged to MLflow, so the recorded parameters
# cannot drift away from the ones that actually produced the metrics.
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_FEATURES = 5000
STOP_WORDS = 'english'

# Select relevant columns
X = df['Text']  # Feature: text
y = df['oh_label']  # Target: oh_label (1 for attack, 0 for not attack)

# Split the data into training and test sets (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Convert text data into TF-IDF features.
# The vectorizer is fitted on the training split only and then applied to the
# test split, so no test-set vocabulary/statistics leak into training.
tfidf = TfidfVectorizer(stop_words=STOP_WORDS, max_features=MAX_FEATURES)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

with mlflow.start_run():
    # Define and train a simple Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)

    # Make predictions on the held-out split
    y_pred = model.predict(X_test_tfidf)

    # Calculate metrics (precision/recall/F1 use scikit-learn's default
    # positive label, i.e. class 1)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log parameters: taken from the constants above rather than re-typed
    # literals; random_state and stop_words are included because they also
    # determine the reported numbers.
    mlflow.log_params({
        "model": "LogisticRegression",
        "max_features": MAX_FEATURES,
        "test_size": TEST_SIZE,
        "random_state": RANDOM_STATE,
        "stop_words": STOP_WORDS,
    })

    # Log metrics
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    })

    # Log the model
    mlflow.sklearn.log_model(model, "logistic_regression_model")
    # log the vectorizer to implement it in streamlit as well
    mlflow.sklearn.log_model(tfidf, "vectorizer")

    # Print the classification report for inspection
    print(f'Accuracy: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))



Accuracy: 0.9446
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     20465
           1       0.89      0.60      0.72      2708

    accuracy                           0.94     23173
   macro avg       0.92      0.80      0.84     23173
weighted avg       0.94      0.94      0.94     23173



In [17]:
# Fit a default Logistic Regression on the TF-IDF training matrix.
# NOTE(review): this repeats the training already performed inside the
# `mlflow.start_run()` cell, contains no MLflow logging calls, and rebinds
# `model` and `y_pred`. Its execution count is also lower than that cell's,
# so confirm whether it is still needed.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out TF-IDF matrix
y_pred = model.predict(X_test_tfidf)