In [1]:
!pip install mlflow



In [5]:
# Smoke test for the MLflow tracking server: open a run and log one dummy
# parameter and one dummy metric, so a visible run confirms connectivity.

import mlflow

TRACKING_URI = "http://ec2-54-86-42-121.compute-1.amazonaws.com:5000/"
mlflow.set_tracking_uri(TRACKING_URI)

# The context manager ends the run when the block exits.
with mlflow.start_run():
    mlflow.log_param("param1", 15)
    mlflow.log_metric("metric1", 0.89)

2025/12/06 21:41:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run clumsy-eel-835 at: http://ec2-54-86-42-121.compute-1.amazonaws.com:5000/#/experiments/0/runs/811c39ebd0284a1eb355097df6de06c9.
2025/12/06 21:41:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://ec2-54-86-42-121.compute-1.amazonaws.com:5000/#/experiments/0.
2025/12/06 21:41:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://ec2-54-86-42-121.compute-1.amazonaws.com:5000/#/experiments/0.


In [6]:
# Baseline model
import numpy as np
import pandas as pd


In [7]:
# Read the Reddit comments CSV straight from GitHub and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'
df = pd.read_csv(DATA_URL)
df.head()


Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [8]:
# Discard every row that has a missing value in any column.
df = df.dropna()


In [9]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [10]:
# Remove rows whose comment is empty once surrounding whitespace is stripped.
df = df.loc[df['clean_comment'].str.strip() != '']

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
# Fetch the NLTK data packages for the names imported above:
# "stopwords" for nltk.corpus.stopwords and "wordnet" for WordNetLemmatizer.
nltk.download("stopwords")
# Last expression of the cell, so its return value is what the cell displays.
nltk.download("wordnet")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Preprocessing

# Words that are never removed as stopwords
# (presumably because negation/contrast words matter for sentiment — confirm).
_KEPT_STOPWORDS = frozenset({"not", "but", "however", "no", "yet"})

# Lazily filled by _text_resources(); re-running this cell resets it.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return ``(stop_words, lemmatizer)``, building both only on first use."""
    if not _TEXT_RESOURCES:
        # Build into locals first so a failure never leaves a half-filled cache.
        stop_words = frozenset(stopwords.words("english")) - _KEPT_STOPWORDS
        lemmatizer = WordNetLemmatizer()
        _TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_comment(comment):
    """Normalise a single comment string.

    Steps, in order:
      1. lower-case and strip surrounding whitespace;
      2. replace newlines with spaces;
      3. delete every character that is not a letter, digit, whitespace
         or one of ``! ? . ,``;
      4. drop English stopwords, except those in ``_KEPT_STOPWORDS``;
      5. lemmatise each remaining whitespace-separated token.

    The stopword set and the lemmatizer are created once and reused across
    calls instead of being rebuilt for every comment.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none remain).

    Raises
    ------
    AttributeError
        If ``comment`` is not a string (e.g. NaN), since ``.lower()`` is
        called on it directly.
    """
    comment = comment.lower().strip()
    comment = re.sub(r"\n", " ", comment)
    comment = re.sub(r"[^A-Za-z0-9\s!?.,]", "", comment)

    stop_words, lemmatizer = _text_resources()
    # Punctuation stays attached to its token, so "the," is not matched as "the".
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

In [14]:
# Clean every comment element-wise. This overwrites the column, so re-running
# the cell feeds already-cleaned text through preprocess_comment again.
df['clean_comment'] = df['clean_comment'].map(preprocess_comment)

In [15]:
# Preview the first rows after preprocessing of 'clean_comment'.
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [16]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [17]:
# Vectorize comments
# Token-count features; max_features caps the vocabulary size.
MAX_FEATURES = 10000
vectorizer = CountVectorizer(max_features=MAX_FEATURES)

In [18]:
# Fit the vocabulary on the cleaned comments and count tokens per comment.
# NOTE(review): the vocabulary is fitted on the whole frame; if a train/test
# split follows, fitting on the training part only would avoid leakage.
counts = vectorizer.fit_transform(df['clean_comment'])
# NOTE(review): .toarray() materialises the full dense matrix, which is large
# at this shape; check whether downstream code could take `counts` as-is.
X = counts.toarray()
# Target labels, taken from the same frame so they share its row order.
y = df['category']

In [19]:
# Display the dense feature array (NumPy abbreviates large arrays with "...").
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
# (rows, columns): one row per comment, one column per vocabulary term.
X.shape

(36793, 10000)

In [21]:
# Display the 'category' target Series; it keeps the DataFrame's index.
y

0        1
1        1
2       -1
3        0
4        1
        ..
37244    0
37245    1
37246    0
37247    1
37248    0
Name: category, Length: 36793, dtype: int64

In [22]:
# Number of labels; should equal the number of rows in X.
y.shape

(36793,)

In [23]:
# MLflow tracking server
# Direct subsequent MLflow calls in this kernel to the tracking server below.
MLFLOW_TRACKING_URI = "http://ec2-54-86-42-121.compute-1.amazonaws.com:5000/"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [24]:
# Experiment with baseline model
# Select the experiment by name; per the log output, MLflow creates it when it
# does not exist yet. Kept as the last expression so the cell displays the result.
EXPERIMENT_NAME = "RF Baseline"
mlflow.set_experiment(EXPERIMENT_NAME)

2025/12/06 23:01:46 INFO mlflow.tracking.fluent: Experiment with name 'RF Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-s3-bucket-25/337114401848294445', creation_time=1765042306177, experiment_id='337114401848294445', last_update_time=1765042306177, lifecycle_stage='active', name='RF Baseline', tags={}>