In [3]:
# Import the required libraries
import os
import re

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split,cross_val_predict,StratifiedKFold

# Run the baseline model

In [4]:
# Load the Reddit comments dataset straight from the project's GitHub repository
# and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [5]:
# Discard every row that contains a missing value.
df = df.dropna()

In [6]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [7]:
# Remove rows whose comment is empty once surrounding whitespace is stripped.
df = df[df['clean_comment'].str.strip() != '']

In [8]:
# Ensure the NLTK data used by the preprocessing step is available locally:
# 'stopwords' backs stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mukti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mukti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# define the preprocessing function

# Words removed from the English stop-word list so that they survive filtering;
# they are retained because they matter for sentiment analysis.
SENTIMENT_STOPWORD_EXCEPTIONS = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Lazily filled cache for the default stop-word set and lemmatizer, so the
# corpus is read and the lemmatizer created once instead of once per comment.
_PREPROCESS_RESOURCES = {}


def preprocess_comment(comment, stop_words=None, lemmatizer=None):
    """Normalize a single comment for bag-of-words modelling.

    Steps, in order: lower-case, strip leading/trailing whitespace, replace
    newlines with spaces, drop every character that is not a letter, digit,
    whitespace or one of ``! ? . ,``, remove stop words, and lemmatize the
    remaining whitespace-separated tokens.

    Parameters
    ----------
    comment : str
        Raw comment text.
    stop_words : collection of str, optional
        Tokens to remove. Defaults to NLTK's English stop words minus
        ``SENTIMENT_STOPWORD_EXCEPTIONS`` (built once and cached).
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer`` instance (created once
        and cached).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        if 'stop_words' not in _PREPROCESS_RESOURCES:
            _PREPROCESS_RESOURCES['stop_words'] = (
                set(stopwords.words('english')) - SENTIMENT_STOPWORD_EXCEPTIONS
            )
        stop_words = _PREPROCESS_RESOURCES['stop_words']

    if lemmatizer is None:
        if 'lemmatizer' not in _PREPROCESS_RESOURCES:
            _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']

    # convert to lower case and remove the trailing and leading space
    comment = comment.lower().strip()

    # remove the newline characters
    comment = re.sub('\n', ' ', comment)

    # remove the non-alphanumeric characters, except the punctuation ! ? . ,
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    # remove the stop words (the text is already lower-cased at this point)
    tokens = [word for word in comment.split() if word not in stop_words]

    # lemmatize the words
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)
    
     

In [10]:
# Run every comment through preprocess_comment and store the cleaned text
# back in the clean_comment column.
df['clean_comment'] = df['clean_comment'].map(preprocess_comment)

In [11]:
# Preview the first rows to check the result of the preprocessing step
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [None]:
#df.to_csv('reddit_preprocessing.csv',index=False)

In [None]:
## apply BOW
# Bag-of-words vectorizer capped at max_features=5000 vocabulary terms
vectorizer = CountVectorizer(max_features=5000)

In [None]:
# Fit the vectorizer on the cleaned comments and build the feature matrix.
# NOTE(review): .toarray() turns the sparse count matrix into a dense array;
# consider keeping it sparse if memory becomes a problem.
X = vectorizer.fit_transform(df['clean_comment']).toarray()
y = df['category'] # target is the 'category' column; the data preview shows values -1, 0 and 1 (presumably negative/neutral/positive - confirm)

In [None]:
# Display the dense bag-of-words feature matrix
X

In [None]:
# Report the shapes of the feature matrix and the target, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# Point MLflow at the remote tracking server that will record the runs.
# NOTE(review): the server address is hard-coded; consider reading it from
# configuration so the notebook keeps working if the host changes.
mlflow.set_tracking_uri("http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/")

In [None]:
# Record the runs that follow under the "RF Baseline" experiment
mlflow.set_experiment("RF Baseline")

In [None]:
# Design the baseline model
# Step 1: split the data into training and testing sets (80% train, 20% test).
# stratify=y keeps the class proportions the same in both splits (imbalanced dataset).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

# Step 2: define and train a Random Forest baseline model using a simple train-test split,
# logging its parameters, metrics and artifacts to the active MLflow experiment.
with mlflow.start_run() as run:
    # log the description of the run
    mlflow.set_tag("mlflow.runName", "Randomforest_baseline_TrainTestSplit")
    mlflow.set_tag("experiment_type", "Baseline")
    mlflow.set_tag("model_type","RandomForestClassifier")

    # add a description
    mlflow.set_tag("description", "Baseline RandomForest model for sentiment analysis using Bag of Words (BoW) with a simple train-test split")

    # log parameters for the vectorizer
    mlflow.log_param("vectorizer_type", "CountVectorizer")
    # NOTE(review): the key below is misspelled ("vactorizer"); it is kept unchanged
    # so it stays comparable with any runs already logged under this name.
    mlflow.log_param("vactorizer_max_features",vectorizer.max_features)

    # log the Random Forest hyper-parameters
    n_estimators = 150
    max_depth = 15

    mlflow.log_param("n_estimators",n_estimators)
    mlflow.log_param("max_depth",max_depth)

    # initialize and train the model
    model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=42)
    model.fit(X_train,y_train)

    # make the predictions on the held-out test set
    y_pred = model.predict(X_test)

    # log the overall accuracy
    accuracy = accuracy_score(y_test,y_pred)
    mlflow.log_metric("accuracy",accuracy)

    # log every per-class / averaged metric from the classification report
    classification_rep = classification_report(y_test,y_pred,output_dict=True)

    for label, label_metrics in classification_rep.items():
        # the report's "accuracy" entry is a plain float, not a dict, so it is skipped here
        if isinstance(label_metrics, dict):
            for metric_name, value in label_metrics.items():   # precision, recall, f1-score, support
                mlflow.log_metric(f"{label}_{metric_name}", value)

    # confusion matrix plot; rows/columns follow model.classes_ so the tick
    # labels show the real class values instead of positional indices
    class_labels = model.classes_
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8,6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")

    # save and log the confusion matrix plot (create the output folder first,
    # otherwise savefig fails when the directory does not exist yet)
    confusion_matrix_path = 'confusion_matrix/confusion_matrix.png'
    os.makedirs(os.path.dirname(confusion_matrix_path), exist_ok=True)
    fig.savefig(confusion_matrix_path)
    mlflow.log_artifact(confusion_matrix_path)

    # log the model
    mlflow.sklearn.log_model(model,"random_forest_model")

    # Optionally log the dataset itself (if it's small enough)
    df.to_csv("dataset.csv", index=False)
    mlflow.log_artifact("dataset.csv")

# Display final accuracy
print(f"Accuracy: {accuracy}")

