In [None]:
! pip install nltk textacy
! pip install scikit-learn
! pip install mlflow
! pip install dagshub



In [None]:
import pandas as pd
import numpy as np
from textacy import preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import mlflow
import dagshub
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def _track_length(df, new_col, prev_col, label):
  """Store the current comment length in ``new_col`` and report changed rows.

  A row is counted when its length in ``new_col`` differs from the length
  previously stored in ``prev_col``; the count is printed with ``label``.
  """
  df[new_col] = df['clean_comment'].apply(lambda x: len(str(x)))
  print(f"Number of rows have {label}:" , int((df[prev_col] != df[new_col]).sum()))


def pre_processing(df):
  """Clean the comment text and drop rows without usable content.

  Steps, in order: report shape/duplicates/nulls, drop duplicate rows, drop
  rows with nulls, lowercase, collapse whitespace, strip HTML tags, strip
  punctuation, strip emojis, drop rows still matching the emoji/symbol
  pattern, and drop comments shorter than 4 characters unless they are one of
  a few short sentiment words.

  Args:
    df: DataFrame with at least the columns ``clean_comment`` and
      ``category``. The frame is NOT modified; all work happens on a copy.

  Returns:
    A new DataFrame with the columns ``clean_comment`` (cleaned text),
    ``category`` and ``length_clean_comment`` (length of the lowercased text
    before any further cleaning).
  """
  print("Shape of the data frame", df.shape)
  print("Duplicates", df.duplicated().sum())
  print("Null Values:" ,df.isnull().sum())

  print("Dropping the duplicate records.....")
  df = df.drop_duplicates()

  print("Dropping the null values")
  # Explicit copy: every column assignment below stays local to this function
  # instead of leaking into (or warning about) the caller's frame.
  df = df.dropna().copy()

  print("Changing data to lower case")
  df['clean_comment'] = df['clean_comment'].str.lower()

  df['length_clean_comment'] = df['clean_comment'].apply(lambda x: len(str(x)))

  print("Strip off the white spaces..")
  df['clean_comment'] = df['clean_comment'].str.replace(r'\s+', ' ', regex=True).str.strip()
  _track_length(df, 'length_clean_comment_nowhite_space', 'length_clean_comment', 'white spaces')

  print("Removing Html tags....")
  df['clean_comment'] = df['clean_comment'].apply(preprocessing.remove.html_tags)
  _track_length(df, 'length_nowhite_space_htmltag', 'length_clean_comment_nowhite_space', 'Html tags')

  print("Removing Punctuation....")
  # NOTE(review): the reported count compares string lengths; if the textacy
  # helper substitutes whitespace for punctuation (rather than deleting it) the
  # length is unchanged and the count under-reports - confirm against textacy.
  df['clean_comment'] = df['clean_comment'].apply(preprocessing.remove.punctuation)
  _track_length(df, 'length_htmltag_punctuation', 'length_nowhite_space_htmltag', 'punctuation')

  print("Removing brackets....")
  # NOTE(review): this step is labelled "brackets" but applies punctuation
  # removal a second time. Behaviour is kept as-is; decide whether a dedicated
  # bracket-removal step (run before punctuation removal) was intended.
  df['clean_comment'] = df['clean_comment'].apply(preprocessing.remove.punctuation)
  _track_length(df, 'length_punctuation_brackets', 'length_htmltag_punctuation', 'brackets')

  # Replace emojis recognised by textacy with the empty string.
  df['clean_comment'] = df['clean_comment'].apply(lambda x: preprocessing.replace.emojis(x, ""))
  _track_length(df, 'length_brackets_emojis', 'length_punctuation_brackets', 'emojis')


  # Regular expression to match emojis
  # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
  # covers many non-emoji code points; rows containing any of them are dropped.
  emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # Emoticons
                            "\U0001F300-\U0001F5FF"  # Symbols and Pictographs
                            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                            "\U0001F700-\U0001F77F"  # Alchemical Symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\U00002702-\U000027B0"  # Dingbats
                            "\U000024C2-\U0001F251"  # Enclosed characters
                            "]", flags=re.UNICODE)

  # Drop rows whose comment still contains a character from the pattern.
  # The mask is computed once and reused for both the report and the filter.
  has_leftover_symbols = df['clean_comment'].apply(
      lambda x: bool(emoji_pattern.search(x))).astype(bool)
  print("Number of non meaning  rows:", df[has_leftover_symbols].shape)
  df = df[~has_leftover_symbols]


  # List of words to keep even if their length is less than 4
  keep_words = ['lol', 'wow', 'wtf', 'fun', 'sad', 'old']

  # Judge "too short" on the text as it looks AFTER cleaning (surrounding
  # whitespace ignored). Using the pre-cleaning length let comments that were
  # reduced to nothing (e.g. only punctuation) slip through as empty rows.
  final_text = df['clean_comment'].str.strip()
  long_enough = final_text.str.len() >= 4
  df = df[long_enough | final_text.isin(keep_words)]

  # Return an independent frame so callers can assign columns safely.
  return df[['clean_comment', 'category', 'length_clean_comment']].copy()


# Define the preprocessing function
def preprocess_comment1(comment):
    """Normalise one comment string.

    The text is lowercased, trimmed of leading/trailing whitespace, has its
    newlines turned into spaces, and finally loses every character that is
    not an ASCII letter, a digit, whitespace or one of ``! ? . ,``.

    Args:
        comment: the raw comment text (a ``str``).

    Returns:
        The normalised comment text.
    """
    trimmed = comment.lower().strip()
    single_line = re.sub(r'\n', ' ', trimmed)
    # Whitelist filter: anything outside the allowed character set is deleted.
    return re.sub(r'[^A-Za-z0-9\s!?.,]', '', single_line)

# Stop words that carry sentiment-relevant meaning and therefore stay in the text.
_SENTIMENT_WORDS_TO_KEEP = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Filled on first use with the stop-word set and the lemmatizer, so that the
# stop-word list is read once instead of once per comment.
_COMMENT2_RESOURCES = {}


def _comment2_resources():
    """Return ``(stop_words, lemmatizer)``, building both on the first call."""
    if not _COMMENT2_RESOURCES:
        stop_words = set(stopwords.words('english')) - _SENTIMENT_WORDS_TO_KEEP
        lemmatizer = WordNetLemmatizer()
        # Only publish once both objects exist, so a failed first attempt
        # (e.g. missing NLTK data) is retried on the next call.
        _COMMENT2_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _COMMENT2_RESOURCES['stop_words'], _COMMENT2_RESOURCES['lemmatizer']


def preprocess_comment2(comment):
    """Remove stop words from a comment and lemmatize the remaining words.

    English stop words are dropped, except ``not``, ``but``, ``however``,
    ``no`` and ``yet``, which are retained for sentiment analysis. The
    comment is split on whitespace and the surviving words are re-joined with
    single spaces.

    Args:
        comment: whitespace-separated comment text (a ``str``).

    Returns:
        The filtered, lemmatized comment text.
    """
    stop_words, lemmatizer = _comment2_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

def preprocess_unigram(df):
    """Prepare a comment frame for unigram features.

    Runs ``pre_processing`` on ``df`` and then applies ``preprocess_comment1``
    followed by ``preprocess_comment2`` to every ``clean_comment`` value.

    Args:
        df: frame handed to ``pre_processing``.

    Returns:
        The frame returned by ``pre_processing`` with its ``clean_comment``
        column rewritten by both comment-level steps.
    """
    cleaned = pre_processing(df)
    for comment_step in (preprocess_comment1, preprocess_comment2):
        cleaned['clean_comment'] = cleaned['clean_comment'].apply(comment_step)
    return cleaned

def preprocess_multigram(df):
    """Prepare a comment frame for multi-word n-gram features.

    Runs ``pre_processing`` on ``df`` and then applies ``preprocess_comment1``
    to every ``clean_comment`` value. ``preprocess_comment2`` is not applied
    here.

    Args:
        df: frame handed to ``pre_processing``.

    Returns:
        The frame returned by ``pre_processing`` with its ``clean_comment``
        column rewritten by ``preprocess_comment1``.
    """
    cleaned = pre_processing(df)
    normalised_text = cleaned['clean_comment'].apply(preprocess_comment1)
    cleaned['clean_comment'] = normalised_text
    return cleaned

In [None]:

# Load the labelled comments and remove exact duplicate rows and rows with
# missing values BEFORE splitting. Deduplicating only inside each partition
# (as the cleaning functions do) cannot stop the same labelled comment from
# ending up in both the training and the test set, which would leak test
# examples into training and inflate the reported scores.
df = (
    pd.read_csv('https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv')
    .drop_duplicates()
    .dropna()
)

# Stratified 70/30 split on the raw text; cleaning is applied per partition below.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.3,
    random_state=42,
    stratify=df['category'],
)

# --- Unigram pipeline ---------------------------------------------------------
# Comments and labels are joined side by side so that rows dropped during
# cleaning take their label with them.
result = pd.concat([X_train, y_train], axis=1)
result1 = preprocess_unigram(result)
X_train_uni, y_train_uni = result1['clean_comment'], result1['category']

result_test = pd.concat([X_test, y_test], axis=1)
result_test1 = preprocess_unigram(result_test)
X_test_uni, y_test_uni = result_test1['clean_comment'], result_test1['category']

# --- Multigram pipeline -------------------------------------------------------
# The joined frames are rebuilt from the raw split so this pipeline never sees
# a frame that the unigram pipeline has already processed.
result = pd.concat([X_train, y_train], axis=1)
result1 = preprocess_multigram(result)
X_train_multi, y_train_multi = result1['clean_comment'], result1['category']

result_test = pd.concat([X_test, y_test], axis=1)
result_test1 = preprocess_multigram(result_test)
X_test_multi, y_test_multi = result_test1['clean_comment'], result_test1['category']

Shape of the data frame (26074, 2)
Duplicates 277
Null Values: clean_comment    75
category          0
dtype: int64
Dropping the duplicate records.....
Dropping the null values
Changing data to lower case
Strip off the white spaces..
Number of rows have white spaces: 22730
Removing Html tags....
Number of rows have Html tags: 0
Removing Punctuation....
Number of rows have punctuation: 0
Removing brackets....
Number of rows have brackets: 0
Number of rows have emojis: 39
Number of non meaning  rows: (109, 8)
Shape of the data frame (11175, 2)
Duplicates 88
Null Values: clean_comment    25
category          0
dtype: int64
Dropping the duplicate records.....
Dropping the null values
Changing data to lower case
Strip off the white spaces..
Number of rows have white spaces: 9755
Removing Html tags....
Number of rows have Html tags: 0
Removing Punctuation....
Number of rows have punctuation: 0
Removing brackets....
Number of rows have brackets: 0
Number of rows have emojis: 16
Number of non 

In [None]:
# Preview the cleaned unigram training comments (every row except the first).
X_train_uni.iloc[1:]

Unnamed: 0,clean_comment
34723,
25401,hey maybe poor lobbyist super pac would get want
35461,actual fuck really say sarcasm mean lawyer not...
28317,chill fuck going attack karachi city bullshit ...
13636,keep hearing great thing india good guy
...,...
17682,justice department need shut former president ...
28314,enter pak airspace today saw video circulated ...
8918,going retire get married live man
22000,ouch bite south aiadmk controlled bjp anger ch...


In [None]:


dagshub.init(repo_owner='MitVinay', repo_name='youtube_chrome', mlflow=True)
mlflow.set_experiment("Exp1-Feature_eng")

# Vocabulary cap. Defined once so the value given to the vectorizer and the
# value logged to MLflow cannot drift apart.
MAX_FEATURES = 1000

# n-gram ranges to compare
n_grams = [(1, 1), (1, 2), (1, 3)]

# Vectorizer classes to compare, keyed by the name used in run names and params
vectorizers = {
    "CountVectorizer": CountVectorizer,
    "TfidfVectorizer": TfidfVectorizer
}

# One parent run groups the nested runs (one per n-gram range x vectorizer).
with mlflow.start_run() as parent_run:
    for ngram in n_grams:
        for vect_name, vect_class in vectorizers.items():
            run_name = f"{vect_name}_{ngram}_RandomForest"
            with mlflow.start_run(nested=True, run_name=run_name) as child_run:
                print(f"Testing with n-gram range: {ngram} using {vect_name}")

                mlflow.set_tags({
                    "experiment_type": "feature_engineering",
                    "model_type": "RandomForestClassifier",
                })

                # Initialize the vectorizer
                vectorizer = vect_class(ngram_range=ngram, max_features=MAX_FEATURES)

                # Pure unigram runs use the stop-word-filtered, lemmatized
                # text; every other range uses the multigram text.
                # NOTE: this rebinds the notebook-level X_train/X_test/
                # y_train/y_test names, exactly as the cell always has.
                if ngram == (1, 1):
                    X_test, y_test = X_test_uni, y_test_uni
                    X_train, y_train = X_train_uni, y_train_uni
                    print("Splitting done")
                else:
                    X_test, y_test = X_test_multi, y_test_multi
                    X_train, y_train = X_train_multi, y_train_multi

                # Fit the vocabulary on the training text only.
                X_train_vect = vectorizer.fit_transform(X_train)
                X_test_vect = vectorizer.transform(X_test)
                print("Vectorizer done")

                # Train the RandomForestClassifier
                rf = RandomForestClassifier(random_state=42)
                rf.fit(X_train_vect, y_train)
                print("Random forest done")

                # Predict and evaluate
                y_pred = rf.predict(X_test_vect)
                report = classification_report(y_test, y_pred, output_dict=True)

                # Flatten the nested report into "<label>_<metric>" entries
                # ("accuracy" is already a single value) and send them in one
                # batch rather than one tracking request per metric.
                flat_metrics = {}
                for label, scores in report.items():
                    if label == 'accuracy':
                        flat_metrics["accuracy"] = scores
                    else:
                        for metric, value in scores.items():
                            flat_metrics[f"{label}_{metric}"] = value
                mlflow.log_metrics(flat_metrics)

                mlflow.log_params({
                    "max_features": MAX_FEATURES,
                    "ngram_range": ngram,
                    "vectorizer": vect_name,
                })
                mlflow.sklearn.log_model(rf, f"random_forest_model_{vect_name}_{ngram}")

                # Confusion matrix with rows/columns in the classifier's class
                # order, and that order shown on the axes.
                class_labels = rf.classes_
                conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
                fig, ax = plt.subplots(figsize=(8, 6))
                sns.heatmap(
                    conf_matrix,
                    annot=True,
                    fmt="d",
                    cmap="Blues",
                    xticklabels=class_labels,
                    yticklabels=class_labels,
                    ax=ax,
                )
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                ax.set_title(f"Confusion Matrix: {vect_name}, {ngram}")
                fig.savefig("confusion_matrix.png")
                # Close right after saving so a failed upload cannot leave
                # open figures accumulating across the loop.
                plt.close(fig)
                mlflow.log_artifact("confusion_matrix.png")

Testing with n-gram range: (1, 1) using CountVectorizer
Splitting done
Vectorizer done
Random forest done




🏃 View run CountVectorizer_(1, 1)_RandomForest at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/cce5bb0060dc4d61b0d8a8c23b7c1aca
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (1, 1) using TfidfVectorizer
Splitting done
Vectorizer done
Random forest done




🏃 View run TfidfVectorizer_(1, 1)_RandomForest at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/942e1beda9a14f98aaeff40e41d96126
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (1, 2) using CountVectorizer
Vectorizer done
Random forest done




🏃 View run CountVectorizer_(1, 2)_RandomForest at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/f6444f267eaf48efb02d530760ebc50a
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (1, 2) using TfidfVectorizer
Vectorizer done
Random forest done




🏃 View run TfidfVectorizer_(1, 2)_RandomForest at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/8712272c79324b83a5b02e7536a7b9ca
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (1, 3) using CountVectorizer
Vectorizer done
Random forest done




🏃 View run CountVectorizer_(1, 3)_RandomForest at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/84c34d93c7aa4e0483d4e9b5ca17950f
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (1, 3) using TfidfVectorizer
Vectorizer done
Random forest done




🏃 View run TfidfVectorizer_(1, 3)_RandomForest at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/2503444fa158479792a621e6a4df113a
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
🏃 View run unequaled-swan-97 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/5dd56316eaea4cb5aa4922d78dbcb964
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2


In [None]:
# n-gram ranges whose lower and upper bounds are equal: (1, 1), (2, 2), (3, 3).
# NOTE(review): presumably used as `ngram_range` values for a vectorizer, as in
# the previous experiment cell - confirm against the rest of this cell.
n_grams = [(1, 1), (2, 2), (3, 3)]