In [None]:
! pip install nltk textacy
! pip install scikit-learn
! pip install mlflow
! pip install dagshub

Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Collecting cytoolz>=0.10.1 (from textacy)
  Downloading cytoolz-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting floret~=0.10.0 (from textacy)
  Downloading floret-0.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting pyphen>=0.10.0 (from textacy)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textacy-0.13.0-py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cytoolz-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading floret-0.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from textacy import preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import mlflow
import dagshub

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Default location of the raw Reddit comment data set.
REDDIT_DATA_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'


def _apply_cleaning_step(series, transform, label):
  """Apply `transform` to a text Series and print how many rows it changed.

  Rows are compared by content, not by length: a step that swaps one
  character for another (e.g. punctuation -> space) keeps the length
  identical and would otherwise be reported as "0 rows".
  """
  cleaned = transform(series)
  print(f"Number of rows have {label}:", int((cleaned != series).sum()))
  return cleaned


def pre_processing(data_url=REDDIT_DATA_URL):
  """Load the comment data set and return a cleaned copy of it.

  Steps, in order: drop duplicate rows, drop rows with nulls, lower-case,
  collapse whitespace, strip HTML tags, strip punctuation, strip emojis,
  drop rows that still contain characters matched by the symbol regex, and
  drop very short comments unless they are one of a few kept words.
  Progress and per-step counts are printed along the way.

  Parameters
  ----------
  data_url : str, optional
      Path or URL of a CSV with `clean_comment` and `category` columns.
      Defaults to the Reddit data set used by this notebook.

  Returns
  -------
  pandas.DataFrame
      An independent frame with the columns `clean_comment` (cleaned text),
      `category`, and `length_clean_comment` (length of the lower-cased
      comment *before* the whitespace/HTML/punctuation/emoji cleaning).
  """
  df = pd.read_csv(data_url)
  print("Shape of the data frame", df.shape)
  print("Duplicates", df.duplicated().sum())
  print("Null Values:" ,df.isnull().sum())

  print("Dropping the duplicate records.....")
  df.drop_duplicates(inplace=True)

  print("Dropping the null values")
  df.dropna(inplace=True)

  print("Changing data to lower case")
  df['clean_comment'] = df['clean_comment'].str.lower()

  # Captured before any further cleaning; returned to the caller and used by
  # the short-comment filter at the end.
  df['length_clean_comment'] = df['clean_comment'].apply(lambda x: len(str(x)))

  print("Strip off the white spaces..")
  df['clean_comment'] = _apply_cleaning_step(
      df['clean_comment'],
      lambda s: s.str.replace(r'\s+', ' ', regex=True).str.strip(),
      "white spaces")

  print("Removing Html tags....")
  df['clean_comment'] = _apply_cleaning_step(
      df['clean_comment'],
      lambda s: s.apply(preprocessing.remove.html_tags),
      "Html tags")

  # Brackets are punctuation, so the punctuation pass below removes them too.
  # Count the affected rows first so the bracket report stays meaningful
  # (a second punctuation pass would be a no-op and always report 0).
  rows_with_brackets = int(df['clean_comment'].str.contains(r'[()\[\]{}]', regex=True).sum())

  print("Removing Punctuation....")
  df['clean_comment'] = _apply_cleaning_step(
      df['clean_comment'],
      lambda s: s.apply(preprocessing.remove.punctuation),
      "punctuation")

  print("Removing brackets....")
  print("Number of rows have brackets:" , rows_with_brackets)

  # Strip emojis by replacing each one with the empty string.
  df['clean_comment'] = _apply_cleaning_step(
      df['clean_comment'],
      lambda s: s.apply(lambda x: preprocessing.replace.emojis(x, "")),
      "emojis")

  # Regular expression to match emojis
  # NOTE(review): the final range (U+24C2-U+1F251) is far wider than "enclosed
  # characters" -- it also spans e.g. the CJK ideograph blocks -- so rows
  # containing those characters are dropped as well. Left unchanged so the
  # resulting data set stays identical; narrow it if that is not intended.
  emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # Emoticons
                            "\U0001F300-\U0001F5FF"  # Symbols and Pictographs
                            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                            "\U0001F700-\U0001F77F"  # Alchemical Symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\U00002702-\U000027B0"  # Dingbats
                            "\U000024C2-\U0001F251"  # Enclosed characters
                            "]", flags=re.UNICODE)

  # Drop rows whose text still matches the pattern; scan the column only once.
  has_symbols = df['clean_comment'].apply(lambda x: bool(emoji_pattern.search(x)))
  print("Number of non meaning  rows:", df[has_symbols].shape)
  df = df[~has_symbols]

  # List of words to keep even if their length is less than 4
  keep_words = ['lol', 'wow', 'wtf', 'fun', 'sad', 'old']

  # Filter out very short comments unless the cleaned text is one of keep_words.
  # NOTE(review): the length test uses `length_clean_comment`, i.e. the length
  # measured *before* cleaning, not the length of the cleaned text. Kept as is
  # to preserve the data set; confirm which one is intended.
  df = df[(df['length_clean_comment'] >= 4) | df['clean_comment'].isin(keep_words)]

  # Return an independent copy so callers can assign columns without
  # triggering pandas' SettingWithCopyWarning.
  return df[['clean_comment', 'category', 'length_clean_comment']].copy()


# Lazily-built NLTK resources shared by every preprocess_comment call.
# Building them per call means re-reading the stop-word corpus for each row.
_COMMENT_NLP_CACHE = {}


def _comment_nlp_resources():
    """Return `(stop_words, lemmatizer)`, creating both on first use."""
    if not _COMMENT_NLP_CACHE:
        # Negation / contrast words are kept because they carry sentiment.
        _COMMENT_NLP_CACHE['stop_words'] = frozenset(
            set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}
        )
        _COMMENT_NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _COMMENT_NLP_CACHE['stop_words'], _COMMENT_NLP_CACHE['lemmatizer']


# Define the preprocessing function
def preprocess_comment(comment):
    """Normalise a single comment string for vectorisation.

    Lower-cases and strips the text, turns newlines into spaces, removes every
    character that is not an ASCII letter, digit, whitespace or one of
    ``! ? . ,``, drops English stop words (except 'not', 'but', 'however',
    'no', 'yet') and lemmatizes the remaining whitespace-separated tokens.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Convert to lowercase
    comment = comment.lower()

    # Remove trailing and leading whitespaces
    comment = comment.strip()

    # Remove newline characters
    comment = re.sub(r'\n', ' ', comment)

    # Keep only letters, digits, whitespace and the punctuation marks ! ? . ,
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    stop_words, lemmatizer = _comment_nlp_resources()

    # Drop stop words and lemmatize in one pass over the tokens; equivalent to
    # filtering, re-joining, re-splitting and then lemmatizing.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

In [None]:
# Load and clean the raw data, then normalise every comment
# (stop-word removal + lemmatization) for the vectorizer.
df = pre_processing().assign(
    clean_comment=lambda frame: frame['clean_comment'].apply(preprocess_comment)
)

Shape of the data frame (37249, 2)
Duplicates 449
Null Values: clean_comment    100
category           0
dtype: int64
Dropping the duplicate records.....
Dropping the null values
Changing data to lower case
Strip off the white spaces..
Number of rows have white spaces: 32407
Removing Html tags....
Number of rows have Html tags: 0
Removing Punctuation....
Number of rows have punctuation: 0
Removing brackets....
Number of rows have brackets: 0
Number of rows have emojis: 55
Number of non meaning  rows: (148, 8)


In [None]:
# `df` comes from the preprocessing cell above: text is the feature, category the label.
X, y = df['clean_comment'], df['category']

# Hold out 30% of the rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
dagshub.init(repo_owner='MitVinay', repo_name='youtube_chrome', mlflow=True)
mlflow.set_experiment("Exp2-max-features")

# Sweep the TF-IDF vocabulary size: one parent run groups one nested child run
# per (n-gram range, max_features) combination.
with mlflow.start_run(run_name="TFIDF max_features sweep") as parent_run:
    # Define the n-grams range and max_features values
    n_grams = [(1, 1)]
    max_features_list = [5000, 6000, 7000, 8000]

    # Automate the process for TfidfVectorizer
    for max_features in max_features_list:
        for ngram in n_grams:
            with mlflow.start_run(nested=True, run_name=f"TFIDF {ngram} max_features={max_features}") as child_run:
                print(f"Testing with n-gram range: {ngram} and max_features={max_features} using TfidfVectorizer")

                # Log the configuration first (in a single request) so the run
                # is identifiable even if fitting fails part-way through.
                mlflow.log_params({
                    "max_features": max_features,
                    "ngram_range": ngram,
                    "vectorizer": "TfidfVectorizer",
                })

                # Fit the vocabulary on the training split only, then reuse it
                # for the test split.
                vectorizer = TfidfVectorizer(ngram_range=ngram, max_features=max_features)
                X_train_vect = vectorizer.fit_transform(X_train)
                X_test_vect = vectorizer.transform(X_test)

                # Train the RandomForestClassifier
                rf = RandomForestClassifier(random_state=42)
                rf.fit(X_train_vect, y_train)

                # Predict and evaluate
                y_pred = rf.predict(X_test_vect)
                metrics = classification_report(y_test, y_pred, output_dict=True)

                # Flatten the nested report into "<label>_<metric>" keys.
                # 'accuracy' is the one entry that is a bare number, not a dict.
                flat_metrics = {}
                for label, scores in metrics.items():
                    if label != 'accuracy':
                        for metric, value in scores.items():
                            flat_metrics[f"{label}_{metric}"] = float(value)
                    else:
                        flat_metrics["accuracy"] = float(scores)

                # One batched call instead of one request per metric to the
                # remote tracking server.
                mlflow.log_metrics(flat_metrics)

                mlflow.sklearn.log_model(rf, "model")


Testing with n-gram range: (1, 1) and max_features=5000 using TfidfVectorizer




🏃 View run TFIDF (1, 1) max_features=5000 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3/runs/6fefc943d22543e09b5e4c9354555b47
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3
Testing with n-gram range: (1, 1) and max_features=6000 using TfidfVectorizer




🏃 View run TFIDF (1, 1) max_features=6000 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3/runs/e427918b41174ad8b3e2f7ae1d177d1d
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3
Testing with n-gram range: (1, 1) and max_features=7000 using TfidfVectorizer




🏃 View run TFIDF (1, 1) max_features=7000 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3/runs/625a767ed32a4c008d31916f20cff396
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3
Testing with n-gram range: (1, 1) and max_features=8000 using TfidfVectorizer




🏃 View run TFIDF (1, 1) max_features=8000 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3/runs/06dafa2f31924b1dbbde8d00587816e8
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3
🏃 View run invincible-perch-130 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3/runs/7e6ce1a74c8242cf99d283fadf2d682e
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/3
