In [25]:
# One-time environment setup: uncomment and run these lines if the packages are missing.
# %pip install nltk textacy
# %pip install scikit-learn
# %pip install mlflow
# %pip install dagshub

In [31]:
import pandas as pd
import numpy as np
from textacy import preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import mlflow
import dagshub

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def pre_processing(data_url='https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'):
  """Load the Reddit comment dataset and return a cleaned frame.

  The function reads the CSV at ``data_url``, drops duplicate and null rows,
  lower-cases ``clean_comment`` and then runs a sequence of text clean-up
  steps (whitespace collapsing, HTML-tag, punctuation, bracket and emoji
  removal). Rows that still match ``emoji_pattern`` afterwards are dropped,
  as are short rows that are not one of a few whitelisted words. Progress
  and per-step statistics are printed along the way.

  Args:
    data_url: Location passed to ``pd.read_csv``. Defaults to the hosted
      Reddit dataset used by this notebook. The file is expected to provide
      ``clean_comment`` and ``category`` columns.

  Returns:
    A DataFrame with the columns ``clean_comment``, ``category`` and
    ``length_clean_comment`` (length of the lower-cased comment measured
    before the clean-up steps).
  """
  df = pd.read_csv(data_url)
  print("Shape of the data frame", df.shape)
  print("Duplicates", df.duplicated().sum())
  print("Null Values:" ,df.isnull().sum())

  print("Dropping the duplicate records.....")
  df.drop_duplicates(inplace=True)

  print("Dropping the null values")
  df.dropna(inplace=True)

  print("Changing data to lower case")
  # The text is kept in a standalone Series while it is being cleaned so each
  # step can be compared against the previous state; it is written back to
  # the frame once all steps have run.
  text = df['clean_comment'].str.lower()

  # Length of the lower-cased comment, taken before any further clean-up.
  df['length_clean_comment'] = text.apply(lambda x: len(str(x)))

  # (progress message or None, statistics label, transformation) per step.
  cleaning_steps = [
      ("Strip off the white spaces..",
       "Number of rows have white spaces:",
       lambda s: s.str.replace(r'\s+', ' ', regex=True).str.strip()),
      ("Removing Html tags....",
       "Number of rows have Html tags:",
       lambda s: s.apply(preprocessing.remove.html_tags)),
      ("Removing Punctuation....",
       "Number of rows have punctuation:",
       lambda s: s.apply(preprocessing.remove.punctuation)),
      # This step previously repeated the punctuation call.
      # NOTE(review): brackets are presumably already gone after the
      # punctuation step, so this likely changes nothing; move it before
      # punctuation removal if bracketed text itself should be dropped.
      ("Removing brackets....",
       "Number of rows have brackets:",
       lambda s: s.apply(preprocessing.remove.brackets)),
      (None,
       "Number of rows have emojis:",
       lambda s: s.apply(lambda x: preprocessing.replace.emojis(x, ""))),
  ]

  for announcement, report_label, transform in cleaning_steps:
    if announcement is not None:
      print(announcement)
    cleaned = transform(text)
    # Compare the text itself rather than its length: a length comparison
    # misses substitutions that leave the string length unchanged.
    print(report_label, int((text != cleaned).sum()))
    text = cleaned

  df['clean_comment'] = text

  # Regular expression to match emojis
  emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # Emoticons
                            "\U0001F300-\U0001F5FF"  # Symbols and Pictographs
                            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                            "\U0001F700-\U0001F77F"  # Alchemical Symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\U00002702-\U000027B0"  # Dingbats
                            "\U000024C2-\U0001F251"  # Enclosed characters
                            "]", flags=re.UNICODE)

  # Drop rows whose comment still contains a character matched by the
  # pattern; the mask is computed once and reused for the report and filter.
  has_symbol = text.apply(lambda x: bool(emoji_pattern.search(x))).astype(bool)
  print("Number of non meaning  rows:", df[has_symbol].shape)
  df = df[~has_symbol]

  # List of words to keep even if their length is less than 4
  keep_words = ['lol', 'wow', 'wtf', 'fun', 'sad', 'old']

  # Keep rows of at least 4 characters, or rows that are exactly a keep word.
  # NOTE(review): length_clean_comment was measured before whitespace and
  # punctuation clean-up, so it can exceed the length of the final text.
  df = df[(df['length_clean_comment'] >= 4) | df['clean_comment'].isin(keep_words)]

  return df[['clean_comment', 'category', 'length_clean_comment']]


# Stopwords that matter for sentiment and must survive stopword removal.
_SENTIMENT_KEEP_WORDS = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Lazily filled cache so the stopword list and lemmatizer are built only once
# instead of once per comment.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the shared (stop_words, lemmatizer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = (
            frozenset(stopwords.words('english')) - _SENTIMENT_KEEP_WORDS
        )
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


# Define the preprocessing function
def preprocess_comment(comment):
    """Normalise a single comment string for vectorisation.

    The comment is lower-cased, stripped, has newlines replaced by spaces and
    every character other than ASCII letters, digits, whitespace and ``!?.,``
    removed. English stopwords (except a few sentiment-bearing ones such as
    ``not`` and ``but``) are dropped and the remaining tokens are lemmatized.

    Args:
        comment: The text to clean; must be a ``str``.

    Returns:
        The cleaned comment with tokens joined by single spaces.
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # Convert to lowercase and remove trailing and leading whitespaces
    comment = comment.lower().strip()

    # Remove newline characters
    comment = re.sub(r'\n', ' ', comment)

    # Remove non-alphanumeric characters, except punctuation
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    # Drop stopwords and lemmatize the survivors in a single pass; this gives
    # the same tokens as filtering, re-joining and splitting again.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

In [9]:
# Build the cleaned dataset, then normalise every comment (stopword removal
# and lemmatization) so it is ready for vectorisation.
df = pre_processing()
df['clean_comment'] = df['clean_comment'].map(preprocess_comment)

Shape of the data frame (37249, 2)
Duplicates 449
Null Values: clean_comment    100
category           0
dtype: int64
Dropping the duplicate records.....
Dropping the null values
Changing data to lower case
Strip off the white spaces..
Number of rows have white spaces: 32407
Removing Html tags....
Number of rows have Html tags: 0
Removing Punctuation....
Number of rows have punctuation: 0
Removing brackets....
Number of rows have brackets: 0
Number of rows have emojis: 55
Number of non meaning  rows: (148, 8)


In [22]:
# Requires `df` from the cleaning cell: comments are the features, the
# sentiment category is the target. 30% of the rows are held out for testing.
X, y = df['clean_comment'], df['category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [32]:
# Point MLflow at the DagsHub-hosted tracking server and select the experiment.
dagshub.init(repo_owner='MitVinay', repo_name='youtube_chrome', mlflow=True)
mlflow.set_experiment("Exp1-Feature_eng")

# One parent run groups a nested child run per (n-gram range, vectorizer) pair.
with mlflow.start_run() as parent_run:
    # N-gram ranges to compare
    n_grams = [(1, 1), (2, 2), (3, 3)]

    # Vectorizer classes to compare, keyed by the name that gets logged
    vectorizers = {
        "CountVectorizer": CountVectorizer,
        "TfidfVectorizer": TfidfVectorizer
    }

    # Vocabulary cap shared by the vectorizer and the logged parameter
    max_features = 1000

    # Every combination, n-gram range varying slowest
    trials = [
        (ngram, vect_name, vect_class)
        for ngram in n_grams
        for vect_name, vect_class in vectorizers.items()
    ]

    for ngram, vect_name, vect_class in trials:
        with mlflow.start_run(nested=True, run_name=f"{ngram} using {vect_name}") as child_run:
            print(f"Testing with n-gram range: {ngram} using {vect_name}")

            # Fit the vocabulary on the training split only, then encode both splits
            vectorizer = vect_class(ngram_range=ngram, max_features=max_features)
            X_train_vect = vectorizer.fit_transform(X_train)
            X_test_vect = vectorizer.transform(X_test)

            # Train the classifier and score it on the held-out split
            rf = RandomForestClassifier(random_state=42)
            rf.fit(X_train_vect, y_train)
            y_pred = rf.predict(X_test_vect)
            metrics = classification_report(y_test, y_pred, output_dict=True)

            for label, scores in metrics.items():
                if label == 'accuracy':
                    # 'accuracy' is a single number rather than a dict of scores
                    mlflow.log_metric("accuracy", scores)
                    continue
                for metric, value in scores.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

            # Record the configuration and the fitted model alongside the metrics
            mlflow.log_param("max_features", max_features)
            mlflow.log_param("ngram_range", ngram)
            mlflow.log_param("vectorizer", vect_name)
            mlflow.sklearn.log_model(rf, "model")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=ae87f971-2218-403d-9a78-a34d380265b6&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=4519d70ace4d7f4f33f287185eb46cfe5c3cf87e644437ca464ab8f9cac5166f




2024/12/11 21:32:38 INFO mlflow.tracking.fluent: Experiment with name 'Exp1-Feature_eng' does not exist. Creating a new experiment.


Testing with n-gram range: (1, 1) using CountVectorizer




🏃 View run (1, 1) using CountVectorizer at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/ea687ad3f82e4a24ad19bdf5b9d7d909
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (1, 1) using TfidfVectorizer




🏃 View run (1, 1) using TfidfVectorizer at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/b5c1eea96af4419aade230b34de93e89
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (2, 2) using CountVectorizer




🏃 View run (2, 2) using CountVectorizer at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/df7e7b34815c450aae300ca068557414
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (2, 2) using TfidfVectorizer




🏃 View run (2, 2) using TfidfVectorizer at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/15ff7a5e5e42455291943ba4bd0b1b73
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (3, 3) using CountVectorizer




🏃 View run (3, 3) using CountVectorizer at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/b97079f5cb2c463eb99f54ae6785ce2f
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
Testing with n-gram range: (3, 3) using TfidfVectorizer




🏃 View run (3, 3) using TfidfVectorizer at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/a1c599d3ee774b4b92c1daa9c638b1f5
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
🏃 View run hilarious-bass-638 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2/runs/da7ec10d59914093a75e005ccd1d662a
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/2
