In [6]:
# ! pip install nltk textacy
# ! pip install scikit-learn
# ! pip install mlflow
# ! pip install dagshub
# ! pip install imbalanced-learn



In [23]:
import pandas as pd
import numpy as np
from textacy import preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import mlflow
import dagshub
from collections import Counter
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def _apply_and_report(comments, transform, label):
  """Apply ``transform`` to every comment and print how many rows it changed.

  The before/after *text* is compared rather than the string length, so
  length-preserving edits (e.g. one character swapped for a space) are
  still counted.

  Args:
    comments: pandas Series of comment strings.
    transform: callable taking one string and returning the cleaned string.
    label: text inserted into the "Number of rows have ...:" message.

  Returns:
    A new Series holding the transformed comments.
  """
  updated = comments.apply(transform)
  print(f"Number of rows have {label}:", int((updated != comments).sum()))
  return updated


def pre_processing(data_url='https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'):
  """Load the Reddit comment CSV and clean the ``clean_comment`` column.

  Steps, in order: drop duplicate rows, drop rows with nulls, lower-case,
  collapse/strip whitespace, remove HTML tags, remove punctuation (applied
  twice, see the note below), remove emojis, drop rows that still match the
  symbol pattern, and drop very short comments unless they are one of a few
  whitelisted words. Progress and per-step counts are printed.

  Args:
    data_url: path or URL handed to ``pd.read_csv``. Defaults to the
      Reddit sentiment dataset used by this notebook.

  Returns:
    A new DataFrame (a copy, safe to assign into) with the columns
    ``clean_comment``, ``category`` and ``length_clean_comment``.
    ``length_clean_comment`` is the length measured right after
    lower-casing, i.e. before any further cleaning.
  """
  raw = pd.read_csv(data_url)
  print("Shape of the data frame", raw.shape)
  print("Duplicates", raw.duplicated().sum())
  print("Null Values:" ,raw.isnull().sum())

  print("Dropping the duplicate records.....")
  df = raw.drop_duplicates()

  print("Dropping the null values")
  # Work on an explicit copy so the column assignments below never write to a view.
  df = df.dropna().copy()

  print("Changing data to lower case")
  df['clean_comment'] = df['clean_comment'].str.lower()

  df['length_clean_comment'] = df['clean_comment'].str.len()

  print("Strip off the white spaces..")
  normalized = df['clean_comment'].str.replace(r'\s+', ' ', regex=True).str.strip()
  print("Number of rows have white spaces:" , int((normalized != df['clean_comment']).sum()))
  df['clean_comment'] = normalized

  print("Removing Html tags....")
  df['clean_comment'] = _apply_and_report(df['clean_comment'], preprocessing.remove.html_tags, "Html tags")

  print("Removing Punctuation....")
  df['clean_comment'] = _apply_and_report(df['clean_comment'], preprocessing.remove.punctuation, "punctuation")

  print("Removing brackets....")
  # NOTE(review): this step re-applies punctuation removal rather than a
  # bracket-specific cleaner, so it is expected to change nothing. If
  # bracketed text should be dropped, a bracket remover would have to run
  # BEFORE punctuation removal - confirm the intent before changing the data.
  df['clean_comment'] = _apply_and_report(df['clean_comment'], preprocessing.remove.punctuation, "brackets")

  # Replace every emoji with the empty string.
  df['clean_comment'] = _apply_and_report(
      df['clean_comment'], lambda text: preprocessing.replace.emojis(text, ""), "emojis")

  # Regular expression to match emojis
  # NOTE(review): the last range spans U+24C2..U+1F251, which is far wider
  # than "enclosed characters" (it also covers e.g. CJK ideographs) - confirm
  # that dropping every row containing a character in that span is intended.
  emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # Emoticons
                            "\U0001F300-\U0001F5FF"  # Symbols and Pictographs
                            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                            "\U0001F700-\U0001F77F"  # Alchemical Symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\U00002702-\U000027B0"  # Dingbats
                            "\U000024C2-\U0001F251"  # Enclosed characters
                            "]", flags=re.UNICODE)

  # Drop rows whose comment still contains a character matched by the pattern.
  # The mask is computed once and reused for both the report and the filter.
  has_symbols = df['clean_comment'].apply(
      lambda text: bool(emoji_pattern.search(text))).astype(bool)
  print("Number of non meaning  rows:", int(has_symbols.sum()))
  df = df[~has_symbols]

  # List of words to keep even if their length is less than 4
  keep_words = ['lol', 'wow', 'wtf', 'fun', 'sad', 'old']

  # Filter out short comments unless the whole comment is one of keep_words.
  # NOTE(review): the length used here was measured before whitespace/symbol
  # cleaning, so a comment can pass this check and still be shorter (or empty)
  # after cleaning - confirm whether the cleaned length should be used.
  is_long_enough = df['length_clean_comment'] >= 4
  is_whitelisted = df['clean_comment'].isin(keep_words)
  df = df[is_long_enough | is_whitelisted]

  return df[['clean_comment', 'category', 'length_clean_comment']].copy()


# Lazily-built NLTK resources shared by every preprocess_comment call.
_SENTIMENT_NLP_CACHE = {}


def _sentiment_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, building them on first use only.

    Building the stop-word set and the lemmatizer once avoids repeating that
    work for every row when preprocess_comment is used with ``Series.apply``.
    """
    if not _SENTIMENT_NLP_CACHE:
        # Keep negations/contrast words: they carry sentiment information.
        _SENTIMENT_NLP_CACHE['stop_words'] = (
            set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}
        )
        _SENTIMENT_NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _SENTIMENT_NLP_CACHE['stop_words'], _SENTIMENT_NLP_CACHE['lemmatizer']


# Define the preprocessing function
def preprocess_comment(comment):
    """Normalise one comment string for sentiment modelling.

    The comment is lower-cased, stripped, has newlines turned into spaces,
    loses every character that is not a letter, digit, whitespace or one of
    ``! ? . ,``, then has English stop words removed (except 'not', 'but',
    'however', 'no', 'yet') and the remaining tokens lemmatized.

    Args:
        comment: the raw comment text (a ``str``).

    Returns:
        The cleaned comment, tokens joined by single spaces.
    """
    stop_words, lemmatizer = _sentiment_nlp_resources()

    # Lower-case, trim the ends, and turn newlines into spaces.
    comment = comment.lower().strip().replace('\n', ' ')

    # Keep only letters, digits, whitespace and the punctuation ! ? . ,
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    # Stop-word filtering and lemmatization in a single pass over the
    # whitespace-separated tokens.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

In [12]:
# Load and clean the dataset, then normalise every comment.
df = pre_processing()
df['clean_comment'] = df['clean_comment'].apply(preprocess_comment)

# Features are the cleaned comments; the target is the sentiment category.
X, y = df['clean_comment'], df['category']

# 70/30 split with a fixed seed.
train_text, test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# TF-IDF over unigrams, capped at 5000 features; fitted on the training
# text only and then applied to the test text.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

Shape of the data frame (37249, 2)
Duplicates 449
Null Values: clean_comment    100
category           0
dtype: int64
Dropping the duplicate records.....
Dropping the null values
Changing data to lower case
Strip off the white spaces..
Number of rows have white spaces: 32407
Removing Html tags....
Number of rows have Html tags: 0
Removing Punctuation....
Number of rows have punctuation: 0
Removing brackets....
Number of rows have brackets: 0
Number of rows have emojis: 55
Number of non meaning  rows: (148, 8)


# Imbalanced Learning
1. Undersampling
2. Oversampling
3. SMOTE-ENN
4. ADASYN
5. Class Weight

In [15]:
# Per-class counts of the training labels before any resampling.
train_class_counts = Counter(y_train)
print(sorted(train_class_counts.items()))

[(-1, 5759), (0, 8858), (1, 11014)]


In [17]:
# Resample the training set with ClusterCentroids and show the resulting
# per-class counts.
cc = ClusterCentroids(random_state=0)
resampled = cc.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled
resampled_class_counts = Counter(y_resampled)
print(sorted(resampled_class_counts.items()))

[(-1, 5759), (0, 5759), (1, 5759)]


In [20]:
# Train a random forest (fixed seed) on the resampled training data.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)

# Score the untouched test split and print the report as a nested dict.
y_pred = rf.predict(X_test)
metrics = classification_report(
    y_test,
    y_pred,
    output_dict=True,
)
print(metrics)

{'-1': {'precision': 0.7205580631924497, 'recall': 0.7069243156199678, 'f1-score': 0.7136760820971347, 'support': 2484.0}, '0': {'precision': 0.7908825868009541, 'recall': 0.7904635761589404, 'f1-score': 0.7906730259671436, 'support': 3775.0}, '1': {'precision': 0.7690052356020942, 'recall': 0.7769784172661871, 'f1-score': 0.7729712661825071, 'support': 4726.0}, 'accuracy': 0.765771506599909, 'macro avg': {'precision': 0.7601486285318327, 'recall': 0.7581221030150318, 'f1-score': 0.7591067914155952, 'support': 10985.0}, 'weighted avg': {'precision': 0.7655682055165357, 'recall': 0.765771506599909, 'f1-score': 0.7656462689971578, 'support': 10985.0}}


In [25]:
# Undersampling strategies to compare, keyed by the name used in the MLflow
# run name. Each key appears once: a repeated key in a dict literal would
# silently override the earlier entry.
undersampling_alo = {
    "ClusterCentroids": ClusterCentroids(random_state=0),
    "CondensedNearestNeighbour": CondensedNearestNeighbour(random_state=0),
    "TomekLinks": TomekLinks(),
    "RandomUnderSampler": RandomUnderSampler(random_state=0),
    "OneSidedSelection": OneSidedSelection(random_state=0),
}


In [None]:
dagshub.init(repo_owner='MitVinay', repo_name='youtube_chrome', mlflow=True)
mlflow.set_experiment("Exp3-Undersampling Technique")

# One parent run groups a nested child run per undersampling technique.
with mlflow.start_run() as parent_run:
    for undersample, algo in undersampling_alo.items():
        child_run_name = f"{undersample} TFIDF , 1gram max_features=5000"
        with mlflow.start_run(nested=True, run_name=child_run_name) as child_run:
            print(f"undersample: {undersample}")

            # Resample the TF-IDF training matrix and show the class counts.
            X_resampled, y_resampled = algo.fit_resample(X_train, y_train)
            print(sorted(Counter(y_resampled).items()))

            # Train the RandomForestClassifier on the resampled data.
            rf = RandomForestClassifier(random_state=42)
            rf.fit(X_resampled, y_resampled)

            # Evaluate on the untouched test split.
            y_pred = rf.predict(X_test)
            metrics = classification_report(y_test, y_pred, output_dict=True)

            # 'accuracy' is a single number; every other report entry is a
            # dict of metric name -> value, logged as "<label>_<metric>".
            for label, report_entry in metrics.items():
                if label == 'accuracy':
                    mlflow.log_metric("accuracy", report_entry)
                    continue
                for metric, value in report_entry.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

            # Record the fixed vectorizer settings and the sampler used.
            run_params = (
                ("max_features", 5000),
                ("ngram_range", 1),
                ("vectorizer", "TfidfVectorizer"),
                ("Under Sampling Technique", algo),
            )
            for param_name, param_value in run_params:
                mlflow.log_param(param_name, param_value)
            mlflow.sklearn.log_model(rf, "model")


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=8af0883c-9a71-4c8d-834a-17f136a0710c&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=7e85f2f7f68e5be98a59af5344dfba88479d32f7540b6105ac0b2da3561c2373




2024/12/13 15:41:22 INFO mlflow.tracking.fluent: Experiment with name 'Exp3-Undersampling Technique' does not exist. Creating a new experiment.


undersample: ClusterCentroids
[(-1, 5759), (0, 5759), (1, 5759)]




🏃 View run ClusterCentroids TFIDF , 1gram max_features=5000 at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/4/runs/62ae70686fb84374b55bd4a9145a0279
🧪 View experiment at: https://dagshub.com/MitVinay/youtube_chrome.mlflow/#/experiments/4
undersample: CondensedNearestNeighbour
