In [23]:
# Import necessary libraries and download NLTK resources
import os
import re
import string
import pandas as pd
import numpy as np
import nltk

# Download necessary NLTK resources (only needed once)
nltk.download('punkt')       # Standard tokenizer resource
nltk.download('punkt_tab')   # Additional resource required by some tokenizers
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Scikit-learn imports for vectorization, modeling, and evaluation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Section C: Data Loading and Preprocessing (Code)

In [24]:
# Define the path to your CSV file (update the path as needed)
csv_file_path = "../Data/processed/noon_filtered_comments_with_sentiment_nltk.csv"

# Check if the file exists
if not os.path.exists(csv_file_path):
    raise FileNotFoundError(f"CSV file not found at {csv_file_path}. Please check the path.")

# Load the CSV file into a Pandas DataFrame
try:
    df = pd.read_csv(csv_file_path)
except Exception as e:
    raise Exception(f"Error loading CSV file: {e}")

# Remove any extra whitespace from column names
df.columns = df.columns.str.strip()

# Print out the column names for inspection
print("Columns in the DataFrame:")
print(df.columns)

# Verify that required columns exist: 'comment_text' and 'Target Value'
if 'comment_text' not in df.columns or 'Target Value' not in df.columns:
    raise KeyError("The CSV file must contain both 'comment_text' and 'Target Value' columns.")

# Display the first few rows of the DataFrame
print("\nFirst few rows of the DataFrame:")
print(df.head())

# Preprocess the "Target Value" column to boolean
df['Target Value'] = df['Target Value'].apply(lambda x: True if str(x).strip().upper() == "TRUE" else False)

Columns in the DataFrame:
Index(['post_id', 'post_title', 'comment_id', 'author', 'comment_text',
       'score', 'created_utc', 'is_submitter', 'contains_noon_mention', 'type',
       'Sentiment', 'Positive', 'Negative', 'Neutral', 'Overall',
       'Sentiment_Nltk', 'Target Value', 'Unnamed: 17'],
      dtype='object')

First few rows of the DataFrame:
   post_id                                         post_title comment_id  \
0  1ec7r4l  My bank just woke me up at 2:00 in the morning...    ley1hzs   
1  1ec7r4l  My bank just woke me up at 2:00 in the morning...    lf0s1re   
2  1ec7r4l  My bank just woke me up at 2:00 in the morning...    lf06mu0   
3  1ec7r4l  My bank just woke me up at 2:00 in the morning...    lf1m0tu   
4  1ec7r4l  My bank just woke me up at 2:00 in the morning...    lfartnt   

                author                                       comment_text  \
0          hanihaneefa  Noon doesn't ask for otp when using cards... H...   
1       SnooGuavas4756  UPDATE: 

Text Analytics Pipeline – Preprocessing & Feature Extraction (Code)

Define a Custom Tokenizer (Code)

In [25]:
# Initialize stopwords and the Porter Stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def custom_tokenizer(text):
    """
    Tokenize text by:
      - Converting to lowercase
      - Removing punctuation
      - Tokenizing using NLTK's word_tokenize
      - Removing English stopwords
      - Applying stemming using PorterStemmer
    Returns:
      A list of processed tokens.
    """
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation using regex
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Apply stemming
    tokens = [ps.stem(token) for token in tokens]
    
    return tokens

Prepare Features and Split Data (Code)

In [26]:
# Create features (X) and labels (y)
X = df['comment_text']
y = df['Target Value']

# Split the data into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Build the Pipeline and Hyperparameter Tuning (Code)

In [28]:
import warnings

# Suppress the specific warning about token_pattern not being used
warnings.filterwarnings(
    "ignore",
    message="The parameter 'token_pattern' will not be used since 'tokenizer' is not None"
)

# Build the text analytics pipeline with TF–IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        tokenizer=custom_tokenizer,   # Use our custom tokenizer
        ngram_range=(1, 2),           # Experiment with unigrams and bigrams (you can try (1,1) or (1,3))
        min_df=5,                     # Ignore terms that appear in fewer than 5 documents
        stop_words=None               # Stopwords are already removed in our custom tokenizer
    )),
    ('clf', LogisticRegression(max_iter=200))
])

# Define the grid of parameters for hyperparameter tuning
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

# Set up GridSearchCV
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print("\nBest cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)




Best cross-validation score: 0.94
Best parameters:  {'clf__C': 0.001, 'tfidf__ngram_range': (1, 1)}


Model Evaluation

In [29]:
# Predict on the test set using the best found model
y_pred = grid.predict(X_test)

# Print a detailed classification report and accuracy
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))


Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00         5
        True       0.95      1.00      0.98        99

    accuracy                           0.95       104
   macro avg       0.48      0.50      0.49       104
weighted avg       0.91      0.95      0.93       104

Test Accuracy: 0.95


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Visualization and Insights (Markdown & Code)

In [30]:
# Example: Generate a word cloud for positive comments
# (Make sure to install the wordcloud package: pip install wordcloud)
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Filter out positive comments from the original DataFrame (or use a different condition)
positive_comments = df[df['Sentiment_Nltk'] == "Positive"]['comment_text']

# Combine all positive comments into one large string
positive_text = " ".join(positive_comments.astype(str).tolist())

# Create a word cloud object and generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_text)

# Display the generated image:
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud for Positive Comments")
plt.show()

ModuleNotFoundError: No module named 'wordcloud'