In [1]:
import re
import string
import nltk

# Download required NLTK data packages (only needs to be done once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import pandas as pd
import numpy as np

# Scikit-learn imports for vectorization, classification and evaluation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hariharakumarrathinar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Create instances of the stemmer and lemmatizer (choose one based on your preference)
ps = PorterStemmer()
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def custom_tokenizer(text, use_stemming=True, use_lemmatization=False):
    """
    Process text by lowercasing, removing punctuation, tokenizing,
    removing stopwords and applying stemming or lemmatization.
    
    Parameters:
        text (str): The input text.
        use_stemming (bool): If True, apply stemming.
        use_lemmatization (bool): If True, apply lemmatization.
                                  (If both are True, stemming is applied.)
    
    Returns:
        List[str]: A list of processed tokens.
    """
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation using regex (this removes punctuation characters)
    text = re.sub(r'[' + string.punctuation + ']', '', text)
    
    # Tokenize the text using NLTK
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [t for t in tokens if t not in stop_words]
    
    # Apply stemming or lemmatization (if desired)
    if use_stemming:
        tokens = [ps.stem(t) for t in tokens]
    elif use_lemmatization:
        tokens = [wnl.lemmatize(t) for t in tokens]
    
    return tokens

In [3]:
# Define the path to your CSV file (update the path as needed)
csv_file_path = "../Data/processed/Final_Noon_Datasets.csv"

# Load the CSV file into a Pandas DataFrame
try:
    df = pd.read_csv(csv_file_path)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    exit()

# Check for the expected columns
if 'comment_text' not in df.columns or 'Sentiment' not in df.columns:
    print("The CSV file must contain both 'comment_text' and 'Sentiment' columns.")
    exit()

# Optionally, display a few rows to verify the data
print(df.head())

The CSV file must contain both 'comment_text' and 'Sentiment' columns.
   post_id                                     post_title comment_id  \
0  1ibzgq5  Dubai to Riyadh - Which Road to Take? 1 or 2?    m9mjkdv   
1  1ibzgq5  Dubai to Riyadh - Which Road to Take? 1 or 2?    m9mgv03   
2  1ibzgq5  Dubai to Riyadh - Which Road to Take? 1 or 2?    m9mfqls   
3  1i9vydl                   Warranty from Noon or Amazon    m9lj27y   
4  1i9vydl                   Warranty from Noon or Amazon    m9lhuq1   

             author                                       comment_text  score  \
0    99DragonMaster                          will reach border by noon      2   
1   AgileBadger5988  Be careful of blowing dust during late morning...      9   
2     jamesdongdong  Apart from boring things, you can managed. How...      1   
3  Agitated-Fox2818  You are looking at a tag in noon app saying 1 ...      2   
4             m2bop  Are you sure? There's nothing indicating that ...      1   

         