# NLP Programming Assignment 2

### Jirapat Sereerat 662115004

In [None]:
# !pip install textstat
import pandas as pd
import numpy as np
import re
import time
import nltk
import textstat
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

file_path = 'Sentiment Analysis Dataset.csv'

try:
    df = pd.read_csv(file_path, encoding='latin-1')
except:
    df = pd.read_csv(file_path, encoding='latin-1', error_bad_lines=False)

print(f"Dataset Loaded. Rows: {len(df)}")

# Start Timer
start_time = time.time()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jirapat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jirapat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jirapat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jirapat\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jirapat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Dataset Loaded. Rows: 1048575


In [14]:
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [None]:
# ---------------------------------------------------------
# STEP 1: TEXT CLEANING
# ---------------------------------------------------------

# Regex Patterns
emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P|\|)'
phone_pattern = r'\(?\b[0-9]{3}\)?[-. ]?[0-9]{3}[-. ]?[0-9]{4}\b'
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
account_pattern = r'\b\d{8,16}\b'

stats = {
    'emoticon_removed': 0,
    'special_char_removed': 0,
    'phone_count': 0,
    'email_count': 0,
    'account_count': 0
}

def clean_text_step1(text):
    text = str(text) 
    
    # 1.6 PII Counting (Before cleaning)
    stats['phone_count'] += len(re.findall(phone_pattern, text))
    stats['email_count'] += len(re.findall(email_pattern, text))
    stats['account_count'] += len(re.findall(account_pattern, text))
    
    # 1.5 Emoticon Counting & Removal
    emoticons = re.findall(emoticon_pattern, text)
    stats['emoticon_removed'] += len(emoticons)
    # Remove emoticons 
    text = re.sub(emoticon_pattern, '', text)
    
    # Track length before special char removal
    len_before = len(text)
    
    # Remove Special Characters & Numbers
    text = re.sub(r'[^a-zA-Z\s.,!?\'"]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Calculate removed special chars
    stats['special_char_removed'] += (len_before - len(text))
    
    return text

df['cleaned_text'] = df['SentimentText'].apply(clean_text_step1)
print("Step 1 (Cleaning) Complete.")
df[['SentimentText', 'cleaned_text']].head()

Step 1 (Cleaning) Complete.


Unnamed: 0,SentimentText,cleaned_text
0,is so sad for my APL frie...,is so sad for my APL friend.............
1,I missed the New Moon trail...,I missed the New Moon trailer...
2,omg its already 7:30 :O,omg its already O
3,.. Omgaga. Im sooo im gunna CRy. I'...,.. Omgaga. Im sooo im gunna CRy. I've been at ...
4,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me!!! TT


In [None]:
# ---------------------------------------------------------
# STEP 2: TOKENIZATION
# ---------------------------------------------------------

def step2_tokenize(text):
    # Split into sentences
    sentences = sent_tokenize(text)
    # Split into words (for each sentence)
    words = word_tokenize(text)
    return sentences, words

# We will create a temp column to hold the tuple (sentences, words)
token_data = df['cleaned_text'].apply(step2_tokenize)

df['sentences'] = token_data.apply(lambda x: x[0])
df['raw_tokens'] = token_data.apply(lambda x: x[1])

print("Step 2 (Tokenization) Complete.")
df[['cleaned_text', 'raw_tokens']].head()

Step 2 (Tokenization) Complete.


Unnamed: 0,cleaned_text,raw_tokens
0,is so sad for my APL friend.............,"[is, so, sad, for, my, APL, friend, ............."
1,I missed the New Moon trailer...,"[I, missed, the, New, Moon, trailer, ...]"
2,omg its already O,"[omg, its, already, O]"
3,.. Omgaga. Im sooo im gunna CRy. I've been at ...,"[.., Omgaga, ., Im, sooo, im, gunna, CRy, ., I..."
4,i think mi bf is cheating on me!!! TT,"[i, think, mi, bf, is, cheating, on, me, !, !,..."


In [None]:
# ---------------------------------------------------------
# STEP 3: LOWERCASING & STOP WORDS
# ---------------------------------------------------------
stop_words = set(stopwords.words('english'))

# Counter for stats
stats['lowercase_count'] = 0
stats['stop_word_removed'] = 0

def step3_processing(tokens_list):
    processed_tokens = []
    
    for word in tokens_list:
        # Lowercasing Count (Count uppercase chars before converting)
        stats['lowercase_count'] += sum(1 for c in word if c.isupper())
        
        # Convert to lowercase
        lower_word = word.lower()
        
        # Stop Word Removal
        if lower_word not in stop_words and lower_word.isalnum():
            processed_tokens.append(lower_word)
        else:
            if lower_word in stop_words:
                stats['stop_word_removed'] += 1
                
    return processed_tokens

df['tokens_step3'] = df['raw_tokens'].apply(step3_processing)

print("Step 3 (Lowercasing & Stopwords) Complete.")
df[['tokens_step3']].head()

Step 3 (Lowercasing & Stopwords) Complete.


Unnamed: 0,tokens_step3
0,"[sad, apl, friend]"
1,"[missed, new, moon, trailer]"
2,"[omg, already]"
3,"[omgaga, im, sooo, im, gunna, cry, dentist, si..."
4,"[think, mi, bf, cheating, tt]"


In [None]:
# ---------------------------------------------------------
# STEP 4: STEMMING & LEMMATIZATION
# ---------------------------------------------------------
ps = PorterStemmer()
wnl = WordNetLemmatizer()

def step4_stem_lemma(tokens_list):
    # Apply Lemmatization THEN Stemming
    return [ps.stem(wnl.lemmatize(w)) for w in tokens_list]

df['final_tokens'] = df['tokens_step3'].apply(step4_stem_lemma)

print("Step 4 (Stemming & Lemmatization) Complete.")
df[['tokens_step3', 'final_tokens']].head()

Step 4 (Stemming & Lemmatization) Complete.


Unnamed: 0,tokens_step3,final_tokens
0,"[sad, apl, friend]","[sad, apl, friend]"
1,"[missed, new, moon, trailer]","[miss, new, moon, trailer]"
2,"[omg, already]","[omg, alreadi]"
3,"[omgaga, im, sooo, im, gunna, cry, dentist, si...","[omgaga, im, sooo, im, gunna, cri, dentist, si..."
4,"[think, mi, bf, cheating, tt]","[think, mi, bf, cheat, tt]"


In [24]:
# ---------------------------------------------------------
# STATISTICS & METRICS
# ---------------------------------------------------------

def calculate_row_stats(row):

    # Before
    sents_before = sent_tokenize(str(row['SentimentText']))
    words_before = [w for w in word_tokenize(str(row['SentimentText'])) if w.isalnum()]
    
    # After (Using final tokens and cleaned sentences count)
    sents_after = row['sentences']
    words_after = row['final_tokens']
    
    # Min/Max Sent Len Before
    sent_lens_b = [len(word_tokenize(s)) for s in sents_before]
    min_sent_b = min(sent_lens_b) if sent_lens_b else 0
    max_sent_b = max(sent_lens_b) if sent_lens_b else 0
    
    # Min/Max Sent Len After
    avg_len_after = len(words_after) / len(sents_after) if sents_after else 0
    
    # 2.1 Readability
    flesch_before = textstat.flesch_reading_ease(str(row['SentimentText']))
    flesch_after = textstat.flesch_reading_ease(" ".join(words_after))
    
    # 2.2 Lexical Diversity
    lex_div_before = len(set(words_before)) / len(words_before) if words_before else 0
    lex_div_after = len(set(words_after)) / len(words_after) if words_after else 0
    
    return pd.Series([
        len(words_before), len(words_after),       # Word Counts
        len(sents_before), len(sents_after),       # Sentence Counts
        max([len(w) for w in words_before]) if words_before else 0, # Max Word Len Before
        max([len(w) for w in words_after]) if words_after else 0,   # Max Word Len After
        min_sent_b, max_sent_b,                    # Min/Max Sent Len Before
        avg_len_after,                             # Avg Sent Len After
        flesch_before, flesch_after,               # Readability
        lex_div_before, lex_div_after              # Lexical Diversity
    ])

print("Calculating Metrics...")
metrics_df = df.apply(calculate_row_stats, axis=1)
metrics_df.columns = [
    'words_b', 'words_a', 'sents_b', 'sents_a', 
    'max_word_b', 'max_word_a', 'min_sent_b', 'max_sent_b', 'avg_sent_a',
    'flesch_b', 'flesch_a', 'lex_div_b', 'lex_div_a'
]

# 1.1 Average Sentence Length
avg_sent_len_before = metrics_df['words_b'].sum() / metrics_df['sents_b'].sum()
avg_sent_len_after = metrics_df['words_a'].sum() / metrics_df['sents_a'].sum()

# 1.3 Vocabulary Size 
vocab_before = set()
df['SentimentText'].apply(lambda x: vocab_before.update(word_tokenize(str(x))))
vocab_after = set()
df['final_tokens'].apply(lambda x: vocab_after.update(x))

end_time = time.time()
runtime = end_time - start_time

print("\n" + "="*40)
print("           FINAL REPORT")
print("="*40)
print(f"1.1 Avg Sentence Length:")
print(f"    Before: {avg_sent_len_before:.2f}")
print(f"    After:  {avg_sent_len_after:.2f}")

print(f"\n1.2 Word and Sentence Count:")
print(f"    Words:     {metrics_df['words_b'].sum()} (Before) -> {metrics_df['words_a'].sum()} (After)")
print(f"    Sentences: {metrics_df['sents_b'].sum()} (Before) -> {metrics_df['sents_a'].sum()} (After)")

print(f"\n1.3 Vocabulary Size:")
print(f"    Before: {len(vocab_before)}")
print(f"    After:  {len(vocab_after)}")

print(f"\n1.4 Length Stats:")
print(f"    Max Word Length: {metrics_df['max_word_b'].max()} (Before) -> {metrics_df['max_word_a'].max()} (After)")
print(f"    Avg Min Sentence Length (Before): {metrics_df['min_sent_b'].mean():.2f}")
print(f"    Avg Max Sentence Length (Before): {metrics_df['max_sent_b'].mean():.2f}")

print(f"\n1.5 Removal Counts:")
print(f"    Emoticons Removed:     {stats['emoticon_removed']}")
print(f"    Stop Words Removed:    {stats['stop_word_removed']}")
print(f"    Special Chars Removed: {stats['special_char_removed']}")
print(f"    Lowercase Conversions: {stats['lowercase_count']}")

print(f"\n1.6 PII Counts:")
print(f"    Emails: {stats['email_count']}")
print(f"    Phones: {stats['phone_count']}")
print(f"    Accounts: {stats['account_count']}")

print(f"\n2. Automated Metrics:")
print(f"    Readability (Flesch) Before: {metrics_df['flesch_b'].mean():.2f}")
print(f"    Readability (Flesch) After:  {metrics_df['flesch_a'].mean():.2f}")
print(f"    Lexical Diversity Before:    {metrics_df['lex_div_b'].mean():.2f}")
print(f"    Lexical Diversity After:     {metrics_df['lex_div_a'].mean():.2f}")

print(f"\n3. Runtime:")
print(f"    Total Time: {runtime:.4f} seconds")

Calculating Metrics...

           FINAL REPORT
1.1 Avg Sentence Length:
    Before: 7.63
    After:  4.32

1.2 Word and Sentence Count:
    Words:     13833637.0 (Before) -> 7855752.0 (After)
    Sentences: 1814099.0 (Before) -> 1817565.0 (After)

1.3 Vocabulary Size:
    Before: 696811
    After:  479711

1.4 Length Stats:
    Max Word Length: 117.0 (Before) -> 123.0 (After)
    Avg Min Sentence Length (Before): 9.11
    Avg Max Sentence Length (Before): 12.84

1.5 Removal Counts:
    Emoticons Removed:     11753
    Stop Words Removed:    5883938
    Special Chars Removed: 3606557
    Lowercase Conversions: 3452053

1.6 PII Counts:
    Emails: 417
    Phones: 363
    Accounts: 550

2. Automated Metrics:
    Readability (Flesch) Before: 77.71
    Readability (Flesch) After:  72.42
    Lexical Diversity Before:    0.96
    Lexical Diversity After:     0.98

3. Runtime:
    Total Time: 867.6703 seconds


In [25]:
# ---------------------------------------------------------
# SAVE PROCESSED DATA
# ---------------------------------------------------------

# Create a final dataframe
output_df = pd.DataFrame({
    'original_text': df['SentimentText'],
    'cleaned_text': df['cleaned_text'],
    'final_tokens': df['final_tokens'].apply(lambda x: " ".join(x)),
    'readability_original': metrics_df['flesch_b'],
    'readability_cleaned': metrics_df['flesch_a']
})

output_filename = 'Processed_Sentiment_Dataset.csv'
output_df.to_csv(output_filename, index=False)

print(f"\n[DONE] Processed data saved to '{output_filename}'")


[DONE] Processed data saved to 'Processed_Sentiment_Dataset.csv'
