In [1]:
pip install emoji

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install autocorrect

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download required NLTK resources (PDF) + punkt (needed by your environment)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('punkt')  # needed because your error said punkt not found

# Initialize tools (PDF)
spell = Speller(lang='en')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Slang dictionary (PDF)
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

# Contractions dictionary (PDF)
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

# === Functions (PDF) ===

def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

def replace_slang(text):
    escaped_slang_words = []
    for word in slang_dict.keys():
        escaped_word = re.escape(word)
        escaped_slang_words.append(escaped_word)

    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'

    def replace_match(match):
        slang_word = match.group(0)
        return slang_dict[slang_word.lower()]

    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)
    return replaced_text

# Build regex for contractions (PDF)
escaped_contractions = []
for contraction in contractions_dict.keys():
    escaped_contraction = re.escape(contraction)
    escaped_contractions.append(escaped_contraction)

joined_contractions = "|".join(escaped_contractions)
contractions_pattern = r'\b(' + joined_contractions + r')\b'
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

def replace_contractions(text):
    def replace_match(match):
        matched_word = match.group(0)
        lower_matched_word = matched_word.lower()
        expanded_form = contractions_dict[lower_matched_word]
        return expanded_form

    expanded_text = compiled_pattern.sub(replace_match, text)
    return expanded_text

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def correct_spelling(text):
    return spell(text)

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return " ".join(lemmatized_words)

def tokenize_text(text):
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

def preprocess_text(text):
    text = text.lower()
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_emojis(text)
    text = replace_slang(text)
    text = replace_contractions(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = correct_spelling(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    text = tokenize_text(text)
    return text

# === Exercise Step 1: identify issues (simple inspection) ===
# Load UNITENReview.csv (if encoding problem, use latin1 like you did before)
df = pd.read_csv("UNITENReview.csv", encoding="latin1")
pd.set_option('display.max_colwidth', None)

print("Columns:", df.columns)
print(df.head())

# These checks help you "identify issues" (URLs/HTML/emojis/numbers) in Review column
review_col = "Review"  # as stated in exercise
print("Null count:", df[review_col].isna().sum())
print("Example with URL:", df[df[review_col].astype(str).str.contains(r'http|www', na=False)].head(1)[review_col])
print("Example with HTML:", df[df[review_col].astype(str).str.contains(r'<.*?>', na=False)].head(1)[review_col])
print("Example with number:", df[df[review_col].astype(str).str.contains(r'\d', na=False)].head(1)[review_col])

# === Exercise Step 2: apply preprocessing ===
df["processed"] = df[review_col].astype(str).apply(preprocess_text)

# === Exercise Step 3: save to csv ===
df.to_csv("Processed_UNITENReview.csv", index=False)
print(df[[review_col, "processed"]].head())
print("Saved: Processed_UNITENReview.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/d8db6fe4-06fa-44a2-a998-
[nltk_data]     5bb20b593b23/nltk_data...
[nltk_data]   Package averaged_perceptro

Columns: Index(['Timestamp', 'Review'], dtype='object')
                     Timestamp  \
0  2025/02/10 7:40:54 pm GMT+8   
1  2025/02/10 7:41:00 pm GMT+8   
2  2025/02/10 7:41:19 pm GMT+8   
3  2025/02/10 7:46:40 pm GMT+8   
4  2025/02/10 7:46:43 pm GMT+8   

                                                                                                                                                                                                                                                                                                                                                         Review  
0                                                                                                                                                                                                                                                                                                          Im happy with uniten actually, even the people are W  
1                              

  return BeautifulSoup(text, "html.parser").get_text()


                                                                                                                                                                                                                                                                                                                                                         Review  \
0                                                                                                                                                                                                                                                                                                          Im happy with uniten actually, even the people are W   
1                                                                                                                                                                                                                                                                                      Iâm havin