In [8]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# -----------------------------
# Download Required Resources
# -----------------------------
# NLTK data packages needed by the stopword list, the WordNet lemmatizer,
# the POS tagger and the tokenizer used further down.
for resource in (
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(resource)

# -----------------------------
# Initialize Tools
# -----------------------------
# English stopwords minus the negation words: "not", "no" and "nor" are
# kept in the text because they matter for sentiment analysis.
stop_words = set(stopwords.words('english')) - {"not", "no", "nor"}

lemmatizer = WordNetLemmatizer()

# -----------------------------
# Contractions Dictionary
# -----------------------------
# Lower-case contraction -> expanded form. Expansion must happen before
# punctuation is stripped; otherwise "i'll" collapses to "ill" and negated
# forms such as "mustn't" lose their "not".
contractions_dict = {
    "i'm": "i am",
    "it's": "it is",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "can't": "cannot",
    "won't": "will not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "couldn't": "could not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "you're": "you are",
    "they're": "they are",
    "we're": "we are",
    # Additional negations (keep the "not" for sentiment analysis).
    "mustn't": "must not",
    "mightn't": "might not",
    "needn't": "need not",
    "shan't": "shall not",
    "ain't": "am not",
    # 've / 'll / 'd forms.
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",
    "we'll": "we will",
    "they'll": "they will",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    # 's forms ("is" is the most common reading; either way the expansion
    # consists of ordinary function words).
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "what's": "what is",
    "let's": "let us",
}

# Longest alternatives first so a shorter key can never pre-empt a longer one.
escaped_contractions = [
    re.escape(c) for c in sorted(contractions_dict, key=len, reverse=True)
]
pattern = r'\b(' + '|'.join(escaped_contractions) + r')\b'
# Case-insensitive so "Don't" / "DON'T" match; look-ups must lower-case
# the matched text before indexing contractions_dict.
compiled_pattern = re.compile(pattern, flags=re.IGNORECASE)

# -----------------------------
# Cleaning Functions
# -----------------------------

def normalize_apostrophes(text):
    """Normalize apostrophe look-alikes and re-join spaced-out contractions.

    Typographic quotes, the modifier-letter apostrophe, backticks and acute
    accents are mapped to a plain ASCII apostrophe. Contractions that were
    split by whitespace (``i ' m``, ``don ' t``, ``you ' re``) are then glued
    back together so they can be expanded later.

    Args:
        text: Input string.

    Returns:
        The string with ASCII apostrophes and repaired contractions.
    """
    # One pass over the string instead of chained .replace() calls.
    text = text.translate(str.maketrans({
        "\u2019": "'",  # right single quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u02bc": "'",  # modifier letter apostrophe
        "`": "'",
        "\u00b4": "'",  # acute accent
    }))
    # Re-join only when the right-hand side is a real contraction suffix.
    # The left side may be a whole word ("don ' t"), not just one letter,
    # while quoted words such as "said 'hello'" are left untouched.
    text = re.sub(
        r"\b(\w+)\s*'\s*(m|s|t|d|re|ve|ll)\b",
        r"\1'\2",
        text,
        flags=re.IGNORECASE,
    )
    return text

def remove_urls(text):
    """Remove web addresses from *text*.

    A URL is a token that starts, at a word boundary, with ``http://``,
    ``https://`` or ``www.`` and runs up to the next whitespace. Anchoring on
    the scheme / ``www.`` prefix prevents ordinary words that merely contain
    "www" or start with "http" (e.g. "awwww", "httpd") from being truncated.

    Args:
        text: Input string.

    Returns:
        The string with each URL replaced by the empty string; surrounding
        whitespace is left as-is.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

def remove_html(text):
    """Return the text content of *text* with HTML markup stripped.

    The string is parsed with BeautifulSoup's "html.parser" backend and only
    the extracted text is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_emojis(text):
    """Return *text* with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def remove_punctuation(text):
    """Delete every ASCII punctuation character from *text*.

    The characters removed are exactly those in ``string.punctuation``;
    nothing is inserted in their place, so "well-regarded" becomes
    "wellregarded".
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def remove_numbers(text):
    """Delete every run of digits from *text*; other characters are kept."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def replace_contractions(text):
    """Expand every contraction matched by ``compiled_pattern``.

    Each match is lower-cased and looked up in ``contractions_dict``; the
    expansion replaces the matched text.
    """
    return compiled_pattern.sub(
        lambda found: contractions_dict[found.group(0).lower()],
        text,
    )

def remove_stopwords(text):
    """Drop whitespace-separated tokens whose lower-cased form is a stopword.

    The surviving tokens keep their original casing and are re-joined with
    single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

def remove_single_letters(text):
    """Delete stand-alone lower-case ASCII letters from *text*.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r'\b[a-z]\b')
    return lone_letter.sub('', text)

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    The first letter of *tag* decides: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN

def lemmatize_text(text):
    """Tokenize, POS-tag and lemmatize *text*, returning space-joined lemmas.

    Each token is lemmatized with the WordNet part of speech derived from
    its tag via ``get_wordnet_pos``.
    """
    tokens = word_tokenize(text, preserve_line=True)
    lemmas = []
    for token, tag in pos_tag(tokens, lang='eng'):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)

# -----------------------------
# Full Preprocessing Pipeline
# -----------------------------

def preprocess_text(text):
    """Run the full cleaning pipeline on one review.

    Args:
        text: Raw review text. Any non-string value (e.g. a missing cell)
            yields an empty string.

    Returns:
        The lower-cased review with markup, URLs, emoji, punctuation, digits,
        stopwords and single letters removed, contractions expanded, and the
        remaining words lemmatized.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Strip markup first: the URL pattern consumes everything up to the next
    # whitespace, so running it on raw HTML would swallow the closing part
    # of a tag such as <a href="http://...">link</a> together with its text.
    text = remove_html(text)
    # Normalize apostrophes only after HTML parsing, so characters that come
    # out of the parser (e.g. decoded entities) are normalized as well.
    text = normalize_apostrophes(text)
    text = remove_urls(text)
    text = remove_emojis(text)
    # Contractions must be expanded while their apostrophes still exist.
    text = replace_contractions(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    text = remove_single_letters(text)
    text = lemmatize_text(text)

    return text

# -----------------------------
# Load Dataset
# -----------------------------

df = pd.read_csv("UNITENReview.csv")

# Clean every entry of the "Review" column and store the result alongside it
cleaned_reviews = df["Review"].apply(preprocess_text)
df["Cleaned_Review"] = cleaned_reviews

# Write the augmented table to a new CSV (without the index column)
df.to_csv("UNITENReview_Exercise.csv", index=False)

# Show the first rows of the raw and cleaned text side by side
preview = df[["Review", "Cleaned_Review"]].head()
print(preview)

                                              Review  \
0  Im happy with uniten actually, even the people...   
1  I’m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                      Cleaned_Review  
0               im happy uniten actually even people  
1                 pretty good time happy meet people  
2                      neutral place term everything  
3  would say uniten good university issue need im...  
4  uniten wellregarded particularly strong engine...  


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/d754b29c-508d-4bde-938d-
[nltk_data]     34c5de7c5059/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/d754b29c-508d-4bde-938d-
[nltk_data]     34c5de7c5059/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/d754b29c-508d-4bde-938d-
[nltk_data]     34c5de7c5059/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/d754b29c-508d-4bde-938d-
[nltk_data]     34c5de7c5059/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/d754b29c-508d-4bde-938d-
[nltk_data]     34c5de7c5059/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data