In [1]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from googletrans import Translator
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Initialize tools
spell = Speller(lang='en')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
translator = Translator()

# Exclude "UNITEN" from stopword removal
custom_exceptions = {"uniten","UNITEN","Uniten"}

# Function to translate text to English
def translate_text(text):
    try:
        translated = translator.translate(text, dest='en')
        return translated.text
    except Exception as e:
        print(f"Translation Error: {e}")
        return text  # Return original text if translation fails

# Function to remove stopwords but keep "UNITEN"
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words or word in custom_exceptions]
    return " ".join(filtered_words)

# Function to remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Function to remove numbers from text
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Function to lemmatize text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    words = text.split()  # Tokenize text into words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize each word
    return ' '.join(lemmatized_words)  # Join words back into a sentence

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML
    text = emoji.replace_emoji(text, replace='')  # Remove emojis
    text = remove_punctuation(text)  # ✅ Now defined!
    text = remove_numbers(text)  # ✅ Now defined!
    text = spell(text)
    text = remove_stopwords(text)  # Remove stopwords (keeping "UNITEN")
    text = lemmatize_text(text)  # ✅ Now defined!
    text = word_tokenize(text)  # Tokenization
    return text

# Load dataset
df = pd.read_csv("UNITENReview.csv")  # Replace with your actual dataset

# Apply preprocessing
df["processed"] = df["Review"].apply(preprocess_text)

# Save the cleaned dataset
df.to_csv("Processed_Reviews2.csv", index=False)

# Display results
print(df[["Review", "processed"]].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML


                                              Review  \
0  Im happy with uniten actually, even the people...   
1  I’m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                           processed  
0     [im, happy, united, actually, even, people, w]  
1  [i, ’, m, pretty, good, time, happy, meet, w, ...  
2                 [neutral, place, term, everything]  
3  [would, say, united, good, university, issue, ...  
4  [united, wellregarded, particularly, strong, e...  
