In [78]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [69]:
df = pd.read_csv("../Data/Raw/IMDB Dataset.csv")

In [70]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
def clean_text(text):
    """
    Cleans and preprocesses a given text string.

    Steps:
    1. Converts text to lowercase.
    2. Removes HTML tags.
    3. Removes URLs starting with http(s) or www.
    4. Reduces repeated punctuation marks (!, ?, .) to a single instance.
    5. Limits repeated characters to a maximum of two (e.g., 'loooove' -> 'loove').
    6. Removes all numbers and fractions.
    7. Removes all non-alphanumeric characters (punctuation, symbols).
    8. Replaces multiple spaces with a single space and trims leading/trailing spaces.
    9. Tokenizes text into words.
    10. Removes English stop words.
    11. Lemmatizes each word to its base form.
    12. Joins the words back into a single cleaned string.

    Parameters:
        text (str): The raw input text to clean.

    Returns:
        str: The cleaned and preprocessed text.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'([!?.])\1+', r'\1', text)  # Reduce repeated punctuation (!, ?, .)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)  # Limit repeated characters to 2
    text = re.sub(r'\d+/\d+|\d+', '', text)  # Remove numbers and fractions
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and symbols
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and trim

    words = text.split()  # Split text into words
    stop_words = set(stopwords.words('english'))  # Get English stop words
    lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
    words = [word for word in words if word not in stop_words]  # Remove stop words
    words = " ".join(lemmatizer.lemmatize(word) for word in words)  # Lemmatize and join words

    return words  # Return cleaned text


In [73]:
df['cleaned_text'] = df['review'].apply(clean_text)

In [77]:
df[['review', 'cleaned_text', 'sentiment']].tail()

Unnamed: 0,review,cleaned_text,sentiment
49995,I thought this movie did a down right good job...,thought movie right good job wasnt creative or...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",bad plot bad dialogue bad acting idiotic direc...,negative
49997,I am a Catholic taught in parochial elementary...,catholic taught parochial elementary school nu...,negative
49998,I'm going to have to disagree with the previou...,im going disagree previous comment side maltin...,negative
49999,No one expects the Star Trek movies to be high...,one expects star trek movie high art fan expec...,negative


In [81]:
df[['cleaned_text', 'sentiment']].to_csv(
    "../Data/processed_data/cleaned_reviews.csv",
    index=False
)