# Preprocessing:

In [6]:
import re
import string
from wordcloud import STOPWORDS
import pandas as pd

df = pd.read_csv("../data/raw/Dataset.csv")

### cleaning function:

In [7]:
def clean_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove HTML tags like <br>
    text = re.sub(r"<.*?>", " ", text)
    
    # 3. Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # 4. Remove numbers
    text = re.sub(r"\d+", " ", text)
    
    # 5. Remove stopwords
    words = [word for word in text.split() if word not in STOPWORDS]
    
    # 6. Join back into string
    return " ".join(words)


### Applying to data:

In [8]:
df["clean_review"] = df["review"].apply(clean_text)
print(df[["review", "clean_review"]].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        clean_review  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


### Save preprocessed dataset:

In [9]:

df.to_csv("../data/processed/clean_reviews.csv", index=False)
