# Preprocessing for NLP 2

### Imports

In [1]:
import pandas as pd
import sys
import os

# Make the project's top-level packages importable from this notebook.
# The working directory is typically where the notebook is located or executed.
current_dir = os.getcwd()

# The project root is taken to be the parent of the working directory.
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))  # Adjust this based on your folder structure

# Register the project root only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.preprocessing.text_cleaning import LowercaseText, RemoveSpecialCharacters, RemoveExtraWhitespace, PreprocessWithSpacy



  from .autonotebook import tqdm as notebook_tqdm


### Loading the data

In [None]:
# Location of the cleaned review data, relative to the notebook's working directory.
data_path = '../data/cleaned/cleaned_reviews.csv'

# Read the CSV into the DataFrame that every later cell works on,
# then show it as this cell's output.
df = pd.read_csv(filepath_or_buffer=data_path)
df

Unnamed: 0,text,rating,date,location,review_length,word_count
0,Delicious Thai restaurant! If you're looking ...,5,"Dec 4, 2024","Temecula, CA",198,33
1,"Came here on Valentine, and the foods were goo...",3,"Feb 15, 2025","Jacksonville, FL",485,82
2,unfortunately my husband reacts to msg (ajinom...,1,"Oct 25, 2024","Kailua, HI",733,131
3,Food was delicious...the only drawback is the ...,4,"Jan 7, 2025","San Rafael, CA",165,26
4,Great place for Thai food and just as long as ...,4,"Feb 6, 2024","San Clemente, CA",345,69
5,Really good Thai food. Large portions and ful...,5,"Jan 2, 2024","Cupertino, CA",336,59
6,Came here looklng for Indonesian food for dinn...,4,"Feb 22, 2024","Portishead, United Kingdom",509,98
7,The food is good and the people are nice. The ...,4,"Mar 29, 2024","Lakeside, CA",129,29
8,We were hungry and we when in and this was abs...,5,"Aug 24, 2022","Calgary, Canada",162,32
9,Overall food was good. I enjoyed the papaya sa...,5,"Jan 1, 2024","Champaign, IL",224,39


In [3]:
# Pipeline step: run LowercaseText over every review.
# The 'text' column is overwritten in place, so the original casing is gone
# from `df` after this cell runs.
print("Lowercaseing the text")
text_as_str = df['text'].astype(str)  # coerce every entry to str before processing
df['text'] = text_as_str.apply(LowercaseText().process)
df

Lowercaseing the text


Unnamed: 0,text,rating,date,location,review_length,word_count
0,delicious thai restaurant! if you're looking ...,5,"Dec 4, 2024","Temecula, CA",198,33
1,"came here on valentine, and the foods were goo...",3,"Feb 15, 2025","Jacksonville, FL",485,82
2,unfortunately my husband reacts to msg (ajinom...,1,"Oct 25, 2024","Kailua, HI",733,131
3,food was delicious...the only drawback is the ...,4,"Jan 7, 2025","San Rafael, CA",165,26
4,great place for thai food and just as long as ...,4,"Feb 6, 2024","San Clemente, CA",345,69
5,really good thai food. large portions and ful...,5,"Jan 2, 2024","Cupertino, CA",336,59
6,came here looklng for indonesian food for dinn...,4,"Feb 22, 2024","Portishead, United Kingdom",509,98
7,the food is good and the people are nice. the ...,4,"Mar 29, 2024","Lakeside, CA",129,29
8,we were hungry and we when in and this was abs...,5,"Aug 24, 2022","Calgary, Canada",162,32
9,overall food was good. i enjoyed the papaya sa...,5,"Jan 1, 2024","Champaign, IL",224,39


In [4]:
# Pipeline step: run RemoveSpecialCharacters over every review (in place).
# NOTE(review): in the displayed output "delicious...the" becomes
# "deliciousthe", so punctuation appears to be deleted rather than replaced
# by a space, which merges adjacent words - confirm this is intended.
print("Removing Special Characters in the text")
text_as_str = df['text'].astype(str)  # coerce every entry to str before processing
df['text'] = text_as_str.apply(RemoveSpecialCharacters().process)
df

Removing Special Characters in the text


Unnamed: 0,text,rating,date,location,review_length,word_count
0,delicious thai restaurant if youre looking fo...,5,"Dec 4, 2024","Temecula, CA",198,33
1,came here on valentine and the foods were good...,3,"Feb 15, 2025","Jacksonville, FL",485,82
2,unfortunately my husband reacts to msg ajinomo...,1,"Oct 25, 2024","Kailua, HI",733,131
3,food was deliciousthe only drawback is the fac...,4,"Jan 7, 2025","San Rafael, CA",165,26
4,great place for thai food and just as long as ...,4,"Feb 6, 2024","San Clemente, CA",345,69
5,really good thai food large portions and full...,5,"Jan 2, 2024","Cupertino, CA",336,59
6,came here looklng for indonesian food for dinn...,4,"Feb 22, 2024","Portishead, United Kingdom",509,98
7,the food is good and the people are nice the s...,4,"Mar 29, 2024","Lakeside, CA",129,29
8,we were hungry and we when in and this was abs...,5,"Aug 24, 2022","Calgary, Canada",162,32
9,overall food was good i enjoyed the papaya sal...,5,"Jan 1, 2024","Champaign, IL",224,39


In [5]:
# Pipeline step: run RemoveExtraWhitespace over every review (in place).
# Presumably this collapses repeated spaces left behind by the previous
# steps - verify against src.preprocessing.text_cleaning.
print("Removing Extra Whitespaces in the text")
text_as_str = df['text'].astype(str)  # coerce every entry to str before processing
df['text'] = text_as_str.apply(RemoveExtraWhitespace().process)
df

Removing Extra Whitespaces in the text


Unnamed: 0,text,rating,date,location,review_length,word_count
0,delicious thai restaurant if youre looking for...,5,"Dec 4, 2024","Temecula, CA",198,33
1,came here on valentine and the foods were good...,3,"Feb 15, 2025","Jacksonville, FL",485,82
2,unfortunately my husband reacts to msg ajinomo...,1,"Oct 25, 2024","Kailua, HI",733,131
3,food was deliciousthe only drawback is the fac...,4,"Jan 7, 2025","San Rafael, CA",165,26
4,great place for thai food and just as long as ...,4,"Feb 6, 2024","San Clemente, CA",345,69
5,really good thai food large portions and full ...,5,"Jan 2, 2024","Cupertino, CA",336,59
6,came here looklng for indonesian food for dinn...,4,"Feb 22, 2024","Portishead, United Kingdom",509,98
7,the food is good and the people are nice the s...,4,"Mar 29, 2024","Lakeside, CA",129,29
8,we were hungry and we when in and this was abs...,5,"Aug 24, 2022","Calgary, Canada",162,32
9,overall food was good i enjoyed the papaya sal...,5,"Jan 1, 2024","Champaign, IL",224,39


In [6]:
# Pipeline step: run PreprocessWithSpacy over every review (in place).
# Judging by the displayed output (e.g. "came" -> "come", "the"/"and" gone),
# this looks like lemmatisation plus stop-word removal - confirm in
# src.preprocessing.text_cleaning.
print("Preprocessing the text With Spacy")
text_as_str = df['text'].astype(str)  # coerce every entry to str before processing
df['text'] = text_as_str.apply(PreprocessWithSpacy().process)
df

Preprocessing the text With Spacy


Unnamed: 0,text,rating,date,location,review_length,word_count
0,delicious thai restaurant look fresh flavorful...,5,"Dec 4, 2024","Temecula, CA",198,33
1,come valentine food good order tomyum soup duc...,3,"Feb 15, 2025","Jacksonville, FL",485,82
2,unfortunately husband react msg ajinomoto flav...,1,"Oct 25, 2024","Kailua, HI",733,131
3,food deliciousthe drawback fact card accept pa...,4,"Jan 7, 2025","San Rafael, CA",165,26
4,great place thai food long know extremely busy...,4,"Feb 6, 2024","San Clemente, CA",345,69
5,good thai food large portion flavor place busy...,5,"Jan 2, 2024","Cupertino, CA",336,59
6,come looklng indonesian food dinner early even...,4,"Feb 22, 2024","Portishead, United Kingdom",509,98
7,food good people nice sitting look menu order ...,4,"Mar 29, 2024","Lakeside, CA",129,29
8,hungry absolutely solid thai partner love pad ...,5,"Aug 24, 2022","Calgary, Canada",162,32
9,overall food good enjoy papaya salad habe get ...,5,"Jan 1, 2024","Champaign, IL",224,39


In [7]:
# Spot-check one fully preprocessed review (row label 2).
# A single .loc[row, column] lookup avoids chained indexing, which first
# materialises the whole row before selecting the column.
df.loc[2, "text"]

'unfortunately husband react msg ajinomoto flavor enhancer normally thai food think safe thai restaurant safe talk waitress reaction waitress kindly change order avoid possibility msg currently emergency room bad reaction food despite waitress assurance cook avoid flavor msg food bland bland contain msg specify msg say allergic need hospitalwe avoid home future service good msg confusion ambience good'

### Saving the initially cleaned dataset

In [None]:
# Persist the preprocessed reviews. The target folder is created first
# because DataFrame.to_csv raises if the directory does not exist.
output_path = '../data/processed/cleaned_and_preprocessed_reviews.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv(output_path, index=False)