# NLP Preprocessing for Recipe Review Dataset
This notebook demonstrates how to clean, preprocess, and transform text reviews into usable features for machine learning classification based on star ratings.

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler


## Step 1: Load Dataset

In [2]:
df = pd.read_csv("Recipe Reviews and User Feedback Dataset.csv")
df = df[df['stars'] > 0]  # Remove rows with 0-star ratings
df[['text', 'stars', 'thumbs_up', 'thumbs_down']].head()


Unnamed: 0,text,stars,thumbs_up,thumbs_down
0,"I tweaked it a little, removed onions because ...",5,0,0
1,Bush used to have a white chili bean and it ma...,5,7,0
2,I have a very complicated white chicken chili ...,5,3,0
5,amazing! my boyfriend loved it so much! going ...,5,3,1
6,Wow!!! This recipe is excellent as written!! ...,5,11,0


## Step 2: Download NLTK Resources

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Step 3: Initialize NLP Tools

In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


## Step 4: Define Text Preprocessing Function

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


## Step 5: Apply Text Preprocessing

In [6]:
df_clean = df[['text', 'stars', 'thumbs_up', 'thumbs_down']].dropna()
df_clean = df_clean[df_clean['text'].str.strip() != '']
df_clean['processed_text'] = df_clean['text'].apply(preprocess_text)
df_clean[['text', 'processed_text', 'stars']].head()


Unnamed: 0,text,processed_text,stars
0,"I tweaked it a little, removed onions because ...",tweak littl remov onion onion hater hous use i...,5
1,Bush used to have a white chili bean and it ma...,bush use white chili bean made recip super sim...,5
2,I have a very complicated white chicken chili ...,complic white chicken chili recip made year ev...,5
5,amazing! my boyfriend loved it so much! going ...,amaz boyfriend love much go make week,5
6,Wow!!! This recipe is excellent as written!! ...,wow recip excel written chang made use oz jar ...,5


## Step 6: TF-IDF Vectorization

In [7]:
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_clean['processed_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.head()


Unnamed: 0,abl,absolut,accord,actual,ad,adapt,add,addit,adjust,admit,...,yeast,yellow,yesterday,yield,yogurt,youll,yr,yum,yummi,zucchini
0,0.0,0.151083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Step 7: Normalize Numerical Features

In [8]:
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df_clean[['thumbs_up', 'thumbs_down']]), columns=['thumbs_up_norm', 'thumbs_down_norm'])
scaled_df.head()


Unnamed: 0,thumbs_up_norm,thumbs_down_norm
0,0.0,0.0
1,0.066038,0.0
2,0.028302,0.0
3,0.028302,0.007937
4,0.103774,0.0


## Step 8: Combine Features and Export

In [9]:
final_df = pd.concat([df_clean[['stars']], scaled_df, tfidf_df], axis=1)
final_df.to_excel("final_preprocessed_dataset.xlsx", index=False)
final_df.head()


Unnamed: 0,stars,thumbs_up_norm,thumbs_down_norm,abl,absolut,accord,actual,ad,adapt,add,...,yeast,yellow,yesterday,yield,yogurt,youll,yr,yum,yummi,zucchini
0,5.0,0.0,0.0,0.0,0.151083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,0.066038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.028302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.0,0.056604,0.0,0.0,0.133056,0.0,0.0,0.0,0.0,0.107099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
