In [1]:
import nltk

# Download the required resources
nltk.download('wordnet')       # For lemmatization
nltk.download('stopwords')     # For stop words
nltk.download('punkt')         # For tokenization


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joenorton/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joenorton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joenorton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
# Step 1: Data Cleaning
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
recipe_reviews_file_path = '/Users/joenorton/Desktop/DSA460/RecipeProject/Recipe Reviews and User Feedback Dataset.csv'  # Replace with your file path
recipe_reviews_df = pd.read_csv(recipe_reviews_file_path)

# Remove rows with 0 stars (indicating no reviews)
recipe_reviews_cleaned_df = recipe_reviews_df[recipe_reviews_df['stars'] != 0]

# Drop rows with any missing values in 'text' or 'stars' columns
recipe_reviews_cleaned_df = recipe_reviews_cleaned_df.dropna(subset=['text', 'stars'])

# Step 2: Text Preprocessing - Cleaning Text (removal of punctuation, stop words, and lemmatization)
# Initialize the lemmatizer and stop words
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Clean text function: remove punctuation, stop words, and lemmatize
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()  # Tokenize by whitespace
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]  # Lemmatize and remove stop words
    return " ".join(words)

# Apply the cleaning function to the 'text' column
recipe_reviews_cleaned_df['cleaned_text'] = recipe_reviews_cleaned_df['text'].apply(clean_text)

# Step 3: TF-IDF Vectorization with scikit-learn's built-in stopwords
# Apply the TF-IDF vectorizer to the 'cleaned_text' data (with stopwords removal built into scikit-learn)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # Limiting to top 5000 features

# Apply the vectorizer to the cleaned text data
X_tfidf = tfidf_vectorizer.fit_transform(recipe_reviews_cleaned_df['cleaned_text'])

# Step 4: Feature Normalization (Standard Scaling)
scaler = StandardScaler(with_mean=False)  # Use with_mean=False because sparse matrices are used

# Fit and transform the TF-IDF data
X_scaled = scaler.fit_transform(X_tfidf)

# The target variable is 'stars'
y = recipe_reviews_cleaned_df['stars']

# Display the shape of the transformed TF-IDF data and the first few rows of the cleaned text data
print(X_scaled.shape)  # Output the shape of the TF-IDF feature matrix
print(recipe_reviews_cleaned_df[['text', 'cleaned_text']].head())  # Display the cleaned text data


(16484, 5000)
                                                text  \
0  I tweaked it a little, removed onions because ...   
1  Bush used to have a white chili bean and it ma...   
2  I have a very complicated white chicken chili ...   
5  amazing! my boyfriend loved it so much! going ...   
6  Wow!!!  This recipe is excellent as written!! ...   

                                        cleaned_text  
0  tweaked little removed onion onion hater house...  
1  bush used white chili bean made recipe super s...  
2  complicated white chicken chili recipe made ye...  
5       amazing boyfriend loved much going make week  
6  wow recipe excellent written change made used ...  


# Step 1: Data Cleaning

Load the dataset

Remove rows with 0 stars (indicating no reviews)

Drop rows with any missing values in 'text' or 'stars' columns

In [None]:

import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
recipe_reviews_file_path = '/Users/joenorton/Desktop/DSA460/RecipeProject/Recipe Reviews and User Feedback Dataset.csv'
recipe_reviews_df = pd.read_csv(recipe_reviews_file_path)

# Remove rows with 0 stars (indicating no reviews)
recipe_reviews_cleaned_df = recipe_reviews_df[recipe_reviews_df['stars'] != 0]
print("After removing rows with 0 stars:")
print(recipe_reviews_cleaned_df.head())  # Display the cleaned data after removing rows with 0 stars

# Drop rows with any missing values in 'text' or 'stars' columns
recipe_reviews_cleaned_df = recipe_reviews_cleaned_df.dropna(subset=['text', 'stars'])
print("\nAfter removing rows with missing 'text' or 'stars':")
print(recipe_reviews_cleaned_df.head())  # Display the data after dropping rows with missing values

After removing rows with 0 stars:
   Unnamed: 0  recipe_number  recipe_code         recipe_name  \
0           0              1        14299  Creamy White Chili   
1           1              1        14299  Creamy White Chili   
2           2              1        14299  Creamy White Chili   
5           5              1        14299  Creamy White Chili   
6           6              1        14299  Creamy White Chili   

                                        comment_id         user_id  \
0  sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM  u_9iFLIhMa8QaG   
1  sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY  u_Lu6p25tmE77j   
2  sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP  u_s0LwgpZ8Jsqq   
5  sp_aUSaElGf_14299_c_2Do918IutExN0pWEOFMU4cbiT8v  u_BALTQJIvWtYr   
6  sp_aUSaElGf_14299_c_24hhcbywpsgGqG7yeDFH1IPZCb8  u_HuJVXMzQqJoI   

    user_name  user_reputation  created_at  reply_count  thumbs_up  \
0     Jeri326                1  1665619889            0          0   
1     Mark467 

# Step 2: Text Preprocessing - Cleaning Text (removal of punctuation, stop words, and lemmatization)

Initialize the lemmatizer and stop words

Clean text function: remove punctuation, stop words, and lemmatize

Apply the cleaning function to the 'text' column

In [None]:
# Initialize the lemmatizer and stop words
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Clean text function: remove punctuation, stop words, and lemmatize
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()  # Tokenize by whitespace
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]  # Lemmatize and remove stop words
    return " ".join(words)

# Apply the cleaning function to the 'text' column
recipe_reviews_cleaned_df['cleaned_text'] = recipe_reviews_cleaned_df['text'].apply(clean_text)
print("\nAfter cleaning the text (removal of punctuation, stop words, and lemmatization):")
print(recipe_reviews_cleaned_df[['text', 'cleaned_text']].head())  # Display the original and cleaned text


After cleaning the text (removal of punctuation, stop words, and lemmatization):
                                                text  \
0  I tweaked it a little, removed onions because ...   
1  Bush used to have a white chili bean and it ma...   
2  I have a very complicated white chicken chili ...   
5  amazing! my boyfriend loved it so much! going ...   
6  Wow!!!  This recipe is excellent as written!! ...   

                                        cleaned_text  
0  tweaked little removed onion onion hater house...  
1  bush used white chili bean made recipe super s...  
2  complicated white chicken chili recipe made ye...  
5       amazing boyfriend loved much going make week  
6  wow recipe excellent written change made used ...  


In [4]:
# Step 3: TF-IDF Vectorization with scikit-learn's built-in stopwords
# Apply the TF-IDF vectorizer to the 'cleaned_text' data (with stopwords removal built into scikit-learn)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # Limiting to top 5000 features

# Apply the vectorizer to the cleaned text data
X_tfidf = tfidf_vectorizer.fit_transform(recipe_reviews_cleaned_df['cleaned_text'])
print("\nAfter TF-IDF Vectorization (first few TF-IDF features):")
print(X_tfidf.shape)  # Display the shape of the TF-IDF matrix
print("Sample TF-IDF features:", tfidf_vectorizer.get_feature_names_out()[:10])  # Display the first 10 TF-IDF features


After TF-IDF Vectorization (first few TF-IDF features):
(16484, 5000)
Sample TF-IDF features: ['03' '10' '100' '1010' '1012' '1015' '1034' '105' '10min' '10x']


In [6]:
# Step 4: Feature Normalization (Standard Scaling)
scaler = StandardScaler(with_mean=False)  # Use with_mean=False because sparse matrices are used

# Fit and transform the TF-IDF data
X_scaled = scaler.fit_transform(X_tfidf)
print("\nAfter feature normalization (Standard Scaling):")
print(X_scaled.shape)  # Display the shape of the scaled features

# The target variable is 'stars'
y = recipe_reviews_cleaned_df['stars']

# Display the final cleaned text and feature matrix shape
print("\nFinal dataset for model training:")
print(recipe_reviews_cleaned_df[['text', 'cleaned_text', 'stars']].head())  # Display final data

final_df = recipe_reviews_cleaned_df[['text', 'cleaned_text', 'stars']]
final_df.to_csv('final_preprocessed_recipe_reviews.csv', index=False)  # Save to CSV
print("Final preprocessed dataset saved to 'final_preprocessed_recipe_reviews.csv'")


After feature normalization (Standard Scaling):
(16484, 5000)

Final dataset for model training:
                                                text  \
0  I tweaked it a little, removed onions because ...   
1  Bush used to have a white chili bean and it ma...   
2  I have a very complicated white chicken chili ...   
5  amazing! my boyfriend loved it so much! going ...   
6  Wow!!!  This recipe is excellent as written!! ...   

                                        cleaned_text  stars  
0  tweaked little removed onion onion hater house...      5  
1  bush used white chili bean made recipe super s...      5  
2  complicated white chicken chili recipe made ye...      5  
5       amazing boyfriend loved much going make week      5  
6  wow recipe excellent written change made used ...      5  
Final preprocessed dataset saved to 'final_preprocessed_recipe_reviews.csv'


In [7]:
# Combine original columns (including 'text', 'stars', and 'cleaned_text') with the TF-IDF and scaled features
# Reapply all previous transformations

# Re-apply text preprocessing (if needed)
recipe_reviews_cleaned_df['cleaned_text'] = recipe_reviews_cleaned_df['text'].apply(clean_text)

# Apply the TF-IDF vectorizer to the 'cleaned_text' column
X_tfidf = tfidf_vectorizer.fit_transform(recipe_reviews_cleaned_df['cleaned_text'])

# Create a DataFrame for the TF-IDF features
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine the TF-IDF features with the original columns (text, stars, cleaned_text)
combined_df_tfidf = pd.concat([recipe_reviews_cleaned_df[['text', 'cleaned_text', 'stars']], X_tfidf_df], axis=1)

# Save the dataframe with TF-IDF features to a CSV file
combined_df_tfidf.to_csv('recipe_reviews_with_tfidf_features.csv', index=False)
print("Data with TF-IDF features saved to 'recipe_reviews_with_tfidf_features.csv'")

# Apply feature scaling (standardization) to the TF-IDF features
X_scaled = scaler.fit_transform(X_tfidf)

# Create a DataFrame for the scaled features
X_scaled_df = pd.DataFrame(X_scaled, columns=tfidf_vectorizer.get_feature_names_out())

# Combine the scaled features with the original columns (text, stars, cleaned_text)
combined_df_scaled = pd.concat([recipe_reviews_cleaned_df[['text', 'cleaned_text', 'stars']], X_scaled_df], axis=1)

# Save the dataframe with scaled features to a CSV file
combined_df_scaled.to_csv('recipe_reviews_with_scaled_features.csv', index=False)
print("Data with scaled features saved to 'recipe_reviews_with_scaled_features.csv'")

# Final dataset combining the cleaned text, stars, TF-IDF, and scaled features
final_df = pd.concat([recipe_reviews_cleaned_df[['text', 'cleaned_text', 'stars']], X_scaled_df], axis=1)

# Save the final preprocessed dataset with features to a CSV file
final_df.to_csv('final_preprocessed_recipe_reviews_with_features.csv', index=False)
print("Final preprocessed dataset with features saved to 'final_preprocessed_recipe_reviews_with_features.csv'")


Data with TF-IDF features saved to 'recipe_reviews_with_tfidf_features.csv'


ValueError: Shape of passed values is (16484, 1), indices imply (16484, 5000)

In [10]:
import pandas as pd
import string
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
file_path = '/Users/joenorton/Desktop/DSA460/RecipeProject/Recipe Reviews and User Feedback Dataset.csv'
df = pd.read_csv(file_path)

# Initialize stemmer
ps = PorterStemmer()

# Step 1: Drop rows with 0 stars
df = df[df['stars'] != 0]

# Step 2: Define text cleaning function (without stopword removal and lemmatization)
def clean_text_stemming_only(text):
    # Handle missing values (fill NaN with empty string)
    if not isinstance(text, str):
        text = ""
    
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Split text into words manually (space-separated tokens)
    words = text.split()

    # Apply stemming
    words = [ps.stem(word) for word in words]

    # Join words back into a single string
    cleaned_text = " ".join(words)
    return cleaned_text

# Step 3: Apply text cleaning
df['cleaned_text'] = df['text'].apply(clean_text_stemming_only)

# Step 4: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features to avoid too large a matrix
X_tfidf = vectorizer.fit_transform(df['cleaned_text'])

# Step 5: Min-Max normalization on numerical features
numerical_features = ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'best_score']
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Save the final preprocessed dataset with features to a CSV file
df.to_csv('final_preprocessed_recipe_reviews_with_features1.csv', index=False)
print("Final preprocessed dataset with features saved to 'final_preprocessed_recipe_reviews_with_features1.csv'")

# Display a sample of the transformed data
print(df.head())


Final preprocessed dataset with features saved to 'final_preprocessed_recipe_reviews_with_features1.csv'
   Unnamed: 0  recipe_number  recipe_code         recipe_name  \
0           0              1        14299  Creamy White Chili   
1           1              1        14299  Creamy White Chili   
2           2              1        14299  Creamy White Chili   
5           5              1        14299  Creamy White Chili   
6           6              1        14299  Creamy White Chili   

                                        comment_id         user_id  \
0  sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM  u_9iFLIhMa8QaG   
1  sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY  u_Lu6p25tmE77j   
2  sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP  u_s0LwgpZ8Jsqq   
5  sp_aUSaElGf_14299_c_2Do918IutExN0pWEOFMU4cbiT8v  u_BALTQJIvWtYr   
6  sp_aUSaElGf_14299_c_24hhcbywpsgGqG7yeDFH1IPZCb8  u_HuJVXMzQqJoI   

    user_name  user_reputation  created_at  reply_count  thumbs_up  \
0     Jeri326