Preprocessing
This notebook preprocesses the review texts. It tokenizes each review, removes stopwords, and then stems and lemmatizes the remaining tokens. Next, the sentiment of each review is determined with the nltk package. Finally, TF-IDF and Bag-of-Words representations are computed. The resulting files are saved to data/intermediate.

In [1]:
import os
import json
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Ensure necessary NLTK resources are downloaded.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the Punkt tokenizer data used by word_tokenize from 'punkt_tab';
# without it word_tokenize raises LookupError on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)

# Initialize stemmer, lemmatizer, and sentiment analyzer (shared by the
# preprocessing and sentiment functions defined further down)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lucie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lucie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lucie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lucie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# --- File Paths Setup ---
# Base directory: one level above the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Name of the cleaned-reviews file to process
input_filename = "cleaned_reviews_2021-01.json" # Adjust as needed

# Input and output both live in <base_dir>/data/intermediate
input_dir = os.path.join(base_dir, "data", "intermediate")
input_file = os.path.join(input_dir, input_filename)
output_dir = os.path.join(base_dir, "data", "intermediate")

# Make sure the output directory exists before anything is written to it
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Function to read a JSON Lines file (one JSON object per line)
def load_dataset(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded file containing one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line; columns come from the JSON keys.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines (e.g. a trailing newline at the
        # end of the file); json.loads would raise JSONDecodeError on them.
        reviews = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(reviews)

# Cache of stopword sets keyed by language, so the NLTK stopword corpus is
# read once instead of on every call to preprocess_text
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the cached frozenset of NLTK stopwords for `language`."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


# Function for tokenizing, removing stopwords, and stemming/lemmatizing text
def preprocess_text(text):
    """Tokenize `text`, drop English stopwords, then stem and lemmatize.

    Stemming and lemmatization are both applied to the stopword-filtered
    tokens independently (the lemmatizer does not see the stemmed forms).

    Parameters
    ----------
    text : str
        The review text to process.

    Returns
    -------
    dict
        "tokens": the tokens left after stopword removal,
        "stemmed": those tokens passed through the module-level `stemmer`,
        "lemmatized": those tokens passed through the module-level `lemmatizer`.
    """
    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords. The NLTK stopword list is lowercase, so compare the
    # lowercased token; the token itself keeps its original casing.
    stop_words = _get_stop_words('english')
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Apply stemming and lemmatization
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return {
        "tokens": filtered_tokens,
        "stemmed": stemmed_tokens,
        "lemmatized": lemmatized_tokens,
    }

In [5]:
# Sentiment Analysis Function
def analyze_sentiment(text):
    """Classify `text` by the compound polarity score from the module-level `sia`.

    Returns "positive" when the compound score is at least 0.05, "negative"
    when it is at most -0.05, and "neutral" for anything in between.
    """
    compound = sia.polarity_scores(text)["compound"]

    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

TF-IDF Function and Bag of Words

In [6]:
# TF-IDF Vectorization function
def process_tfidf(df):
    """Preprocess the review texts and compute their TF-IDF features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column with the review texts. The frame passed
        in is not modified; the new columns are added to a new frame.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        - the input frame plus 'processed_text' (the dict returned by
          preprocess_text) and 'lemmatized_text' (lemmatized tokens joined
          by single spaces),
        - the dense TF-IDF matrix, one row per review (same index as the
          first frame) and one column per vocabulary term.
    """
    # Apply preprocessing on a new frame so the caller's DataFrame is not
    # mutated as a side effect of calling this function
    processed = df.assign(processed_text=df['text'].apply(preprocess_text))
    processed['lemmatized_text'] = processed['processed_text'].apply(
        lambda parts: ' '.join(parts['lemmatized'])
    )

    # Apply optimized TF-IDF Vectorization
    vectorizer = TfidfVectorizer(
        max_features=5000,  # Limit vocabulary size for efficiency
        sublinear_tf=True,  # Scale term frequency logarithmically
        max_df=0.95,  # Ignore very common words
        min_df=5,  # Ignore very rare words
        ngram_range=(1, 2),  # Consider unigrams and bigrams
    )

    tfidf_matrix = vectorizer.fit_transform(processed['lemmatized_text'])

    # Convert the TF-IDF matrix to a DataFrame; reuse the review index so
    # rows stay aligned with `processed` even if its index is not 0..n-1
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=processed.index,
    )

    return processed, tfidf_df

# Bag-of-Words Vectorization function
def process_bag_of_words(df):
    """Build a Bag-of-Words count table from the 'text' column of `df`.

    The texts are vectorized with a CountVectorizer configured with English
    stop words, lowercasing, and a vocabulary capped at 1000 features.

    Returns a DataFrame with one row per review and one column per
    vocabulary term, holding the term counts.
    """
    count_vectorizer = CountVectorizer(stop_words='english', lowercase=True, max_features=1000)

    # Learn the vocabulary and count term occurrences in one pass
    term_counts = count_vectorizer.fit_transform(df['text'].tolist())

    # Densify the counts and label the columns with the vocabulary terms
    return pd.DataFrame(
        term_counts.toarray(),
        columns=count_vectorizer.get_feature_names_out(),
    )






In [8]:
# Save processed data to files
def save_processed_data(df, tfidf_df, bow_df):
    """Write the preprocessing results into the module-level `output_dir`.

    Files written:
    - processed_reviews_tokens.json: 'review_id' and 'processed_text' (JSON Lines)
    - lemmatized_text.json: 'review_id' and 'lemmatized_text' (JSON Lines)
    - tfidf_scores.csv: `tfidf_df` without its index
    - bow_vectors.csv: `bow_df` without its index
    """
    def _target(filename):
        # Resolve a file name inside the shared output directory
        return os.path.join(output_dir, filename)

    # Per-review JSON Lines exports: (column to export, destination file)
    json_exports = (
        ('processed_text', "processed_reviews_tokens.json"),
        ('lemmatized_text', "lemmatized_text.json"),
    )
    for column, filename in json_exports:
        df[['review_id', column]].to_json(_target(filename), orient='records', lines=True)

    # Feature matrices go to CSV without the index column
    tfidf_df.to_csv(_target("tfidf_scores.csv"), index=False)
    bow_df.to_csv(_target("bow_vectors.csv"), index=False)

    print("Processed data saved to files.")

In [10]:
# Read the cleaned reviews into a DataFrame
df = load_dataset(input_file)

# Label every review with its sentiment, then store the labelled reviews
# as JSON Lines in the output directory
sentiment_labels = df['text'].apply(analyze_sentiment)
df['sentiment'] = sentiment_labels
sentiment_path = os.path.join(output_dir, "reviews_with_sentiment.json")
df.to_json(sentiment_path, orient="records", lines=True)

In [None]:
# Build the TF-IDF features (takes a while to run)
processed_df, tfidf_df = process_tfidf(df)

# Preview a few review columns and the first TF-IDF rows
preview_columns = ['review_id', 'text', 'sentiment']
print(processed_df[preview_columns].head())
print(tfidf_df.head())

                review_id                                               text  \
0  iBUJvIOkToh2ZECVNq5PDg  ive been eating at this restaurant for over 5 ...   
1  HgEofz6qEQqKYPT7YLA34w  how does a delivery person from here get lost ...   
2  Kxo5d6EOnOE-vERwQf2a1w  the service is always good the employees are n...   
3  STqHwh6xd05bgS6FoAgRqw  two words whipped feta explosion of amazingnes...   
4  75Ckhq13s7k-crts_0MY9g  place was great as well as parking food was go...   

  sentiment  
0  positive  
1  positive  
2  positive  
3  positive  
4  positive  
    10  10 min  10 minute  10 star  10 year  100  1000  1010  1010 would   11  \
0  0.0     0.0        0.0      0.0      0.0  0.0   0.0   0.0         0.0  0.0   
1  0.0     0.0        0.0      0.0      0.0  0.0   0.0   0.0         0.0  0.0   
2  0.0     0.0        0.0      0.0      0.0  0.0   0.0   0.0         0.0  0.0   
3  0.0     0.0        0.0      0.0      0.0  0.0   0.0   0.0         0.0  0.0   
4  0.0     0.0        0.0     

In [12]:
# Build the Bag-of-Words count table and preview its first rows
bow_df = process_bag_of_words(df)
bow_preview = bow_df.head()
print(bow_preview)

   10  100  1010  12  15  20  2020  2021  25  30  ...  years  yelp  yes  \
0   0    0     0   0   0   0     0     0   0   0  ...      1     0    0   
1   0    0     0   0   0   0     0     0   0   0  ...      0     0    0   
2   0    0     0   0   0   0     0     0   0   0  ...      0     0    0   
3   0    0     0   0   0   0     0     0   0   0  ...      0     0    0   
4   0    0     0   0   0   0     0     0   0   0  ...      0     0    0   

   yesterday  youll  young  youre  yum  yummy  zero  
0          0      0      0      0    0      0     0  
1          0      0      0      0    0      0     0  
2          0      0      0      0    0      0     0  
3          0      0      0      0    0      0     0  
4          0      0      0      0    0      0     0  

[5 rows x 1000 columns]


In [13]:
# Write the processed reviews, TF-IDF scores and BoW vectors to disk
# (this also takes a while to run)
save_processed_data(df=processed_df, tfidf_df=tfidf_df, bow_df=bow_df)

Processed data saved to files.
