Preprocessing
This notebook preprocesses the review texts. It tokenizes each review, removes the stopwords, and then stems and lemmatizes the remaining tokens. Next, the sentiment of each review is determined with the nltk package. Finally, TF-IDF and Bag-of-Words representations are computed. The resulting files are saved to data/intermediate.

In [8]:
import os
import json
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.sparse import csr_matrix, hstack 
from joblib import Parallel, delayed  # For parallel processing

In [9]:
# Ensure necessary NLTK resources are downloaded.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize in place of the pickled 'punkt' models; both are requested so
# tokenization works regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Initialize stemmer, lemmatizer, and sentiment analyzer.
# These module-level objects are looked up by name inside preprocess_text and
# analyze_sentiment, so this cell must run before those functions are called.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tuanl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tuanl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tuanl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\tuanl\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
# --- File Paths Setup ---
# Project root: one level above the notebook's working directory.
# NOTE(review): assumes the notebook is started from a direct subfolder of the
# project root (e.g. src1/) — confirm.
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Define input/output directories using paths relative to the project root
raw_dir = os.path.join(base_dir, "data", "raw")
input_dir = os.path.join(base_dir, "data", "intermediate")
output_dir = os.path.join(base_dir, "data", "intermediate")

# Raw review dump to process. It is resolved against data/raw instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
input_filename = "reviews_2021-01.json"  # Adjust as needed
input_file = os.path.join(raw_dir, input_filename)

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

In [11]:
# Function to read JSON file line by line
def load_dataset(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A pandas DataFrame with one row per parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        reviews = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(reviews)

# Helpers for cleaning, tokenizing, and stemming/lemmatizing text
def _english_stop_words():
    """Return the NLTK English stopword set, reading the corpus only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Tokenize a review, drop English stopwords, then stem and lemmatize.

    Args:
        text: Raw review text (a string).

    Returns:
        A dict with three parallel token lists:
        "tokens" (stopword-filtered tokens in their original casing),
        "stemmed" (each filtered token passed through the module-level
        `stemmer`) and "lemmatized" (each filtered token passed through the
        module-level `lemmatizer`). Punctuation tokens are not removed.
    """
    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords. The NLTK stopword list is lowercase, so the membership
    # test is done on the lowercased token; otherwise capitalised stopwords
    # such as "The" would slip through.
    stop_words = _english_stop_words()
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Apply stemming and lemmatization (both start from the filtered tokens)
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return {
        "tokens": filtered_tokens,
        "stemmed": stemmed_tokens,
        "lemmatized": lemmatized_tokens,
    }

In [12]:
# Sentiment Analysis Function
def analyze_sentiment(text):
    """Classify a text as "positive", "negative" or "neutral".

    The label is derived from the "compound" entry of the scores returned by
    the module-level analyzer `sia`: values of 0.05 or more are positive,
    values of -0.05 or less are negative, everything in between is neutral.
    """
    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

TF-IDF Function

In [13]:
# TF-IDF Vectorization function
def process_tfidf(df):
    """Preprocess the review texts and compute their TF-IDF representation.

    Args:
        df: DataFrame with a 'text' column. It is modified in place: the
            columns 'processed_text' (the dict returned by preprocess_text)
            and 'lemmatized_text' (the lemmatized tokens joined by spaces)
            are added.

    Returns:
        A tuple (df, tfidf_df). `df` is the same object that was passed in;
        `tfidf_df` holds one row per review and one column per vocabulary
        term, and shares its index with `df` so the two can be joined or
        compared row by row.
    """
    # Apply preprocessing and lemmatization
    df['processed_text'] = df['text'].apply(preprocess_text)
    df['lemmatized_text'] = df['processed_text'].apply(lambda x: ' '.join(x['lemmatized']))

    # Apply optimized TF-IDF Vectorization
    vectorizer = TfidfVectorizer(
        max_features=5000,  # Limit vocabulary size for efficiency
        sublinear_tf=True,  # Scale term frequency logarithmically
        max_df=0.95,  # Ignore very common words
        min_df=5,  # Ignore very rare words
        ngram_range=(1, 2),  # Consider unigrams and bigrams
    )

    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_text'])

    # Convert the TF-IDF matrix to a DataFrame. toarray() materialises a dense
    # (rows x vocabulary) array, so memory grows with the number of reviews.
    # Reusing df.index keeps the rows aligned with `df` even when `df` does
    # not carry a default 0..n-1 index (e.g. after filtering).
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )

    return df, tfidf_df

In [20]:
# Save processed data to files
def save_processed_data(df, tfidf_df, bow_df):
    """Write the preprocessing results into the module-level `output_dir`.

    Args:
        df: DataFrame providing 'review_id', 'processed_text' and
            'lemmatized_text' columns; the two text columns are each written
            next to 'review_id' as JSON Lines.
        tfidf_df: DataFrame written to "tfidf_scores.csv" without its index.
        bow_df: DataFrame written to "bow_vectors.csv" without its index.
    """
    def _target(filename):
        # Every export lands in the shared output directory.
        return os.path.join(output_dir, filename)

    # Token dicts first, then the lemmatized text, each keyed by review_id
    json_exports = (
        ('processed_text', "processed_reviews_tokens.json"),
        ('lemmatized_text', "lemmatized_text.json"),
    )
    for column, filename in json_exports:
        df[['review_id', column]].to_json(_target(filename), orient='records', lines=True)

    # Feature matrices: TF-IDF scores, then BoW vectors
    tfidf_df.to_csv(_target("tfidf_scores.csv"), index=False)
    bow_df.to_csv(_target("bow_vectors.csv"), index=False)

    print("Processed data saved to files.")

In [19]:
# Read the raw reviews into a DataFrame
df = load_dataset(input_file)

# Label every review with its sentiment class and persist the labelled
# reviews as JSON Lines in the output directory
sentiment_path = os.path.join(output_dir, "reviews_with_sentiment.json")
df['sentiment'] = df['text'].apply(analyze_sentiment)
df.to_json(sentiment_path, orient="records", lines=True)

In [21]:
# Run the TF-IDF pipeline on the labelled reviews (takes a while to run).
# process_tfidf returns the input frame itself plus the TF-IDF matrix.
processed_df, tfidf_df = process_tfidf(df)

# Preview the first rows of the reviews and of the TF-IDF scores
preview_columns = ['review_id', 'text', 'sentiment']
print(processed_df[preview_columns].head())
print(tfidf_df.head())

                review_id                                               text  \
0  iBUJvIOkToh2ZECVNq5PDg  I've been eating at this restaurant for over 5...   
1  HgEofz6qEQqKYPT7YLA34w  How does a delivery person from here get lost ...   
2  Kxo5d6EOnOE-vERwQf2a1w  The service is always good, the employees are ...   
3  STqHwh6xd05bgS6FoAgRqw  two words: whipped. feta. \nexplosion of amazi...   
4  75Ckhq13s7k-crts_0MY9g  Place was great as well as parking. \nFood was...   

  sentiment  
0  positive  
1  positive  
2  positive  
3  positive  
4  positive  
    00  000   10  10 10  10 15  10 min  10 minute  10 recommend  10 star  \
0  0.0  0.0  0.0    0.0    0.0     0.0        0.0           0.0      0.0   
1  0.0  0.0  0.0    0.0    0.0     0.0        0.0           0.0      0.0   
2  0.0  0.0  0.0    0.0    0.0     0.0        0.0           0.0      0.0   
3  0.0  0.0  0.0    0.0    0.0     0.0        0.0           0.0      0.0   
4  0.0  0.0  0.0    0.0    0.0     0.0        0.0      

Bag of Words Function

In [3]:
# Load the JSON data (one JSON object per line).
# The file is read from the `input_file` path configured in the file-paths
# cell and decoded explicitly as UTF-8, matching load_dataset; without an
# explicit encoding the platform default would be used. Blank lines are
# skipped because json.loads rejects them.
with open(input_file, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Extract the relevant fields: the review text and its star rating
reviews = [{'text': entry['text'], 'stars': entry['stars']} for entry in data]

# Preprocess the text
# The stemmer and the helper below use Bag-of-Words-specific names so they do
# not overwrite the `stemmer` object and the `preprocess_text` function that
# the TF-IDF pipeline relies on (that function returns a dict, this one
# returns a string).
bow_stemmer = SnowballStemmer("english")  # Faster stemmer

def preprocess_text_bow(text):
    """Lowercase, strip punctuation, tokenize and stem a review.

    Args:
        text: Raw review text (a string).

    Returns:
        The stemmed tokens joined into a single space-separated string.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation (every character listed in string.punctuation)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and stem
    tokens = word_tokenize(text)
    stemmed_tokens = [bow_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# Parallelize text preprocessing (n_jobs=-1 requests all available cores)
print("Preprocessing text...")
texts = Parallel(n_jobs=-1)(delayed(preprocess_text_bow)(review['text']) for review in reviews)

# Label the stars
def label_stars(stars):
    """Map a star rating to a sentiment class id.

    Returns 1 (positive) for ratings from 4 to 5 inclusive, 0 (negative) for
    ratings from 1 to 2 inclusive, and 2 (neutral) for anything else.
    """
    if 4 <= stars <= 5:
        return 1  # Positive
    if 1 <= stars <= 2:
        return 0  # Negative
    return 2  # Neutral

print("Labeling stars...")
sentiment_labels = [label_stars(review['stars']) for review in reviews]

# Create the Bag-of-Words representation
print("Creating Bag-of-Words representation...")
vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 1), binary=True)  # Limit vocabulary size and use binary features
bow_matrix = vectorizer.fit_transform(texts)  # This is a sparse matrix

# Metadata columns that are stored next to the vocabulary columns
metadata_cols = ['text', 'sentiment label']

# A vocabulary term with the same name as a metadata column (the word "text"
# is a plausible token) would be silently overwritten when the metadata is
# added below, so such a term is stored under a "_token" suffix instead.
feature_names = [
    f"{term}_token" if term in metadata_cols else term
    for term in vectorizer.get_feature_names_out()
]

# Convert the BoW matrix to a DataFrame (sparse format)
bow_df = pd.DataFrame.sparse.from_spmatrix(bow_matrix, columns=feature_names)

# Add the 'text' and 'sentiment label' columns to the DataFrame
bow_df['text'] = texts  # Add the preprocessed text column
bow_df['sentiment label'] = sentiment_labels  # Add the sentiment label column

# Reorder columns to have 'text' and 'sentiment label' as the first two columns
cols = metadata_cols + [col for col in bow_df.columns if col not in metadata_cols]
bow_df = bow_df[cols]

# Save the result to a CSV file in the configured output directory.
# NOTE(review): the recorded run of this cell was interrupted after
# "Saving to CSV..." was printed; writing this many columns as CSV is slow.
print("Saving to CSV...")
bow_csv_path = os.path.join(output_dir, "bow_vectors.csv")
bow_df.to_csv(bow_csv_path, index=False)

print(f"Bag-of-Words representation saved to '{bow_csv_path}'")

Preprocessing text...
Labeling stars...
Creating Bag-of-Words representation...
Saving to CSV...


KeyboardInterrupt: 