# Goodreads Review Sentiment Pipeline (Sample Rebuild)

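This notebook is a simplified reconstruction of a PySpark pipeline that was originally built and tested on GCP Dataproc. For demonstration purposes it simulates the key stages (sample data, feature engineering, modeling, and ROC evaluation) with pandas and scikit-learn on a small in-memory sample.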

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

## 📄 Sample Data

In [None]:
# Hand-written stand-in for the review table: five reviews, each with
# engagement counts, a rating column, and a binary target.
data = dict(
    review_text=[
        "Loved this book, amazing story and characters!",
        "Not great. Very slow and boring.",
        "Fantastic read, highly recommend it!",
        "Terrible book. Would not read again.",
        "An okay story, but not very engaging.",
    ],
    n_votes=[10, 2, 15, 1, 3],
    n_comments=[2, 0, 5, 1, 1],
    average_rating=[4.5, 2.3, 4.7, 1.8, 3.0],
    # Target label: 1 = liked (rating >= 4), 0 = disliked
    liked=[1, 0, 1, 0, 0],
)

# One column per key, one row per review; preview the frame as the cell output.
df = pd.DataFrame.from_dict(data)
df.head()

## 🧪 Feature Engineering

In [None]:
# Text vectorization: one sparse TF-IDF row per review, default settings.
# NOTE: the vectorizer is fit on every row, before the train/test split in the
# modeling cell, so the IDF weights see held-out text. Acceptable for this tiny
# demo; in a real pipeline fit the vectorizer on the training split only.
tfidf = TfidfVectorizer()
X_text = tfidf.fit_transform(df['review_text'])

# Numeric columns, appended to the right of the TF-IDF columns.
# NOTE(review): the sample data describes `liked` as "rating >= 4" -- confirm
# that `average_rating` is not just a restatement of the target.
X_numerical = df[['n_votes', 'n_comments', 'average_rating']].to_numpy()

# Combine all features. `hstack` comes from the notebook's import cell.
# CSR is requested explicitly so the matrix supports the row indexing done by
# the train/test split, instead of relying on whichever format SciPy picks.
X_all = hstack([X_text, X_numerical], format='csr')

# Sanity check: one feature row per review, text columns plus numeric columns.
assert X_all.shape == (len(df), X_text.shape[1] + X_numerical.shape[1])

# Target
y = df['liked']

## 🤖 Modeling

In [None]:
# Hold out 30% of the rows for evaluation.
# The split is stratified on the label: with so few rows, an unstratified
# split can put a single class in the test set, and ROC AUC is undefined
# when y_test contains only one class.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.3, random_state=42, stratify=y
)

# Fit a default logistic regression on the combined text + numeric features.
model = LogisticRegression()
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (liked = 1).
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc:.2f}")

## 📈 ROC Curve

In [None]:
# Points of the ROC curve for the held-out predictions; thresholds are unused.
fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)

# Draw on an explicit figure/axes pair rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid(True)
plt.show()