In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Input files: gzip-compressed JSON-lines dumps.
dataVer10 = "review-Vermont-10.json.gz"  # defined but not read in this part of the notebook
dataVer = "review-Vermont.json.gz"
dataMeta = "meta-Vermont.json.gz"

# Both files are read with identical options: metadata first, then reviews.
df_meta, df_ver = (
    pd.read_json(path, lines=True, compression="gzip")
    for path in (dataMeta, dataVer)
)

In [None]:
# Merge reviews with business metadata into one review-level dataframe.
# The metadata is de-duplicated on gmap_id first: a left merge emits one output
# row per matching metadata row, so a repeated gmap_id in the metadata would
# silently multiply review rows. validate= makes pandas raise if the metadata
# key is still not unique.
meta_unique = df_meta.drop_duplicates(subset="gmap_id")
df = df_ver.merge(meta_unique, on="gmap_id", how="left", validate="many_to_one")

In [None]:
# Preview the first five rows of the merged dataframe.
df.head(n=5)

In [None]:
# A user may appear more than once for the same place; keep only the first
# row for each (user_id, gmap_id) pair.
review_key = ['user_id', 'gmap_id']
print("Before:", len(df))
is_repeat = df.duplicated(subset=review_key, keep='first')
df = df[~is_repeat]
print("After:", len(df))

In [None]:
# Discard rows whose rating is missing.
print("Before:", len(df))
has_rating = df['rating'].notna()
df = df.loc[has_rating]
print("After:", len(df))

In [None]:
# Discard rows that have no review text.
print("Before:", len(df))
df = df[df["text"].notnull()]
print("After:", len(df))

In [None]:
# Recompute the per-business statistics from the rows that survived cleaning:
#   num_of_reviews -> count of non-null ratings per gmap_id
#   avg_rating     -> mean rating per gmap_id
rating_stats = df.groupby('gmap_id')['rating'].agg(
    num_of_reviews='count',
    avg_rating='mean',
)
num_reviews = rating_stats['num_of_reviews']
avg_rating = rating_stats['avg_rating']

# Remove any existing copies of the two columns, then attach the recomputed
# values to every row of the matching business.
df = (
    df.drop(columns=['num_of_reviews', 'avg_rating'], errors='ignore')
      .merge(rating_stats, on='gmap_id', how='left')
)

In [None]:
# Encode the price string as its length: 0 = none, 1 = $, 2 = $$, 3 = $$$, 4 = $$$$.
# .str.len() leaves missing prices as NaN, so they are filled with 0 to match
# the encoding above. The numeric-dtype check lets this cell be re-run: once
# the column has been encoded, .str would raise on it.
if pd.api.types.is_numeric_dtype(df["price"]):
    price_level = df["price"]
else:
    price_level = df["price"].str.len()
df["price"] = price_level.fillna(0).astype("int64")

In [None]:
from sklearn.feature_extraction import FeatureHasher

# Hash each row's category list into a fixed number of numeric columns
# (cat_0 ... cat_{n-1}) with scikit-learn's FeatureHasher.
n_cat_features = 20

# Anything that is not a list (e.g. NaN for a missing category) becomes an
# empty list, so the hasher receives an iterable for every row.
df['category'] = df['category'].apply(lambda x: x if isinstance(x, list) else [])

# Create the hasher
hasher = FeatureHasher(n_features=n_cat_features, input_type='string')

# Transform the column (sparse matrix), then convert to a dense array
hashed_features = hasher.transform(df['category'])
hashed_array = hashed_features.toarray()

# Make a DataFrame with column names
hashed_cols = [f"cat_{i}" for i in range(n_cat_features)]
hashed_df = pd.DataFrame(hashed_array, columns=hashed_cols)

# Merge back with the dataframe. cat_* columns left by an earlier run of this
# cell are dropped first, so re-running replaces them instead of appending a
# second set with duplicate names. The index is reset so rows line up
# positionally with hashed_df.
df = pd.concat(
    [df.drop(columns=hashed_cols, errors='ignore').reset_index(drop=True), hashed_df],
    axis=1,
)

In [None]:
# Columns for the correlation analysis: the naturally numeric features plus
# the 20 hashed category columns.
numerical_cols = [
    "rating", "avg_rating", "price", "num_of_reviews",
    "longitude", "latitude", "time",
]
cathash_cols = ["cat_" + str(i) for i in range(20)]
all_cols = [*numerical_cols, *cathash_cols]

# Pairwise correlation matrix of the selected columns.
corr = df.loc[:, all_cols].corr()

# Annotated heatmap; the large figure keeps the per-cell labels readable.
fig, ax = plt.subplots(figsize=(25, 15))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of individual review ratings; each bar is labelled with its share
# of all rows in df.
fig, ax = plt.subplots(figsize=(8, 5))

# discrete=True draws one unit-wide bar centred on each rating value. With
# bins=5 the bin edges fall at fractional positions, so bars do not line up
# with the rating ticks, and bins can merge values if a rating level is absent.
# Assumes ratings are whole numbers -- TODO confirm.
sns.histplot(data=df, x="rating", discrete=True, kde=False, ax=ax)

# Title and labels
ax.set_title("Distribution of Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")

total = len(df)

# Write the percentage of rows above each bar.
for p in ax.patches:
    height = p.get_height()
    percent = 100 * height / total
    x = p.get_x() + p.get_width() / 2
    y = height

    ax.annotate(f"{percent:.1f}%", (x, y), ha='center', va='bottom', fontsize=10)

plt.show()


In [None]:
# Histogram of average ratings per business.
# df has one row per review, so plotting df["avg_rating"] directly would count
# each business once per review it received. Keep one row per gmap_id so every
# business contributes exactly once, as the title states.
business_avg = df.drop_duplicates(subset="gmap_id")

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=business_avg, x="avg_rating", bins=5, kde=False, ax=ax)

ax.set_title("Distribution of average Business Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")

# Percentages are relative to the number of businesses, not reviews.
total = len(business_avg)

# Write the percentage of businesses above each bar.
for p in ax.patches:
    height = p.get_height()
    percent = 100 * height / total
    x = p.get_x() + p.get_width() / 2
    y = height

    ax.annotate(f"{percent:.1f}%", (x, y), ha='center', va='bottom', fontsize=10)

plt.show()

In [None]:
# Number of distinct non-null values in the `state` column.
len(df["state"].dropna().unique())

In [None]:
# Missing-value count for every column.
df.isnull().sum(axis="index")

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import Ridge
# from sklearn.metrics import accuracy_score, mean_squared_error
# from sklearn.preprocessing import StandardScaler
# from textblob import TextBlob
# import numpy as np

# #feature engineering

# df["len_chars"] = df["text"].str.len()
# df["len_words"] = df["text"].str.split().str.len()

# def blob_features(text):
#     b = TextBlob(text)
#     return pd.Series([b.sentiment.polarity, b.sentiment.subjectivity])

# df[["polarity", "subjectivity"]] = df["text"].apply(blob_features)


# # Feature + label
# X_text = df['text']
# # other features 
# X_extra = df[["time", "price", "num_of_reviews", "longitude", 
#               "latitude", "subjectivity", "polarity"] + cathash_cols]
# y = df['rating']

# # 3. Train/test split (stratified on the rating label)
# X_train_text, X_test_text, X_train_extra, X_test_extra, y_train, y_test = train_test_split(
#     X_text, X_extra, y,
#     test_size=0.2,
#     random_state=42,
#     stratify=y
# )

# # 4. TF-IDF (baseline)
# tfidf = TfidfVectorizer(
#     max_features=50000,
#     ngram_range=(1, 2),
#     min_df=2,
#     max_df=0.95
# )

# X_train_tfidf = tfidf.fit_transform(X_train_text)
# X_test_tfidf = tfidf.transform(X_test_text)

# #predict text TF-IDF
# model_text = Ridge(alpha=1.0)
# model_text.fit(X_train_tfidf, y_train)
# y_pred_text = model_text.predict(X_test_tfidf)

# # numeric_cols = ["time", "num_of_reviews", "longitude", "latitude", "subjectivity", "polarity"] + cathash_cols
# scaler = StandardScaler()
# X_train_numeric_scaled = scaler.fit_transform(X_train_extra[cathash_cols])
# X_test_numeric_scaled = scaler.transform(X_test_extra[cathash_cols])

# #predict numerical
# model_numeric = Ridge(alpha=1.0)
# model_numeric.fit(X_train_numeric_scaled, y_train)
# y_pred_numeric = model_numeric.predict(X_test_numeric_scaled)

# y_pred_final = 0.7*y_pred_text + 0.3*y_pred_numeric
# y_pred_final = np.clip(np.round(y_pred_final), 1, 5)

# print("Accuracy:", accuracy_score(y_test, y_pred_final))
# mse = mean_squared_error(y_test, y_pred_final)
# print("MSE:", mse)
# rmse = np.sqrt(mse)
# print("RMSE:", rmse)


# # Accuracy: 0.647806087428671
# # MSE: 0.5231889848480191
# # RMSE: 0.7233180385197228

In [None]:
# useless = ["name_x", "name_y", "time", "pics", "resp", "address", "relative_results", "state", "url", "latitude", "longitude", "num_of_reviews"]
# maybe = ["description"]
# df = df.drop(columns=useless + maybe)

In [None]:
# df.to_json("clean_Vermont.json.gz", orient="records", lines=True, compression="gzip")