In [None]:
%autosave 0

In [None]:
# Importing Libraries
import json
import csv
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the NLTK resources needed by the preprocessing step (stop words,
# tokenizer models, WordNet). Kept after the imports so the import list stays
# in one uninterrupted block.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases load 'punkt_tab' (not the pickled 'punkt'
# models) inside nltk.word_tokenize; downloading both keeps old and new
# installs working — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
# Read JSON File
# NOTE: csv_root is an absolute, machine-specific directory; point it at your
# own data folder before running.
csv_root = '''/Users/scl/Python/final/d1/'''
json_name = 'filter_all_t.json'
# Decode explicitly as UTF-8 so non-ASCII review text loads the same way on
# every platform instead of depending on the locale's default encoding.
with open(csv_root+json_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Container for the (review_text, rating) tuples extracted from the JSON data
text_rating_pairs = list()

In [None]:
# Extract a (review_text, rating) Tuple From Every Element of Every Group in the JSON Data
# The list is rebuilt from scratch instead of being appended to, so re-running
# this cell no longer duplicates every pair.
text_rating_pairs = [
    (element['review_text'], element['rating'])
    for value in data.values()
    for element in value
]

In [None]:
# Report how many pairs were extracted, then preview the first ten on the next line
print(len(text_rating_pairs), text_rating_pairs[:10], sep='\n')

In [None]:
# Create New CSV File to Store Text-Rating Pairs
csv_file = 'text_rating_pairs.csv'
# Write explicitly as UTF-8: pandas.read_csv decodes as UTF-8 by default, so
# relying on the platform's locale encoding here could garble or reject
# non-ASCII review text.
with open(csv_root+csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Header row, followed by one row per (text, rating) tuple
    writer.writerow(['text', 'rating'])
    writer.writerows(text_rating_pairs)

In [None]:
# Define Text Preprocessing Function
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def text_preprocessing(text):
    """Normalize a raw review string into a space-joined string of processed tokens.

    Steps, in order: strip URLs, strip punctuation, lowercase, drop English
    stop words, tokenize, Porter-stem, then WordNet-lemmatize.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Remove URLs first, while they are still intact (before punctuation is stripped)
    text = re.sub(r'http\S+', '', text)
    # Remove Punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Lowercase BEFORE filtering stop words: the stop-word list is lowercase, so
    # capitalized stop words such as "This" or "The" would otherwise slip through
    text = text.lower()
    # Remove Stop Words (set membership instead of scanning a list per word)
    stop_words = _english_stop_words()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Tokenize Text
    tokens = nltk.word_tokenize(text)
    # Apply Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    # Apply Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Return Processed Text
    return ' '.join(tokens)

In [None]:
# Smoke-test the preprocessing function on a sample review that ends with a URL
text = 'This is a great restaurant with delicious food and friendly service. Highly recommended! https://www.google.com'
sample_pp = text_preprocessing(text)
print(sample_pp)

In [None]:
# Define Preprocessing Function to Apply on DataFrame Rows
def preprocess(row):
    """Fill a row's 'text_pp' field with the preprocessed form of its 'text' field.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row that has a 'text' entry.

    Returns
    -------
    pandas.Series
        The same row object with 'text_pp' set.
    """
    raw_text = row['text']
    # An empty review is read back from CSV as NaN; str(NaN) would produce the
    # literal text 'nan', so treat missing values as an empty string instead.
    text = '' if pd.isna(raw_text) else str(raw_text)
    row['text_pp'] = text_preprocessing(text)
    return row

In [None]:
# Load the text-rating CSV into a DataFrame and display it
# (this cell only reads the file; no preprocessing happens here)
csv_df = pd.read_csv(csv_root+csv_file)
csv_df

In [None]:
# Build the preprocessed frame without mutating csv_df: assign() returns a copy
# that carries an empty 'text_pp' column, which preprocess() then fills row by row
pp_df = csv_df.assign(text_pp='').apply(preprocess, axis=1)
pp_df

In [None]:
# The dataset is split into training, test, and validation sets with an 8:1:1 ratio.
# First, 80% of the rows go to the training set and 20% to a hold-out set.
# The hold-out set is then halved into the test set and the validation set (10% each).
# Stratifying on 'rating' keeps each rating's proportion the same in every subset.
# A fixed random_state makes the split (and the CSV files written below)
# reproducible across kernel restarts.
train_df, test_df = train_test_split(pp_df, test_size=0.2, stratify=pp_df['rating'], random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, stratify=test_df['rating'], random_state=42)

# Save Training, Test, and Validation Sets as CSV Files
columns_to_save = ['text_pp', 'rating']
train_df[columns_to_save].to_csv(csv_root+'train.csv', index=False)
test_df[columns_to_save].to_csv(csv_root+'test.csv', index=False)
val_df[columns_to_save].to_csv(csv_root+'valid.csv', index=False)

In [None]:
# Reload the three splits from disk and preview the first three rows of each
train_csv, valid_csv, test_csv = 'train.csv', 'valid.csv', 'test.csv'
train_df, valid_df, test_df = (
    pd.read_csv(csv_root + split_csv)
    for split_csv in (train_csv, valid_csv, test_csv)
)
for split_df in (train_df, valid_df, test_df):
    print(split_df.head(3))

In [None]:
# Prepare the Dataset in the Required Format for Vectorization and Modeling
# Rows whose preprocessed text is empty are read back from CSV as NaN; fill them
# with '' first, otherwise astype(str) turns them into the spurious token 'nan'.
X_train = train_df["text_pp"].fillna('').astype(str)
X_test = test_df["text_pp"].fillna('').astype(str)
y_train = train_df["rating"].astype(int)
y_test = test_df["rating"].astype(int)

In [None]:
# Turn the text into TF-IDF vectors: cap the vocabulary at 100000 features,
# ignore terms found in fewer than 3 documents, and use 1-grams through 3-grams
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
    max_features=100000,
)
# Learn the vocabulary from the training text only, then reuse it for the test text
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Implement the Logistic Regression Model for Classification Prediction
# C=1.0 is the inverse of the regularization strength; the penalty is L1;
# class_weight='balanced' accounts for the uneven label distribution;
# max_iter=100 caps the number of solver iterations.
# random_state is fixed because the liblinear solver shuffles the data, so
# repeated runs now produce the same coefficients.
# Training Without Errors or Warnings Indicates Normal Convergence
# NOTE(review): newer scikit-learn releases may warn about using liblinear for
# multiclass targets — confirm against the installed version.
model = LogisticRegression(C=1.0, penalty="l1", class_weight='balanced', max_iter=100, solver='liblinear', random_state=42)
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

In [None]:
# Score the test-set predictions: build the confusion matrix, then derive
# accuracy as correct predictions (the matrix diagonal) over all predictions
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)
accuracy = cm.trace() / cm.sum()
print("Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
# Build the text classification report for the test-set predictions and print it
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Live test: push one hand-written review through the whole pipeline
new_text = "Great food and service. Highly recommended."

# Preprocess the text, vectorize it with the already-fitted TF-IDF vocabulary,
# and classify it, printing every intermediate result along the way
new_text_pp = text_preprocessing(new_text)
print(new_text_pp)
new_vector = vectorizer.transform([new_text_pp])
print(new_vector)
new_pred = model.predict(new_vector)
print(new_pred)

# Report the single predicted label
print("The predicted rating for the new text is:", new_pred[0])

In [None]:
# Analyze the relationship between the model's weights and the feature words.
# model.coef_ holds the learned weights as a matrix with one row per class
# (a single row when the problem is binary) and one column per TF-IDF feature
model.coef_

In [None]:
# The coefficient column at index 2 (i.e. the third feature) was observed to
# trend downward; look up which term it corresponds to.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old name only on releases
# that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names[2])

In [None]:
# Select the features whose weight strictly decreases from each row of
# model.coef_ to the next (every row-to-row difference is negative)
indices = np.flatnonzero((np.diff(model.coef_, axis=0) < 0).all(axis=0))

In [None]:
# Map the selected column indices back to their feature names and display them
selected_feature_names = np.asarray(feature_names)[indices]
selected_feature_names

In [None]:
# Word Cloud Analysis
train_csv = 'train.csv'
df = pd.read_csv(csv_root+train_csv)

# Select the reviews rated 1 or lower (negative) and those rated 4-5 (positive)
low_ratings = df[(df['rating'] <= 1)]
high_ratings = df[(df['rating'] >= 4) & (df['rating'] <= 5)]

# Merge the review text of each group into one string.
# Rows whose text_pp is empty are read back from CSV as NaN; drop them so that
# astype(str) does not inject the literal word 'nan' into the clouds.
low_text = ' '.join(low_ratings['text_pp'].dropna().astype(str))
high_text = ' '.join(high_ratings['text_pp'].dropna().astype(str))

# Generate Word Clouds
low_wc = WordCloud().generate(low_text)
high_wc = WordCloud().generate(high_text)

In [None]:
# Render the negative-review word cloud on its own axes with the axes hidden
fig, ax = plt.subplots()
ax.imshow(low_wc, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Render the positive-review word cloud on its own axes with the axes hidden
fig, ax = plt.subplots()
ax.imshow(high_wc, interpolation='bilinear')
ax.axis('off')
plt.show()