In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report
import pandas as pd

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.manifold import TSNE

from seaborn import heatmap
import seaborn as sns
from scipy.sparse import hstack

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jhsualv/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def compute_bow(documents, max_features):
    """
    Computes the bag-of-words matrix for the given documents.

    Parameters:
    - documents: List of text documents.
    - max_features: Maximum number of features (vocabulary size).

    Returns:
    - bow_matrix: Sparse matrix of shape (n_samples, n_features).
    """

    vectorizer = CountVectorizer(max_features=max_features)
    bow_matrix = vectorizer.fit_transform(documents)
    return bow_matrix

In [3]:
def compute_tfidf(documents, max_features):
    """
    Computes the tfidf matrix for the given documents.

    Parameters:
    - documents: List of text documents.
    - max_features: Maximum number of features to use.

    Returns:
    - tfidf_matrix: Sparse matrix of shape (n_samples, n_features).
    """
   
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()  # Extract feature names
    return tfidf_matrix, feature_names

In [4]:
def apply_lsa(tfidf_matrix, n_components):
    """
    Applies LSA (using TruncatedSVD) to the tfidf matrix.

    Parameters:
    - tfidf_matrix: Sparse matrix from tfidf vectorization.
    - n_components: Number of components to keep.

    Returns:
    - lsa_matrix: Dense matrix with reduced dimensions.
    """
 
    vectorizer = TruncatedSVD(n_components=n_components)
    lsa_matrix = vectorizer.fit_transform(tfidf_matrix)
    return lsa_matrix

In [5]:
def create_documents(df):
  documents = df['review'].tolist()
  return documents

In [6]:
def create_labels(df):
  labels = df['voted_up'].tolist()
  return labels

In [7]:
def compute_vader_sentiment_scores(documents):
    analyzer = SentimentIntensityAnalyzer()
    scores = []

    for doc in documents:
        sentiment = analyzer.polarity_scores(doc)
        scores.append(sentiment)  # This is a dict with 'neg', 'neu', 'pos', 'compound'

    return pd.DataFrame(scores)


In [8]:
max_features = 1000

In [9]:
df = pd.read_csv('cleaned_data.csv')
documents = create_documents(df)

#vader_scores_df = compute_vader_sentiment_scores(documents) #create the scores (and for now do literally nothing with them)
#df = pd.concat([df, vader_scores_df], axis=1)

labels = create_labels(df)

In [10]:
# version check on imports 
import transformers
import accelerate
print(transformers.__version__)
print(accelerate.__version__)

4.51.3
1.6.0


In [11]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch

# Prepare documents and labels into a huggingface dataset
df_for_bert = pd.DataFrame({'text': documents, 'label': labels})

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_for_bert['text'].tolist(), df_for_bert['label'].tolist(), 
    test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove raw text after tokenization
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

train_dataset.set_format("torch")
val_dataset.set_format("torch")

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions)
    }

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate
results = trainer.evaluate()
print(results)


Map:   0%|          | 0/253 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2574,,0.078125,0.0
2,0.0,,0.078125,0.0
3,0.0,,0.078125,0.0




{'eval_loss': nan, 'eval_accuracy': 0.078125, 'eval_f1': 0.0, 'eval_runtime': 3.2628, 'eval_samples_per_second': 19.615, 'eval_steps_per_second': 0.306, 'epoch': 3.0}


In [None]:
# Save the model
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

In [None]:

def compute_linear_regression(tfidf_matrix, labels, test_size=0.2, random_state=42, plot_confusion_matrix=False):
    """
    Trains a linear regression model to predict review votes and evaluates it.

    Parameters:
    - tfidf_matrix: Sparse matrix of features (from TF-IDF).
    - labels: Binary labels (0 = not voted up, 1 = voted up).
    - test_size: Proportion of the dataset to include in the test split.
    - random_state: Seed for reproducibility.
    - plot_confusion_matrix: Whether to plot the confusion matrix.

    Returns:
    - model: Trained Linear Regression model.
    - mse: Mean Squared Error on the test set.
    - accuracy: Accuracy on the test set after thresholding at 0.5.
    - confusion_mat: Confusion matrix.
    - class_report: Text summary of precision, recall, f1-score.
    """
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_matrix, labels, test_size=test_size, random_state=random_state
    )
    
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Compute Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    
    # Threshold predictions at 0.5 for classification
    y_pred_class = (y_pred >= 0.5).astype(int)
    
    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_class)
    
    # Compute Confusion Matrix
    confusion_mat = confusion_matrix(y_test, y_pred_class)
    
    # Compute Classification Report
    class_report = classification_report(y_test, y_pred_class)
    
    print("Mean Squared Error:", mse)
    print("Accuracy:", accuracy)
    print("\nConfusion Matrix:\n", confusion_mat)
    print("\nClassification Report:\n", class_report)
    
    # Optionally plot confusion matrix
    if plot_confusion_matrix:
        sns.heatmap(confusion_mat, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        plt.show()
    
    return model, mse, accuracy, confusion_mat, class_report


In [None]:
model, mse, acc, cm, report = compute_linear_regression(tfidf_matrix, labels, plot_confusion_matrix=True)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB



from xgboost import XGBClassifier



In [None]:
def evaluate_naive_bayes(X, y, test_size=0.2):
    """
    Splits the dataset, trains a Naive Bayes classifier (MultinomialNB),
    and evaluates performance.

    Parameters:
    - X: Feature matrix (may not allow LSA features because negative values).
    - y: Ground truth labels.
    - test_size: Fraction of data to use for testing.

    Returns:
    - accuracy: Accuracy score on the test set.
    - nb_model: The trained Naive Bayes model.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # 2. Instantiate Model
    nb_model = MultinomialNB()
    #nb_model = GaussianNB()        # uncomment for lsa tfidf

    # 3. Train
    nb_model.fit(X_train, y_train)

    # 4. Predict
    y_pred = nb_model.predict(X_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print("Naive Bayes Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, nb_model

In [None]:
# Run everything

y = create_labels(df)

bow_matrix = compute_bow(documents, max_features)
X_tfidf, feature_names = compute_tfidf(documents, max_features)
X_lsa = apply_lsa(X_tfidf, 100)
vader_scores = compute_vader_sentiment_scores(documents) #create the scores

X_tfidf = pd.DataFrame(X_tfidf.toarray())
bow_matrix = pd.DataFrame(bow_matrix.toarray())

vader_scores_shifted = vader_scores.copy()
for col in ['pos', 'neu', 'neg', 'compound']:
    vader_scores_shifted[col] += 1

# Make sparse
vader_scores_sparse = vader_scores_shifted.astype(pd.SparseDtype("float", fill_value=0))

#vader_scores = vader_scores.astype(pd.SparseDtype("float", fill_value=0))
X_tfidf_vader = pd.concat([X_tfidf, vader_scores_sparse], axis=1)
X_tfidf_vader.columns = X_tfidf_vader.columns.astype(str)
#X_tfidf_vader = hstack([X_tfidf, vader_scores.sparse.to_coo()])
bow_matrix_vader = pd.concat([bow_matrix, vader_scores_sparse], axis=1)
bow_matrix_vader.columns = bow_matrix_vader.columns.astype(str)
#bow_matrix_vader = hstack([X_tfidf, vader_scores.sparse.to_coo()])

#NB for tf-idf
print("TF-IDF NB:")
acc_nb, model_nb = evaluate_naive_bayes(X_tfidf, y)

print("TF-IDF NB with VADER:")
acc_nb, model_nb = evaluate_naive_bayes(X_tfidf_vader, y)

# NB for BoW
print("BoW NB: ")
acc_nb_bow, model_nb_bow = evaluate_naive_bayes(bow_matrix, y)

print("BoW NB with VADER: ")
acc_nb_bow, model_nb_bow = evaluate_naive_bayes(bow_matrix_vader, y)

In [None]:
from wordcloud import WordCloud
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Get the log probabilities of features for each class
# Assuming model_nb is a trained MultinomialNB or similar Naive Bayes model
class_log_probs = model_nb.feature_log_prob_

# Focus on one class (e.g., class 1)
class_index = 1  # Change to 0 or 1 depending on the class you want to visualize
feature_importances = np.exp(class_log_probs[class_index])  # Convert log probabilities to probabilities

# Create a dictionary mapping words to their importance
word_importance = {feature_names[i]: feature_importances[i] for i in range(len(feature_names))}

# Filter out stop words
filtered_word_importance = {
    word: importance
    for word, importance in word_importance.items()
    if word not in ENGLISH_STOP_WORDS
}

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(filtered_word_importance)


# title based on class index

title = "Word Cloud of Important Features for Positive Reviews" if class_index == 1 else "Word Cloud of Important Features for Negative Reviews"


# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(title)
plt.show()

In [None]:


# Convert the adjective importance dictionary to a DataFrame
adjective_df = pd.DataFrame(list(filtered_word_importance.items()), columns=['Word', 'Importance'])

# Sort by importance and select the top N features
top_n = 20
top_adjectives = adjective_df.sort_values(by='Importance', ascending=False).head(top_n)

# Plot the bar chart
plt.figure(figsize=(12, 6))
plt.barh(top_adjectives['Word'], top_adjectives['Importance'], color='skyblue')
plt.gca().invert_yaxis()  # Invert y-axis to show the highest importance at the top
plt.xlabel('Importance')
plt.ylabel('Words')
plt.title(f'Top {top_n} Features')
plt.show()

In [None]:

from nltk import pos_tag
from nltk.corpus import wordnet

# Download NLTK resources if not already downloaded
nltk.download('all')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Get the log probabilities of features for each class
# Assuming model_nb is a trained MultinomialNB or similar Naive Bayes model
class_log_probs = model_nb.feature_log_prob_

# Focus on one class (e.g., class 1)

feature_importances = np.exp(class_log_probs[class_index])  # Convert log probabilities to probabilities

# Create a dictionary mapping words to their importance
word_importance = {feature_names[i]: feature_importances[i] for i in range(len(feature_names))}

# Filter out stop words
filtered_word_importance = {
    word: importance
    for word, importance in word_importance.items()
    if word not in ENGLISH_STOP_WORDS
}

# Filter to include only adjectives
def is_adjective(word):
    pos = pos_tag([word])[0][1]  # Get the part of speech tag
    return pos.startswith('JJ')  # Adjectives in NLTK are tagged as 'JJ', 'JJR', or 'JJS'

adjective_word_importance = {
    word: importance
    for word, importance in filtered_word_importance.items()
    if is_adjective(word)
}

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(adjective_word_importance)


# title based on class index
title = "Word Cloud of Important Features for Positive Reviews" if class_index == 1 else "Word Cloud of Important Features for Negative Reviews"


# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(title)
plt.show()

In [None]:


# Convert the adjective importance dictionary to a DataFrame
adjective_df = pd.DataFrame(list(adjective_word_importance.items()), columns=['Word', 'Importance'])

# Sort by importance and select the top N features
top_n = 20
top_adjectives = adjective_df.sort_values(by='Importance', ascending=False).head(top_n)

# Plot the bar chart
plt.figure(figsize=(12, 6))
plt.barh(top_adjectives['Word'], top_adjectives['Importance'], color='skyblue')
plt.gca().invert_yaxis()  # Invert y-axis to show the highest importance at the top
plt.xlabel('Importance')
plt.ylabel('Adjectives')
plt.title(f'Top {top_n} Adjectives')
plt.show()

In [None]:
# Load the CSV file
file_path = "cleaned_data.csv"  # Update this with the correct path if needed
df = pd.read_csv(file_path)

# Calculate the length of each review
df['review_length'] = df['review'].astype(str).apply(len)

# Group by 'voted_up' and calculate the average review length
avg_review_length = df.groupby('voted_up')['review_length'].median()

# Plot the bar chart
plt.figure(figsize=(8, 6))
avg_review_length.plot(kind='bar', color=['skyblue', 'orange'], edgecolor='black')

# Update the x-axis labels
plt.title('Median Review Length by Vote Status')
plt.xlabel('Vote Status')
plt.ylabel('Average Review Length')
plt.xticks(ticks=[0, 1], labels=['Voted Down', 'Voted Up'], rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:


# Calculate the length of each review
df['review_length'] = df['review'].astype(str).apply(len)

# Create a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(
    df[df['voted_up'] == 0]['review_length'],  # Review lengths for 'Not Voted Up'
    [0] * len(df[df['voted_up'] == 0]),       # Corresponding y-values (voted_up = 0)
    color='#FF9999', alpha=0.6, label='Voted Down'
)
plt.scatter(
    df[df['voted_up'] == 1]['review_length'],  # Review lengths for 'Voted Up'
    [1] * len(df[df['voted_up'] == 1]),       # Corresponding y-values (voted_up = 1)
    color='#66B3FF', alpha=0.6, label='Voted Up'
)

# Add titles and labels
plt.title('Scatter Plot of Review Length vs. Vote Status', fontsize=14, fontweight='bold')
plt.xlabel('Review Length', fontsize=12)
plt.ylabel('Voted Up Status', fontsize=12)
plt.yticks([0, 1], ['Voted Down', 'Voted Up'])  # Custom y-axis labels
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.legend()
plt.tight_layout()

# Show the plot
plt.show()

# t-SNE Visualizations (2D, 3D)

In [None]:
# Reduce to 2D with t-SNE
tsne = TSNE(n_components=2, random_state=130, perplexity=30)
X_tsne = tsne.fit_transform(lsa_matrix)

# Plot the results
plt.figure(figsize=(10,7))

# Create the scatter plot
scatter2D = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels, cmap='coolwarm', alpha=0.7, s=10)

plt.colorbar(scatter2D, label='Recommended (1) / Not Recommended (0)')
plt.title('t-SNE Visualization of Steam Review Embeddings')
plt.grid(True)
plt.show()

In [None]:

# t-SNE with 3 components, fit using LSA-reduced matrix
tsne_3d = TSNE(n_components=3, random_state=130, perplexity=30)
X_tsne_3d = tsne_3d.fit_transform(lsa_matrix)

# Plot the results
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Create the scatter plot
scatter3D = ax.scatter(X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2], c=labels, cmap='coolwarm', alpha=0.7, s=15)

ax.set_title("3D t-SNE Visualization of Steam Reviews")
fig.colorbar(scatter3D, label='Recommended (1) / Not Recommended (0)')

plt.show()


# Top-N Feature Plot

In [None]:
# Get the log probabilities of features
class_log_probs = model_nb.feature_log_prob_

# Get the difference between class 1 and class 0 (Recommended vs Not Recommended)
log_prob_diff = class_log_probs[1] - class_log_probs[0]  # Higher = more likely to be in recommended reviews

# Map words to importance difference and filter out common/junk words
word_importance = {feature_names[i]: log_prob_diff[i] for i in range(len(feature_names))}

custom_stopwords = ENGLISH_STOP_WORDS.union({'game', 'games', 'player', 'players'})
filtered_word_importance = {word: importance for word, importance in word_importance.items() if word.lower() not in custom_stopwords and word.isalpha() and len(word) > 2}

# Create DataFrame of word importances
importance_df = pd.DataFrame(list(filtered_word_importance.items()), columns=['Word', 'Importance'])

# Get the Top N positive (recommended) and Top N negative (not recommended) words
top_n = 20
top_positive = importance_df.sort_values(by='Importance', ascending=False).head(top_n)
top_negative = importance_df.sort_values(by='Importance', ascending=True).head(top_n)

# Plot
plt.figure(figsize=(12, 6))
plt.barh(top_positive['Word'], top_positive['Importance'], color='green', label='Recommended')
plt.barh(top_negative['Word'], top_negative['Importance'], color='red', label='Not Recommended')
plt.xlabel('Log-Probability Difference')
plt.title('Top Informative Words for Sentiment')
plt.legend()
plt.gca().invert_yaxis()
plt.show()

# BoW Top-N Feature Plot

In [None]:
# Get feature names from BoW vectorizer
bow_vectorizer = CountVectorizer(max_features=max_features)
bow_vectorizer.fit(documents)
bow_feature_names = bow_vectorizer.get_feature_names_out()

# Get feature importance (difference between class log probs)
class_log_probs_bow = model_nb_bow.feature_log_prob_
log_prob_diff_bow = class_log_probs_bow[1] - class_log_probs_bow[0]

# Build dataframe
word_importance_bow = {bow_feature_names[i]: log_prob_diff_bow[i] for i in range(len(bow_feature_names))}

# Filter out common stopwords
filtered_word_importance_bow = {word: importance for word, importance in word_importance_bow.items() if word.lower() not in custom_stopwords and word.isalpha() and len(word) > 2}

importance_df_bow = pd.DataFrame(
    list(filtered_word_importance_bow.items()), columns=['Word', 'Importance']
)

# Plot Top-N Features
top_n = 20
top_positive_bow = importance_df_bow.sort_values(by='Importance', ascending=False).head(top_n)
top_negative_bow = importance_df_bow.sort_values(by='Importance', ascending=True).head(top_n)

plt.figure(figsize=(10, 8))
plt.barh(top_negative_bow['Word'], top_negative_bow['Importance'], color='red', label='Not Recommended')
plt.barh(top_positive_bow['Word'], top_positive_bow['Importance'], color='green', label='Recommended')
plt.axvline(0, color='black')
plt.xlabel('Log-Probability Difference')
plt.title(f'Top {top_n} Most Informative Words (BoW)')
plt.legend()
plt.tight_layout()
plt.show()

# BoW WordCloud

In [None]:
# Get log probs of the features for each class
class_log_probs_bow = model_nb_bow.feature_log_prob_

# Focus on a single class
class_index = 1  # 1 = recommended, 0 = not recommended
feature_importances = np.exp(class_log_probs[class_index])  # convert log-probs to probs

# Create word importance dictionary
word_importance = {bow_feature_names[i]: feature_importances[i] for i in range(len(bow_feature_names))}

# Filter out common stopwords + junk
filtered_word_importance = {word: importance for word, importance in word_importance.items() if word.lower() not in custom_stopwords and word.isalpha() and len(word) > 2}

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(filtered_word_importance)

# Plot
title = "Word Cloud of Important Features for Positive Reviews (BoW)" if class_index == 1 else "Word Cloud of Important Features for Negative Reviews"

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(title)
plt.show()
