In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import random as r
import matplotlib.pyplot as plt
from collections import Counter
import tqdm
from sklearn.base import clone
import ast

from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -----------------------------
# 1. Load the datasets
# -----------------------------
print("Loading datasets...")
# Review-level data; the preview below shows the columns
# app_id, app_name, review_text, review_score, review_votes.
reviews_df = pd.read_csv("./archive/dataset.csv")
# Empty reviews are parsed as NaN by read_csv. Blank them before the cast so
# they do not become the literal text "nan" and get treated as a real review.
reviews_df['review_text'] = reviews_df['review_text'].fillna("").astype(str)
print(reviews_df.head(3))

# Per-app tags; the `tags` column holds a stringified Python list of tag names.
tags_df = pd.read_csv("comp/tagGamesNoCompoundDF.csv")
tagCountsDF = pd.read_csv("comp/tagCountsDF.csv")
tags_df.head(10)


Loading datasets...
   app_id        app_name                                        review_text  \
0      10  Counter-Strike                                    Ruined my life.   
1      10  Counter-Strike  This will be more of a ''my experience with th...   
2      10  Counter-Strike                      This game saved my virginity.   

   review_score  review_votes  
0             1             0  
1             1             1  
2             1             0  


Unnamed: 0,app_id,tags
0,10,"['action', 'first-person', 'shooter', 'multipl..."
1,1002,"['2d', 'fighting', 'martial arts', 'intentiona..."
2,100400,"['animation & modeling', 'software']"
3,10090,"['zombies', 'world war ii', 'first-person', 's..."
4,100980,"['animation & modeling', 'utilities', 'design ..."
5,10100,"['adventure', 'point & click', 'classic', 'fan..."
6,10110,"['adventure', 'point & click', 'classic', 'com..."
7,10120,"['action', 'casual', 'space', 'on-rails shoote..."
8,10130,"['action', 'first-person', 'shooter', 'time ma..."
9,10140,"['sports', 'mini golf', 'golf', 'local multipl..."


In [5]:
# -------------------------------
# Sentence-embedding model + embeddings of the tag column
# -------------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

# Every entry of tags_df['tags'] is encoded as one text, so tag_embeddings has
# one row per row of tags_df.
# NOTE(review): the column is read from CSV as the string form of a list
# (e.g. "['action', 'shooter', ...]"), so this embeds whole tag lists rather
# than individual tags -- confirm that is the intent.
tag_embeddings = model.encode(list(tags_df['tags']))

In [None]:

# -------------------------------
# Helper: encode texts chunk by chunk
# -------------------------------
def encode_in_batches(texts, batch_size=512, model=model):
    """Encode `texts` in consecutive slices and stack the results row-wise.

    Args:
        texts: Sliceable sequence of texts supporting `len()`.
        batch_size: Number of texts passed to each `model.encode` call.
        model: Object exposing `encode(batch, show_progress_bar=...)`; the
            default is the module-level `model` as bound when this function
            was defined.

    Returns:
        The per-slice outputs of `model.encode` combined with `np.vstack`.
    """
    slice_starts = range(0, len(texts), batch_size)
    encoded_slices = [
        model.encode(texts[lo:lo + batch_size], show_progress_bar=False)
        for lo in slice_starts
    ]
    return np.vstack(encoded_slices)

# -------------------------------
# Helper: save intermediate results
# -------------------------------
def save_intermediate(app_id, data, folder='intermediate_results'):
    """Pickle `data` to `<folder>/<app_id>.pkl`, creating `folder` if needed.

    The payload is first written to a sibling `.tmp` file and then moved into
    place with `os.replace`. An interrupted or failed dump therefore never
    leaves a truncated `<app_id>.pkl` behind (which resume logic based on file
    existence would mistake for a finished result) and never destroys a
    previously saved good file.

    Args:
        app_id: Identifier used as the file stem.
        data: Any picklable object.
        folder: Directory that holds the per-app pickle files.

    Raises:
        Whatever `pickle.dump` or the filesystem calls raise; the temporary
        file is removed before the exception propagates.
    """
    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, f'{app_id}.pkl')
    tmp_path = file_path + '.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        # Same-directory rename: the final name only ever points at a
        # completely written file.
        os.replace(tmp_path, file_path)
    finally:
        # After a successful replace the temp file no longer exists, so this
        # only cleans up after a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# -------------------------------
# Helper: read a cached result back, if there is one
# -------------------------------
def load_intermediate(app_id, folder='intermediate_results'):
    """Return the object pickled at `<folder>/<app_id>.pkl`, or None if absent.

    Args:
        app_id: Identifier used as the file stem.
        folder: Directory that holds the per-app pickle files.

    Returns:
        The unpickled object, or None when the file does not exist.
    """
    cache_file = os.path.join(folder, f'{app_id}.pkl')
    if not os.path.exists(cache_file):
        return None
    # NOTE: unpickling can execute arbitrary code; only point this at cache
    # files produced by this notebook.
    with open(cache_file, 'rb') as handle:
        return pickle.load(handle)

def alDone(reviews_df, folder='intermediate_results'):
    """Return the set of app_ids that already have a cached result on disk.

    An app counts as done when `<folder>/<app_id>.pkl` exists.

    Args:
        reviews_df: DataFrame with an `app_id` column.
        folder: Directory holding the per-app pickle files. Defaults to the
            same location `save_intermediate` / `load_intermediate` use.

    Returns:
        Set of the distinct `app_id` values whose cache file exists.
    """
    # Nothing can be cached if the directory is missing; skip the per-id checks.
    if not os.path.isdir(folder):
        return set()
    return {
        app_id
        for app_id in np.unique(reviews_df['app_id'])
        if os.path.exists(os.path.join(folder, f'{app_id}.pkl'))
    }

# -------------------------------
# Compute similarities per app_id, reusing cached results
# -------------------------------
def compute_similarity_per_app(reviews_df, tag_embeddings):
    """Build a review-vs-tag cosine-similarity matrix for every app_id.

    For each group of reviews sharing an `app_id`, the review texts are
    encoded with the module-level `model` and compared to `tag_embeddings`.
    Each result is written to disk via `save_intermediate` so an interrupted
    run can resume.

    Apps whose cache file already exists are loaded from disk instead of being
    recomputed, so the returned dict covers every app_id even after a resumed
    run. (Previously such apps were skipped entirely and were missing from the
    return value, which made downstream filtering silently incomplete.)

    Args:
        reviews_df: DataFrame with `app_id` and `review_text` columns.
        tag_embeddings: 2-D array of tag embeddings (rows are compared against
            each review embedding).

    Returns:
        Dict mapping app_id -> {'reviews': list of texts,
        'similarity_matrix': array of shape (n_reviews, n_tag_rows)}.
    """
    results = {}  # similarities per app_id
    done = alDone(reviews_df)

    for app_id, group in tqdm.tqdm(reviews_df.groupby('app_id'), "processing"):
        if app_id in done:
            cached = load_intermediate(app_id)
            if cached is not None:
                results[app_id] = cached
                continue
            # Cache file vanished between the scan and now: fall through and
            # recompute.

        review_texts = group['review_text'].tolist()
        review_embeddings = model.encode(review_texts, show_progress_bar=False)

        # Cosine similarity (reviews x tags); the full matrix is kept so the
        # threshold can be chosen later without re-encoding.
        sim_matrix = cosine_similarity(review_embeddings, tag_embeddings)

        results[app_id] = {
            'reviews': review_texts,
            'similarity_matrix': sim_matrix
        }

        # Persist immediately so a crash loses at most the current app.
        save_intermediate(app_id, results[app_id])

    return results

# -------------------------------
# Keep reviews that are close enough to at least one tag
# -------------------------------
def filter_reviews_by_threshold(similarity_results, threshold=0.5):
    """Select reviews whose similarity to any tag row exceeds `threshold`.

    Args:
        similarity_results: Dict mapping app_id -> dict with keys 'reviews'
            (list of texts) and 'similarity_matrix' (2-D array, one row per
            review).
        threshold: A review is kept when at least one value in its row is
            strictly greater than this.

    Returns:
        List of `(app_id, review_text)` tuples, in dict order and then review
        order.
    """
    kept = []
    for game_id, entry in similarity_results.items():
        # One boolean per review: True when any column beats the threshold.
        passes = np.any(entry['similarity_matrix'] > threshold, axis=1)
        kept.extend(
            (game_id, text)
            for text, ok in zip(entry['reviews'], passes)
            if ok
        )
    return kept

# -------------------------------
# Run the similarity pipeline
# -------------------------------
# Stage 1: per-app similarity matrices (results are also written to disk).
similarity_results = compute_similarity_per_app(
    reviews_df=reviews_df,
    tag_embeddings=tag_embeddings,
)

# Stage 2: apply the threshold afterwards and tabulate the surviving reviews.
allowed_reviews = filter_reviews_by_threshold(similarity_results, 0.5)
allowed_reviews_df = pd.DataFrame(
    data=allowed_reviews,
    columns=['app_id', 'review_text'],
)


  2%|▏         | 164/9972 [06:08<31:11:36, 11.45s/it]

In [None]:
def count_removed_reviews(original_df, allowed_df):
    """Summarise, per app_id, how many reviews were kept and how many dropped.

    Args:
        original_df: DataFrame of all reviews; must have an `app_id` column.
        allowed_df: DataFrame of the reviews that were kept; must have an
            `app_id` column.

    Returns:
        DataFrame with columns `app_id`, `total_reviews`, `allowed_reviews`
        and `removed_reviews`. The three count columns are always int64.
    """
    # Reviews per app before and after filtering.
    total_counts = original_df.groupby('app_id').size().rename('total_reviews')
    allowed_counts = allowed_df.groupby('app_id').size().rename('allowed_reviews')

    # The outer alignment introduces NaN for apps missing on one side, which
    # would silently turn the counts into floats; fill and cast back so the
    # dtype does not depend on the data.
    comparison = (
        pd.concat([total_counts, allowed_counts], axis=1)
        .fillna(0)
        .astype('int64')
    )

    comparison['removed_reviews'] = comparison['total_reviews'] - comparison['allowed_reviews']

    return comparison.reset_index()

# Example usage: pass the DataFrame of kept reviews. The raw `allowed_reviews`
# list of tuples has no `.groupby` and would raise AttributeError.
removed_summary = count_removed_reviews(reviews_df, allowed_reviews_df)
print(removed_summary)


In [None]:
# -----------------------------
# 2. Aggregate reviews per game (only games with tags available)
# -----------------------------

def aggregate_reviews_per_game(reviews_df: pd.DataFrame, tags_df: pd.DataFrame) -> pd.DataFrame:
    """Concatenate each game's reviews into one text and attach its tag list.

    Steps:
      1. Keep reviews whose `app_id` appears in `tags_df`.
      2. Drop reviews with fewer than 3 whitespace-separated tokens.
      3. Join the remaining reviews per `app_id` with single spaces and keep
         the first `app_name` seen.
      4. Inner-join with `tags_df` on `app_id` and parse the `tags` strings
         into Python lists.

    Neither input DataFrame is modified.

    Args:
        reviews_df: Columns `app_id`, `app_name`, `review_text`.
        tags_df: Columns `app_id` and `tags`, where `tags` is a list or the
            string form of a Python list.

    Returns:
        One row per game with `app_id`, `review_text`, `app_name`, `tags`
        (a list) and any other columns present in `tags_df`.
    """
    print("Aggregating reviews per game...")

    # Normalise the join key to int on local copies only, so the caller's
    # frames keep their original dtypes.
    review_ids = reviews_df["app_id"].astype(int)
    tags_local = tags_df.assign(app_id=tags_df["app_id"].astype(int))

    # Reviews of games that have tags; only the needed columns are copied.
    has_tags = review_ids.isin(tags_local["app_id"])
    reviews_with_tags = pd.DataFrame({
        'app_id': review_ids[has_tags],
        'app_name': reviews_df.loc[has_tags, 'app_name'],
        'review_text': reviews_df.loc[has_tags, 'review_text'].astype(str),
    })

    # Drop very short reviews (fewer than 3 non-whitespace tokens).
    reviews_with_tags = reviews_with_tags.loc[
        reviews_with_tags['review_text'].str.count(r'\S+') >= 3
    ]
    print(f"Filtered reviews shape: {reviews_with_tags.shape}")

    # One row per game: all review texts joined, first app_name kept.
    game_reviews = (
        reviews_with_tags
        .groupby("app_id", as_index=False)
        .agg({
            'review_text': lambda texts: " ".join(texts),
            'app_name': 'first'
        })
    )
    print(f"Aggregated reviews shape: {game_reviews.shape}")

    # Attach tags; parse only string entries so already-parsed lists pass
    # through instead of making literal_eval raise.
    game_reviews = game_reviews.merge(tags_local, on="app_id", how='inner')
    game_reviews['tags'] = game_reviews['tags'].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

    print(f"Merged game_reviews shape: {game_reviews.shape}")
    print("Sample of final aggregated data:")
    print(game_reviews.head(3))

    return game_reviews

# Build the per-game table: one aggregated review text plus the tag list per app.
game_reviews = aggregate_reviews_per_game(
    reviews_df=reviews_df,
    tags_df=tags_df,
)