In [74]:
import os
import pickle
import numpy as np
import pandas as pd

In [75]:

def load_all_intermediate(folder='intermediate_results'):
    """
    Load all per-app_id intermediate results from disk into a single dictionary.

    Every regular file in ``folder`` whose name ends with ``.pkl`` is unpickled
    and stored under its file name with that trailing suffix removed. Files are
    visited in sorted name order so the resulting dict order is reproducible.

    Args:
        folder: str
            Directory that holds one ``<app_id>.pkl`` file per app.

    Returns:
        similarity_results: dict
            Keys are app_id (str, taken from the file name), values are the
            unpickled objects (downstream code expects dicts with 'reviews'
            and 'similarity_matrix'). Empty if ``folder`` is not a directory.
    """
    suffix = '.pkl'
    similarity_results = {}
    # isdir (not exists) so that a regular file with this name is reported as
    # a missing folder instead of crashing inside os.listdir.
    if not os.path.isdir(folder):
        print(f"No folder named '{folder}' found.")
        return similarity_results

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(suffix):
            continue
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            # e.g. a sub-directory that happens to be named '*.pkl'
            continue
        # Strip only the trailing suffix; str.replace would also remove any
        # '.pkl' that appears in the middle of the name.
        app_id = filename[:-len(suffix)]
        # SECURITY: pickle.load can execute arbitrary code. Only point this
        # at files produced by a trusted pipeline.
        with open(file_path, 'rb') as f:
            similarity_results[app_id] = pickle.load(f)
    return similarity_results

# Load every cached per-app result from disk and report how many were found.
similarity_results = load_all_intermediate(folder='intermediate_results')
print(f"Loaded {len(similarity_results)} app_ids from intermediate results.")

Loaded 210 app_ids from intermediate results.


In [76]:
# -------------------------------
# Filter reviews by threshold (any tag)
# -------------------------------
def filter_reviews_by_threshold(similarity_results, threshold=0.5):
    """
    Keep every review that has at least one similarity score above ``threshold``.

    Args:
        similarity_results: dict
            Maps app_id to a dict with keys 'reviews' (sequence of review
            texts) and 'similarity_matrix' (2-D array-like, one row per
            review; a NumPy array or nested lists both work).
        threshold: float
            A review is kept when any value in its row is strictly greater
            than this.

    Returns:
        list of (app_id, review) tuples, in the iteration order of
        ``similarity_results`` and of each app's reviews.
    """
    allowed_reviews = []
    for app_id, data in similarity_results.items():
        # asarray lets plain nested lists be compared element-wise as well.
        sim_matrix = np.asarray(data['similarity_matrix'])
        review_texts = data['reviews']

        # An app with no scores (e.g. an empty list, which becomes a 1-D
        # array) has nothing to keep and would otherwise fail on axis=1.
        if sim_matrix.size == 0:
            continue

        # Boolean mask: any tag above threshold
        mask = np.any(sim_matrix > threshold, axis=1)

        allowed_reviews.extend(
            (app_id, review) for review, keep in zip(review_texts, mask) if keep
        )

    return allowed_reviews


In [85]:
# Step 2: keep only reviews with some tag similarity above 0.4, then tabulate them
allowed_reviews = filter_reviews_by_threshold(similarity_results, threshold=0.4)
allowed_reviews_df = pd.DataFrame(data=allowed_reviews, columns=['app_id', 'review_text'])


In [86]:
# Compare the number of loaded apps with the number of apps that still have
# at least one review after filtering.
games = len(similarity_results)
print(games)
filterGames = allowed_reviews_df['app_id'].nunique(dropna=False)
print(filterGames)

210
210


In [87]:
# Print the (rows, columns) shape of the filtered frame, then show the frame
# itself as the cell's last expression so the notebook renders it.
print(allowed_reviews_df.shape)
allowed_reviews_df

(215733, 2)


Unnamed: 0,app_id,review_text
0,10,This will be more of a ''my experience with th...
1,10,• Do you like original games? • Do you like ga...
2,10,"Easy to learn, hard to master."
3,10,Still better than Call of Duty: Ghosts...
4,10,"Counter-Strike: Ok, after 9 years of unlimited..."
...,...,...
215728,80,Some pretty bad animations and models but it w...
215729,80,Pretty good game but...... Very hard to get ki...
215730,80,best graphics of the century
215731,80,My favorite game of all time! For some reason ...


In [88]:
# Persist the filtered reviews; index=False keeps the row index out of the file.
allowed_reviews_df.to_csv(path_or_buf='allowed_reviews.csv', index=False)