In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# Read the three raw CSV exports, one DataFrame per file, in the order
# recommendations -> games -> users.
recommendations_df, games_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/recommendations.csv',
        'data/games.csv',
        'data/users.csv',
    )
)

In [3]:
# The raw tables are too large to work with comfortably, so they are written
# out as smaller CSV chunks of `1000 * rows_thousand` rows each.
def split_df_save(df, prefix, rows_thousand=1000, first_only=True, output_dir='data/split'):
    """Write `df` to disk as consecutive CSV chunks.

    Each chunk holds up to ``1000 * rows_thousand`` rows and is saved, without
    the index, as ``{output_dir}/{prefix}_{k}.csv`` where ``k`` is the
    zero-based chunk number (0, 1, 2, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. An empty frame produces no files.
    prefix : str
        File-name prefix for every chunk.
    rows_thousand : int, default 1000
        Chunk size expressed in thousands of rows.
    first_only : bool, default True
        When True only chunk 0 is written (the notebook currently only needs
        the first chunk). Pass False to write every chunk.
    output_dir : str or path-like, default 'data/split'
        Directory the chunks are written to; created if it does not exist.

    Raises
    ------
    ValueError
        If ``rows_thousand`` is not positive.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from pathlib import Path

    if rows_thousand <= 0:
        raise ValueError(f'rows_thousand must be positive, got {rows_thousand!r}')

    chunk_rows = 1000 * rows_thousand
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Number chunks with enumerate: the previous `i//1000*rows_thousand` parsed
    # as `(i//1000)*rows_thousand` and gave wrong indices for every chunk after
    # the first.
    for chunk_index, start in enumerate(range(0, len(df), chunk_rows)):
        chunk = df.iloc[start:start + chunk_rows]
        chunk.to_csv(out_dir / f'{prefix}_{chunk_index}.csv', index=False)
        if first_only:
            break

In [4]:
# Row counts, one per line, in the order: games, users, recommendations.
print(len(games_df), len(users_df), len(recommendations_df), sep='\n')

50872
14306064
41154794


In [9]:
# Write a chunk of each table via split_df_save. Chunk sizes are given in
# thousands of rows: 10,000k for users and games, 9,000k for recommendations.
for _frame, _prefix, _thousands in (
    (users_df, 'users', 10000),
    (games_df, 'games', 10000),
    (recommendations_df, 'recommendations', 9000),
):
    split_df_save(_frame, _prefix, rows_thousand=_thousands)

In [10]:
# Analysis goal: find out whether the number of reviews a user has published
# correlates with the number of products that user has purchased.
#
# The string below is a column reference for the three CSV files. It is a bare
# string literal; as the last expression in the cell, its repr is echoed as the
# cell output.
"""
games.csv

| Feature        | Variable Type | Description                 | Example            | Category   |
| -------------- | ------------- | --------------------------- | ------------------ | ---------- |
| app_id         | int           | Unique identifier of a game | 113020             | Identifier |
| title          | string        | Title of a game             | Escape Dead Island | Identifier |
| date_release   | date          | Release date of a game      | 2014-11-21         | Continuous |
| win            | bool          | Windows OS support          | True               | Category   |
| mac            | bool          | Mac OS support              | False              | Category   |
| linux          | bool          | Linux OS support            | False              | Category   |
| rating         | float         | Average rating of a game    | 3.0                | Continuous |
| positive ratio | float         | Ratio of positive reviews   | 0.5                | Continuous |
| user_reviews   | int           | Number of user reviews      | 0                  | Continuous |


recommendations.csv

| feature        | Variable Type | Description                      | Example    | Category   |
| -------------- | ------------- | -------------------------------- | ---------- | ---------- |
| app_id         | int           | Unique identifier of a game      | 113020     | Identifier |
| helpful        | int           | Number of helpful reviews        | 0          | Continuous |
| funny          | int           | Number of funny reviews          | 0          | Continuous |
| date           | date          | Date of a review                 | 2014-11-21 | Continuous |
| is_recommended | bool          | Whether a user recommends a game | True       | Category   |
| hours          | float         | Number of hours played           | 0.0        | Continuous |
| user_id        | int           | Unique identifier of a user      | 5250       | Identifier |
| review_id      | int           | Unique identifier of a review    | 1          | Identifier |

users.csv

| Feature  | Variable Type | Description                 | Example | Category   |
| -------- | ------------- | --------------------------- | ------- | ---------- |
| user_id  | int           | Unique identifier of a user | 5250    | Identifier |
| products | int           | Count of purchased products | 1       | Continuous |
| reviews  | int           | Count of published reviews  | 0       | Continuous |
"""


'\ngames.csv\n\n| Feature        | Variable Type | Description                 | Example            | Category   |\n| -------------- | ------------- | --------------------------- | ------------------ | ---------- |\n| app_id         | int           | Unique identifier of a game | 113020             | Identifier |\n| title          | string        | Title of a game             | Escape Dead Island | Identifier |\n| date_release   | date          | Release date of a game      | 2014-11-21         | Continuous |\n| win            | bool          | Windows OS support          | True               | Category   |\n| mac            | bool          | Mac OS support              | False              | Category   |\n| linux          | bool          | Linux OS support            | False              | Category   |\n| rating         | float         | Average rating of a game    | 3.0                | Continuous |\n| positive ratio | float         | Ratio of positive reviews   | 0.5                

In [11]:

# Rebind the three frame names to the chunk-0 files under data/split/,
# replacing whatever these names referred to before.
recommendations_df, games_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/split/recommendations_0.csv',
        'data/split/games_0.csv',
        'data/split/users_0.csv',
    )
)

In [12]:
# Enforce referential integrity across the three tables: every kept
# recommendation must point at a kept user and a kept game, and every kept
# user/game must have at least one kept recommendation.

# Ids referenced by the recommendations before any filtering.
unique_users = recommendations_df['user_id'].unique()
unique_games = recommendations_df['app_id'].unique()

# Pass 1: drop users and games that never appear in the recommendations.
users_df = users_df[users_df['user_id'].isin(unique_users)]
games_df = games_df[games_df['app_id'].isin(unique_games)]

# Pass 2: drop recommendations whose user or game is missing from its table.
has_known_user = recommendations_df['user_id'].isin(users_df['user_id'].unique())
has_known_game = recommendations_df['app_id'].isin(games_df['app_id'].unique())
recommendations_df = recommendations_df[has_known_user & has_known_game]

# Pass 3: pass 2 can remove the only recommendations of some user or game
# (e.g. a user whose every review was for a game missing from games_df), which
# would leave orphans behind. Prune again against the filtered recommendations.
# After this the three tables are mutually consistent, so no further pass is
# needed.
users_df = users_df[users_df['user_id'].isin(recommendations_df['user_id'].unique())]
games_df = games_df[games_df['app_id'].isin(recommendations_df['app_id'].unique())]

# Ids that survived all passes.
available_users = users_df['user_id'].unique()
available_games = games_df['app_id'].unique()

print(len(users_df), len(games_df), len(recommendations_df))

# Save to /downsized
users_df.to_csv('data/downsized/users.csv', index=False)
games_df.to_csv('data/downsized/games.csv', index=False)
recommendations_df.to_csv('data/downsized/recommendations.csv', index=False)


4860086 2494 9000000
