Create dataset from new heartless dataset (20240116)

Select a game for testing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns


from pathlib import Path
import random

random.seed(13)

dataset_heartless_path = Path('../../dataset/sa/dataset_heartless_20240116.pkl').resolve()

dataset = pd.read_pickle(dataset_heartless_path)
# dataset = dataset.sample(frac=p)      # no sampling is needed

# convert the text to string object
dataset['review_text'] = dataset['review_text'].astype('str')

# drop any duplicate just in case
dataset = dataset.drop_duplicates(keep='first', subset=['review_text', 'review_score', 'review_votes', 'app_id'])

# replace -1 to 0
# then 0 = negative, 1 = positive
# for easier processing
dataset['review_score'] = dataset['review_score'].replace(-1, 0)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4180148 entries, 0 to 4180147
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 191.4+ MB


---

Get unique reviews by 'review_text'

In [2]:
dataset_cp = dataset.copy()

# remove duplicate reviews by the same review in each game
dataset_cp['review_text'] = dataset_cp['review_text'].str.strip()

unique_list = ['review_text']
dataset_cp = dataset_cp.drop_duplicates(keep='first', subset=unique_list)

print(dataset_cp.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
Index: 4045065 entries, 0 to 4180147
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 216.0+ MB
None


In [5]:
# save the dataset

save_path = Path(f'../topic_modelling/00_dataset_filtered_all_{len(dataset_cp)}.pkl').resolve()
dataset_cp.to_pickle(save_path)

---

Get unique reviews by games

In [6]:
# check unique games and sort by number of reviews

dataset_games = dataset.copy()

# get the unique game names
unique_games_id = dataset_games['app_id'].unique()

# remove duplicate reviews by the same review in each game
dataset_games['review_text'] = dataset_games['review_text'].str.strip()

# unique_list = ['app_id', 'review_text']
unique_list = ['review_text']
dataset_games = dataset_games.drop_duplicates(subset=unique_list, keep='first')

# get the number of reviews for each game
unique_games_review_count = dataset_games.groupby(['app_id', 'app_name']).size().reset_index(name='review_count')

# sort the games by number of reviews
unique_games_review_count = unique_games_review_count.sort_values(by=['review_count'], ascending=False)

# get the top 10 games with most reviews
top_10_games = unique_games_review_count.head(10)

top_10_games


Unnamed: 0,app_id,app_name,review_count
1156,105600,Terraria,74368
1526,218620,PAYDAY 2,54075
5774,391540,Undertale,44657
22,570,Dota 2,41644
1691,230410,Warframe,40677
2092,252950,Rocket League,34902
21,550,Left 4 Dead 2,31202
240,8870,BioShock Infinite,26898
1163,107410,Arma 3,26707
23,620,Portal 2,26089


In [7]:
# save the reviews of each top 10 games to a separate pkl file

for n, (index, row) in enumerate(top_10_games.iterrows()):
    app_id = row['app_id']
    app_name = row['app_name']
    print(f'Processing {app_name}...')
    game_reviews = dataset.loc[dataset['app_id'] == app_id]

    save_path = Path(f'../topic_modelling/top_10_games_unique_[{",".join(unique_list)}]/{n:02}_{app_name}.pkl').resolve()
    if not save_path.parent.exists():
        save_path.parent.mkdir(parents=True)

    if save_path.exists():
        print(f'File {save_path} already exists, skipping...')
        continue
            
    game_reviews.to_pickle(save_path)

Processing Terraria...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_10_games_unique_[review_text]/00_Terraria.pkl already exists, skipping...
Processing PAYDAY 2...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_10_games_unique_[review_text]/01_PAYDAY 2.pkl already exists, skipping...
Processing Undertale...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_10_games_unique_[review_text]/02_Undertale.pkl already exists, skipping...
Processing Dota 2...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_10_games_unique_[review_text]/03_Dota 2.pkl already exists, skipping...
Processing Warframe...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_10_games_unique_[review_text]/04_Warframe.pkl already exists, skipping...
Processing Rocket League...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_10_games_unique_[review_text]/05_Rocket League.pkl already exists, skipping...
Processing Left 4 Dead 2...
File /root/FYP/N

---

Read the games available on steam

so that we can save reviews by genre or by category

First get the games scraped from Steam to identify the genres

In [8]:
# read the steam game list pickle (a json)

import pickle

steam_game_list_path = Path('../../dataset/steam-games/Steam Gamelist/apps_dict-ckpt-20231017143419.p').resolve()
excluded_apps_list_path = Path('../../dataset/steam-games/Steam Gamelist/excluded_apps_list-ckpt-20231017143419.p').resolve()

steam_game_list = pickle.load(open(steam_game_list_path, "rb"))
excluded_apps_list = pickle.load(open(excluded_apps_list_path, "rb"))

steam_game_list

{'2319510': {'type': 'game',
  'name': '幻想乡之玉 Playtest',
  'steam_appid': 2319510,
  'required_age': 0,
  'is_free': False,
  'detailed_description': '',
  'about_the_game': '',
  'short_description': '',
  'header_image': 'https://cdn.akamai.steamstatic.com/steam/apps/2319510/header.jpg?t=1693504538',
  'capsule_image': 'https://cdn.akamai.steamstatic.com/steam/apps/2319510/capsule_231x87.jpg?t=1693504538',
  'capsule_imagev5': 'https://cdn.akamai.steamstatic.com/steam/apps/2319510/capsule_184x69.jpg?t=1693504538',
  'website': None,
  'pc_requirements': [],
  'mac_requirements': [],
  'linux_requirements': [],
  'publishers': [''],
  'package_groups': [],
  'platforms': {'windows': True, 'mac': False, 'linux': False},
  'release_date': {'coming_soon': False, 'date': '31 Aug, 2023'},
  'support_info': {'url': '', 'email': ''},
  'background': '',
  'background_raw': '',
  'content_descriptors': {'ids': [], 'notes': None}},
 '2319550': {'type': 'game',
  'name': 'Spectrum Forces',
  's

In [9]:
genres = []
categories = []

number_of_games_with_genres = 0
number_of_games_with_categories = 0

for app_id, data in steam_game_list.items():
    if 'genres' in data:
        for genre in data['genres']:
            genres.append(genre)

        number_of_games_with_genres += 1


    # genres.append(data['genres'])
    if 'categories' in data:
        for category in data['categories']:
            categories.append(category)

        number_of_games_with_categories += 1
    # categories.append(data['categories'])
        
print("Total number of apps:", len(steam_game_list))
print("Number of apps with genres:", number_of_games_with_genres)
print("Number of apps with categories:", number_of_games_with_categories)

Total number of apps: 157068
Number of apps with genres: 139119
Number of apps with categories: 148299


In [10]:
# create a dataframe from the genres_dict
genres_dict = {'id': [], 'description': []}
for genre in genres:
    genres_dict['id'].append(genre['id'])
    genres_dict['description'].append(genre['description'])

genres_df = pd.DataFrame.from_dict(genres_dict)
genres_df.drop_duplicates(subset='id', keep='first', inplace=True)
genres_df['id'] = genres_df['id'].astype('int8')
genres_df.sort_values(by='id', inplace=True)
genres_df.reset_index(drop=True, inplace=True)
# reset the id to genre_id
genres_df.rename(columns={'id': 'genre_id'}, inplace=True)


genres_df

Unnamed: 0,genre_id,description
0,1,Action
1,2,Strategy
2,3,RPG
3,4,Casual
4,9,Racing
5,18,Sports
6,23,Indie
7,25,Adventure
8,28,Simulation
9,29,Massively Multiplayer


In [11]:
# create a dataframe from the categories_dict

categories_dict = {'id': [], 'description': []}
for category in categories:
    categories_dict['id'].append(category['id'])
    categories_dict['description'].append(category['description'])

categories_df = pd.DataFrame.from_dict(categories_dict)
categories_df.drop_duplicates(subset='id', keep='first', inplace=True)
categories_df['id'] = categories_df['id'].astype('int8')
categories_df.sort_values(by='id', inplace=True)
categories_df.reset_index(drop=True, inplace=True)
# rename column id to category_id
categories_df.rename(columns={'id': 'category_id'}, inplace=True)

categories_df

Unnamed: 0,category_id,description
0,1,Multi-player
1,2,Single-player
2,6,Mods (require HL2)
3,8,Valve Anti-Cheat enabled
4,9,Co-op
5,10,Game demo
6,13,Captions available
7,14,Commentary available
8,15,Stats
9,16,Includes Source SDK


---

Get the genre id and the category id of the game mentioned in each review

In [21]:
steam_app_id_set = set(steam_game_list.keys())

number_of_reviews_without_genre = 0
number_of_reviews_without_category = 0

# base on the id of the game, add all genre id and category id to the dataset

def get_all_genres(app_id):
    if str(app_id) in steam_app_id_set and 'genres' in steam_game_list[str(app_id)]:
        return [genre['id'] for genre in steam_game_list[str(app_id)]['genres']]
    else:
        global number_of_reviews_without_genre
        number_of_reviews_without_genre += 1
        return []
    
def get_all_categories(app_id):
    if str(app_id) in steam_app_id_set and 'categories' in steam_game_list[str(app_id)]:
        return [category['id'] for category in steam_game_list[str(app_id)]['categories']]
    else:
        global number_of_reviews_without_category
        number_of_reviews_without_category += 1
        return []


dataset['genre_id'] = dataset['app_id'].map(lambda x: get_all_genres(x))
dataset['category_id'] = dataset['app_id'].map(lambda x: get_all_categories(x))

dataset.head(10)

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id
0,0,10,Counter-Strike,Ruined my life.,1,0,[1],"[1, 49, 36, 37, 8]"
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,[1],"[1, 49, 36, 37, 8]"
2,2,10,Counter-Strike,This game saved my virginity.,1,0,[1],"[1, 49, 36, 37, 8]"
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,[1],"[1, 49, 36, 37, 8]"
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,[1],"[1, 49, 36, 37, 8]"
5,5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1,[1],"[1, 49, 36, 37, 8]"
6,6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1,[1],"[1, 49, 36, 37, 8]"
7,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1,[1],"[1, 49, 36, 37, 8]"
8,8,10,Counter-Strike,"Counter-Strike: Ok, after 9 years of unlimited...",1,1,[1],"[1, 49, 36, 37, 8]"
9,9,10,Counter-Strike,Every server is spanish or french. I can now f...,1,0,[1],"[1, 49, 36, 37, 8]"


In [22]:
print("Total nunber of reviews:", len(dataset))
print("Number of reviews without genre:", number_of_reviews_without_genre)
print("Number of reviews without category:", number_of_reviews_without_category)
print('\n\n')
print("Number of reviews with genre:", len(dataset) - number_of_reviews_without_genre)
print("Number of reviews with category:", len(dataset) - number_of_reviews_without_category)

Total nunber of reviews: 4180148
Number of reviews without genre: 2157503
Number of reviews without category: 2157439



Number of reviews with genre: 2022645
Number of reviews with category: 2022709


In [20]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4180148 entries, 0 to 4180147
Data columns (total 8 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   genre_id      object
 7   category_id   object
dtypes: int64(4), object(4)
memory usage: 255.1+ MB


In [18]:
# save the dataset with valid genre and category to a new pkl file


dataset_valid_genres = dataset[dataset['genre_id'].map(lambda x: len(x) > 0)]

# remove duplicate reviews by the same review in each game
unique_list = ['review_text']
dataset_valid_genres['review_text'] = dataset_valid_genres['review_text'].str.strip()
dataset_valid_genres = dataset_valid_genres.drop_duplicates(subset=unique_list, keep='first')

print(dataset_valid_genres.info(verbose=True))

save_path = Path(f'../topic_modelling/00_dataset_filtered_all_with_genre_{len(dataset_valid_genres)}.pkl').resolve()
dataset_valid_genres.to_pickle(save_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_valid_genres['review_text'] = dataset_valid_genres['review_text'].str.strip()


<class 'pandas.core.frame.DataFrame'>
Index: 1959922 entries, 0 to 4180147
Data columns (total 8 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   genre_id      object
 7   category_id   object
dtypes: int64(4), object(4)
memory usage: 134.6+ MB
None


In [19]:
# save the dataset with valid category to a new pkl file

dataset_valid_categories = dataset[dataset['category_id'].map(lambda x: len(x) > 0)]

# remove duplicate reviews by the same review in each game
unique_list = ['review_text']
dataset_valid_categories['review_text'] = dataset_valid_categories['review_text'].str.strip()
dataset_valid_categories = dataset_valid_categories.drop_duplicates(subset=unique_list, keep='first')

print(dataset_valid_categories.info(verbose=True))

save_path = Path(f'../topic_modelling/00_dataset_filtered_all_with_category_{len(dataset_valid_categories)}.pkl').resolve()
dataset_valid_categories.to_pickle(save_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_valid_categories['review_text'] = dataset_valid_categories['review_text'].str.strip()


<class 'pandas.core.frame.DataFrame'>
Index: 1959977 entries, 0 to 4180147
Data columns (total 8 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   genre_id      object
 7   category_id   object
dtypes: int64(4), object(4)
memory usage: 134.6+ MB
None


---

Group by genre id and category id and save reviews by top N genre/category.

In [12]:
# group by genre id and category id
# then count the number of reviews in each genre and category

a = pd.Series([item for sublist in dataset['genre_id'] for item in sublist])
genre_freq = a.value_counts().sort_index().rename_axis('genre_id').reset_index(name='f')
genre_freq['genre_id'] = genre_freq['genre_id'].astype('int8')

# merge with the genres_df to get the genre description
genre_freq = genre_freq.merge(genres_df, on='genre_id', how='left')

genre_freq.sort_values(by='f', ascending=False, inplace=True)
genre_freq.reset_index(drop=True, inplace=True)

# re-arrange the columns
genre_freq = genre_freq[['genre_id', 'description', 'f']]

genre_freq

Unnamed: 0,genre_id,description,f
0,1,Action,1314407
1,23,Indie,741913
2,25,Adventure,636492
3,3,RPG,545025
4,2,Strategy,409708
5,28,Simulation,260097
6,37,Free to Play,246372
7,4,Casual,209223
8,29,Massively Multiplayer,94777
9,9,Racing,25863


Since there can be duplicate reviews per genre (we only consider unique (genre, review_text) pair)

Create a df for each genre, then filter duplicate pair 

Then calculate the actual number of reviews within the genre

This ensures each review is viewed only once -> equal impact to the topic models

In [13]:
# top_11_genres = genre_freq.head(11)

genre_reviews_dflist = []

for n, (index, row) in enumerate(genre_freq.iterrows()):
    genre_id = row['genre_id']
    genre_description = row['description']
    print(f'Processing {genre_description}...')
    genre_reviews = dataset[dataset['genre_id'].map(lambda x: str(genre_id) in x)]

    genre_reviews_dflist.append(genre_reviews)
    

Processing Action...
Processing Indie...
Processing Adventure...
Processing RPG...
Processing Strategy...
Processing Simulation...
Processing Free to Play...
Processing Casual...
Processing Massively Multiplayer...
Processing Racing...
Processing Sports...
Processing Animation & Modeling...
Processing Video Production...
Processing Utilities...
Processing Design & Illustration...
Processing Gore...
Processing Nudity...
Processing Violent...
Processing Software Training...
Processing Education...
Processing Game Development...
Processing Web Publishing...
Processing Photo Editing...
Processing Audio Production...
Processing Early Access...
Processing Sexual Content...


In [31]:
# filter the reviews within the genre to contain only unique reviews content
# for topic modeling

unique_list = ['app_id','review_text']
# unique_list = ['review_text']


genre_reviews_dflist_cleaned = []

for df in genre_reviews_dflist:
    df_new = df.copy()
    df_new['review_text'] = df_new['review_text'].str.strip()
    df_new = df_new.drop_duplicates(subset=unique_list, keep='first')

    genre_reviews_dflist_cleaned.append(df_new)

In [32]:
# create df showing the number of reviews for each genre
genre_freq_updated = pd.DataFrame({'genre_id': [], 'description': [], 'f': []})

for n, df in enumerate(genre_reviews_dflist_cleaned):
    genre_freq_updated.loc[n] = [genre_freq.loc[n]['genre_id'], genre_freq.loc[n]['description'], len(df)]

genre_freq_updated

Unnamed: 0,genre_id,description,f
0,1,Action,1309869
1,23,Indie,740228
2,25,Adventure,634885
3,3,RPG,543638
4,2,Strategy,408375
5,28,Simulation,259407
6,37,Free to Play,244982
7,4,Casual,208868
8,29,Massively Multiplayer,94489
9,9,Racing,25843


In [33]:
# compare the number of reviews before and after cleaning

for n, (df, df_cleaned) in enumerate(zip(genre_reviews_dflist, genre_reviews_dflist_cleaned)):
    genre_description = genre_freq.iloc[n]['description']
    print(f'Number of reviews for {genre_description}: {len(df)}')
    print(f'Number of reviews for {genre_description} after cleaning: {len(df_cleaned)}')
    print(f'Number of reviews removed: {len(df) - len(df_cleaned)}')
    print()

Number of reviews for Action: 1314407
Number of reviews for Action after cleaning: 1309869
Number of reviews removed: 4538

Number of reviews for Indie: 741913
Number of reviews for Indie after cleaning: 740228
Number of reviews removed: 1685

Number of reviews for Adventure: 636492
Number of reviews for Adventure after cleaning: 634885
Number of reviews removed: 1607

Number of reviews for RPG: 545025
Number of reviews for RPG after cleaning: 543638
Number of reviews removed: 1387

Number of reviews for Strategy: 409708
Number of reviews for Strategy after cleaning: 408375
Number of reviews removed: 1333

Number of reviews for Simulation: 260097
Number of reviews for Simulation after cleaning: 259407
Number of reviews removed: 690

Number of reviews for Free to Play: 246372
Number of reviews for Free to Play after cleaning: 244982
Number of reviews removed: 1390

Number of reviews for Casual: 209223
Number of reviews for Casual after cleaning: 208868
Number of reviews removed: 355

Nu

In [34]:
# save the cleaned reviews to pickle files
# we only save the top 11 genres

top_11_genres = genre_freq_updated.head(11)

for n, ((index, row), genre_reviews) in enumerate(zip(top_11_genres.iterrows(), genre_reviews_dflist_cleaned)):

    genre_description = top_11_genres.iloc[n]['description']

    # string process on genre_description
    # convert them to all lowercase
    # replace space with underscore
    genre_description = genre_description.lower().replace(' ', '_')

    save_path = Path(f'../topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]/{n:02}_{genre_description}.pkl').resolve()
    if not save_path.parent.exists():
        save_path.parent.mkdir(parents=True)

    if save_path.exists():
        print(f'File {save_path} already exists, skipping...')
        continue
        
    genre_reviews.to_pickle(save_path)

    print(f'Saved {genre_description} to {save_path}')

File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/00_action.pkl already exists, skipping...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/01_indie.pkl already exists, skipping...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/02_adventure.pkl already exists, skipping...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/03_rpg.pkl already exists, skipping...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/04_strategy.pkl already exists, skipping...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/05_simulation.pkl already exists, skipping...
File /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[app_id,review_text]/06_free_to_play.pkl already exists, skipping...
Fi

In [35]:
# calculate number of unique comments included within the top 11 genres (b4 rm duplicates)

unique_comments = set()

for genre_review in genre_reviews_dflist:
    unique_comments.update(genre_review['index'].unique())

print(f'Total number of unique comments: {len(unique_comments)}')

Total number of unique comments: 2022645


In [36]:
# calculate number of unique comments of from a selected list of genres (b4 rm duplicates)

unique_comments = set()

sub_list_idx = [0, 1, 2, 7]     # selected genre

for genre_review in [genre_reviews_dflist[i] for i in sub_list_idx]:
    unique_comments.update(genre_review['index'].unique())

print(f'Total number of unique comments: {len(unique_comments)}')

Total number of unique comments: 1703792


In [37]:
# numbner of unique comments included within the top 11 genres (in cleaned df)

unique_comments = set()

for genre_review in genre_reviews_dflist_cleaned:
    unique_comments.update(genre_review['index'].unique())

print(f'Total number of unique comments: {len(unique_comments)}')

Total number of unique comments: 2016959


In [38]:
# number of unique comments of from a selected list of genres (in cleaned df)

unique_comments = set()

sub_list_idx = [0, 1, 2, 7]     # selected genre

for genre_review in [genre_reviews_dflist_cleaned[i] for i in sub_list_idx]:
    unique_comments.update(genre_review['index'].unique())

print(f'Total number of unique comments: {len(unique_comments)}')

Total number of unique comments: 1698598


In [None]:
# unique number of

Dataset Creation ends

---

In [11]:
# get the length of the genre list for each game and plot the distribution

dataset_copy = dataset.copy()

dataset_copy['genre_count'] = dataset_copy['genre_id'].map(lambda x: len(x))
dataset_copy['category_count'] = dataset_copy['category_id'].map(lambda x: len(x))

In [12]:
temp = dataset_copy.groupby('genre_count').count()['review_text'].reset_index().sort_values(by='genre_count',ascending=True)
temp.style.background_gradient(cmap='Purples')

Unnamed: 0,genre_count,review_text
0,0,2157503
1,1,641944
2,2,686657
3,3,399395
4,4,204480
5,5,47317
6,6,34558
7,7,8255
8,9,39


In [25]:
genre_reviews_dflist[7].head(10)

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id
18770,22105,10120,Aces of the Galaxy,Aces of the Galaxy is a classical rail shooter...,1,0,"[1, 4]",[2]
18771,22106,10120,Aces of the Galaxy,This game is a good old fashioned rail shooter...,1,0,"[1, 4]",[2]
18772,22107,10120,Aces of the Galaxy,Fun arcade type space battle game. Main story ...,1,0,"[1, 4]",[2]
18773,22108,10120,Aces of the Galaxy,A great little game that'll be of great joy to...,1,1,"[1, 4]",[2]
18774,22109,10120,Aces of the Galaxy,Aces of the Galaxy is a vibrant rail shooter. ...,0,0,"[1, 4]",[2]
18775,22110,10120,Aces of the Galaxy,This is the first On-Rail Shooter game i have ...,1,0,"[1, 4]",[2]
18776,22111,10120,Aces of the Galaxy,Decent-ish. Difficult to recommend one way or ...,1,0,"[1, 4]",[2]
18777,22112,10120,Aces of the Galaxy,Great arcade space-craft fighter shooty thingy...,1,0,"[1, 4]",[2]
18778,22113,10120,Aces of the Galaxy,running windows 7 w/a 360 controller and game ...,1,0,"[1, 4]",[2]
18779,22114,10120,Aces of the Galaxy,I cannot recommend this game - it just doesn't...,0,1,"[1, 4]",[2]


In [26]:
pd.read_pickle('../topic_modelling/top_11_genres/00_action.pkl')

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id
0,0,10,Counter-Strike,Ruined my life.,1,0,[1],"[1, 49, 36, 37, 8]"
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1,[1],"[1, 49, 36, 37, 8]"
2,2,10,Counter-Strike,This game saved my virginity.,1,0,[1],"[1, 49, 36, 37, 8]"
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0,[1],"[1, 49, 36, 37, 8]"
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1,[1],"[1, 49, 36, 37, 8]"
...,...,...,...,...,...,...,...,...
4179603,6416379,99900,Spiral Knights,Cool swingy sharp things.,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179605,6416381,99900,Spiral Knights,This game use to be good..until they did the r...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"


In [49]:
dataset['genre_id'].apply(lambda x: '1' in list(x))

0           True
1           True
2           True
3           True
4           True
           ...  
4891923    False
4891924    False
4891925    False
4891926    False
4891927    False
Name: genre_id, Length: 4180148, dtype: bool