This notebook performs two tasks: (1) count the 10 most prevalent genres across all gaming apps, and (2) count the 10 most prevalent genres among the 100 most popular gaming apps.

The results are written to two CSV files, 'df_wordcount_all.csv' and 'df_wordcount_100_most.csv', which Josh can then use to plot counts vs. genres.

In [1]:
def convertJLtoDataFrame(fname='products_all.jl', key_list_used=['all_ratio','platform','genres']):
    '''
    Convert the raw data in .jl (one JSON object per line) format to a DataFrame

    :fname: str name of a .jl file with data (defaults to 'products_all.jl' which is our data)

    :key_list_used: a list of strings that are the name of features in the data
    (Defaults to just three features 'all_ratio, platform, genres' as a demo for our analysis)

    :return: a DataFrame with one column per entry of key_list_used and one row per
    non-blank line of the file. A feature that is absent from a record is stored as None.
    An empty file gives an empty DataFrame that still carries the requested columns.
    '''

    # The argument checks are advisory: they print a warning and execution continues,
    # so a caller passing e.g. a tuple of keys still gets a result.
    if not (isinstance(fname, str) and '.jl' in fname):
        print("fname is not a valid str name of a .jl file containing data!")

    if not isinstance(key_list_used, list):
        print("key_list_used is not a valid list of genres")

    # Parse the file line by line; blank lines (e.g. a trailing newline at the
    # end of the file) are skipped instead of being handed to json.loads.
    records = []
    with open(fname, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))

    # One column per requested feature; dict.get returns None for features
    # that a record does not have. key_list_used is only read, never mutated,
    # so the shared default list is safe.
    dic_used = {key: [record.get(key) for record in records] for key in key_list_used}

    # Return dataframe of the data from the .jl file
    return pd.DataFrame(dic_used)

In [2]:
def parseRatioAndReviews(df):
    '''
    Keep only the gaming apps whose all_ratio text contains 'positive'

    :df: a DataFrame with an 'all_ratio' column of strings; missing values are allowed
    and are treated as "no ratio"

    :return: a tuple (df_with_all_ratio, list_all_ratio) where df_with_all_ratio is the
    filtered DataFrame and list_all_ratio holds the first whitespace-separated token of
    each kept all_ratio string (e.g. '78%'), in row order
    '''
    assert isinstance(df, pd.DataFrame)

    # na=False maps missing all_ratio values to False. Without it the mask
    # contains NaN and boolean indexing raises a ValueError.
    has_ratio = df.all_ratio.str.contains('positive', na=False)
    df_with_all_ratio = df[has_ratio]

    list_all_ratio = [text.split()[0] for text in df_with_all_ratio.all_ratio]

    return df_with_all_ratio, list_all_ratio

In [3]:
def parseGenres_all(df_with_all_ratio, top_n=10, include_missing=True):
    '''
    Count how often each genre occurs and return the most common ones

    :df_with_all_ratio: a DataFrame with a 'genres' column whose entries are lists of
    genre names; any entry that is not a list is treated as "missing"

    :top_n: how many of the most common genres to return (defaults to 10;
    None returns every genre)

    :include_missing: when True (the default) every non-list entry adds one to a
    None "genre", which can therefore appear in the result; pass False to leave
    those rows out of the counts

    :return: a DataFrame with columns 'word' (the genre) and 'count', ordered from
    most to least common
    '''
    assert isinstance(df_with_all_ratio, pd.DataFrame)

    genre_counts = Counter()
    for genres in df_with_all_ratio.genres:
        if isinstance(genres, list):
            genre_counts.update(genres)
        elif include_missing:
            genre_counts[None] += 1

    top = genre_counts.most_common(top_n)

    dic_wordcount = {
        'word': [word for word, _ in top],
        'count': [count for _, count in top],
    }
    return pd.DataFrame.from_dict(dic_wordcount)

In [4]:
def parseGenres_100(df_all_apps, fname='top_100_file.csv', top_n=10):
    '''
    Count the genres of the games listed in a "most popular" .csv file

    :df_all_apps: a DataFrame with an 'app_name' column and a 'genres' column
    (lists of genre names, or None when an app has no genres)

    :fname: str name of a .csv file with a 'Game' column naming the most popular
    games (defaults to 'top_100_file.csv')

    :top_n: how many of the most common genres to return (defaults to 10)

    :return: a DataFrame with columns 'word' (the genre) and 'count', ordered from
    most to least common. Games that are not in df_all_apps, or whose genres are
    None, do not contribute.
    '''

    # Advisory check only: a warning is printed and execution continues.
    if not (isinstance(fname, str) and '.csv' in fname):
        print("fname is not a valid str name of a .csv file containing data!")

    assert isinstance(df_all_apps, pd.DataFrame)

    # Read the file the caller asked for (the name used to be hard-coded).
    df_most_100 = pd.read_csv(fname)

    # Map each app name to the genres of its first occurrence. Built once and
    # paired row by row, so it does not depend on the DataFrame's index labels.
    genres_by_app = {}
    for app_name, genres in zip(df_all_apps.app_name, df_all_apps.genres):
        genres_by_app.setdefault(app_name, genres)

    genre_counts = Counter()
    for game in df_most_100.Game:
        if game not in genres_by_app:
            continue
        genres = genres_by_app[game]
        if genres is None:
            continue
        if isinstance(genres, list):
            genre_counts.update(genres)
        else:
            # A non-list, non-None entry is counted under the None "genre".
            genre_counts[None] += 1

    top = genre_counts.most_common(top_n)

    dic_wordcount = {
        'word': [word for word, _ in top],
        'count': [count for _, count in top],
    }
    return pd.DataFrame.from_dict(dic_wordcount)

In [5]:
import pandas as pd
import json
from IPython.display import display, HTML
from collections import Counter

def main():
    '''
    Run the whole analysis: load the raw data, then write and display the
    genre counts for all rated apps and for the most popular apps.

    Writes 'products_all.csv', 'df_wordcount_all.csv' and 'df_wordcount_100_most.csv'.
    '''
    # Step 1: raw data with the default features (all_ratio, platform, genres)
    ratings_frame = convertJLtoDataFrame()
    assert isinstance(ratings_frame, pd.DataFrame)  # loader must hand back a DataFrame

    print('DataFrame of data parsed from products_all.csv')
    ratings_frame.to_csv('products_all.csv')
    display(ratings_frame)

    # Step 2: genre counts over every app that has a review ratio
    # (the list of ratio strings is not needed here)
    rated_apps, _ = parseRatioAndReviews(ratings_frame)
    genre_counts_all = parseGenres_all(rated_apps)
    genre_counts_all.to_csv('df_wordcount_all.csv')
    display(genre_counts_all)

    # Step 3: genre counts over the most popular apps, matched by app name
    named_apps = convertJLtoDataFrame(key_list_used=['app_name','genres'])
    genre_counts_popular = parseGenres_100(named_apps)
    genre_counts_popular.to_csv('df_wordcount_100_most.csv')
    display(genre_counts_popular)

In [6]:
# Run the analysis when this cell is executed in the notebook or as a script,
# but not when the code is imported as a module.
if __name__ == "__main__":
    main()

DataFrame of data parsed from products_all.csv


Unnamed: 0,all_ratio,platform,genres
0,No user reviews,"Windows,","[Adventure, Casual]"
1,78% of the 19 user reviews for this game are p...,"Windows,","[Indie, Simulation, Strategy]"
2,No user reviews,"Mac,Windows,","[Adventure, Casual, Indie]"
3,Need more user reviews to generate a score,"Windows,",
4,No user reviews,"Mac,Windows,","[Adventure, Casual, Indie]"
5,No user reviews,"SteamOS,Linux,Mac,Windows,","[Action, Casual, Free to Play, Indie, Simulati..."
6,No user reviews,"SteamOS,Linux,Mac,Windows,","[Action, Casual, Free to Play, Indie, Simulati..."
7,No user reviews,"Windows,","[Adventure, Indie]"
8,No user reviews,"Windows,","[Action, Adventure, Casual]"
9,No user reviews,"Windows,","[Adventure, Indie, RPG]"


Unnamed: 0,word,count
0,Indie,9770
1,Action,6211
2,Adventure,5328
3,Casual,4298
4,Strategy,3470
5,Simulation,3399
6,RPG,2789
7,,1529
8,Free to Play,1288
9,Early Access,1156


Unnamed: 0,word,count
0,Action,21
1,Simulation,20
2,Indie,16
3,Free to Play,14
4,Adventure,13
5,Massively Multiplayer,12
6,Strategy,12
7,RPG,10
8,Early Access,9
9,Sports,6
