This notebook completes two tasks: it computes the counts of the 10 most dominant developers across all gaming apps, and the counts of the 10 most dominant developers among the 100 most popular gaming apps.

After the two CSV files 'df_developercount_all.csv' and 'df_developercount_100_most.csv' have been generated, Josh can use them to plot counts vs. developers.

In [1]:
def convertJLtoDataFrame(fname='products_all.jl', key_list_used=['all_ratio','developer','genres']):
    '''
    Convert the raw data in .jl (one JSON object per line) format to a DataFrame.

    Every non-blank line of the file is parsed with json.loads and one column is
    built per requested key. A record that lacks a key contributes None to that
    column.

    :fname: str name of a .jl file with data (defaults to 'products_all.jl' which is our data)

    :key_list_used: a list of strings that are the names of features in the data
    (defaults to the three features 'all_ratio', 'developer', 'genres' as a demo for our analysis)

    :return: DataFrame with one column per entry of key_list_used and one row per
    parsed record; a file without records gives an empty DataFrame that still
    carries those columns

    :raises: whatever open() raises for an unreadable file, and
    json.JSONDecodeError for a non-blank line that is not valid JSON
    '''
    # Argument checks are advisory only: a message is printed and processing
    # continues, so existing callers never see a new exception from them.
    if not (isinstance(fname, str) and '.jl' in fname):
        print("fname is not a valid str name of a .jl file containing data!")

    if not isinstance(key_list_used, list):
        print("key_list_used is not a valid list of genres")

    # Parse line by line; blank lines (e.g. a trailing newline at the end of the
    # file) carry no record and are skipped instead of crashing json.loads.
    records = []
    with open(fname, 'r', encoding='utf8') as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))

    # dict.get returns None for features a record does not have.
    columns = {key: [record.get(key) for record in records] for key in key_list_used}

    # Return dataframe of the data from the .jl file
    return pd.DataFrame(columns)

In [2]:
def parseRatioAndReviews(df):
    '''
    Return the gaming apps whose all_ratio text reports a share of positive reviews.

    :df: DataFrame with an 'all_ratio' column of review-summary strings; entries
    may be missing (None / NaN)

    :return: tuple (df_with_all_ratio, list_all_ratio). df_with_all_ratio holds the
    rows of df whose all_ratio contains the word 'positive'. list_all_ratio holds,
    in the same row order, the first whitespace-separated token of each of those
    strings (e.g. '78%').
    '''
    # na=False makes rows with a missing all_ratio count as "no match"; without
    # it the mask contains NaN and boolean indexing raises ValueError.
    has_ratio = df.all_ratio.str.contains('positive', na=False)
    df_with_all_ratio = df[has_ratio]

    assert isinstance(df_with_all_ratio, pd.DataFrame)

    # Only the leading token is returned to callers, so nothing else is parsed
    # out of the summary text.
    list_all_ratio = [summary.split()[0] for summary in df_with_all_ratio.all_ratio]

    return df_with_all_ratio, list_all_ratio

In [3]:
def _developer_name_only(raw):
    '''Cut the trailing 'Publisher...' / 'Release...' text off a raw developer string.'''
    # 'Publisher' is tried first; 'Release' is only used when 'Publisher' is absent.
    for marker in ('Publisher', 'Release'):
        cut = raw.find(marker)
        if cut != -1:
            return raw[:cut]
    # No marker at all: the whole string is the developer name.
    return raw


def parseDeveloper_all(df_with_all_ratio):
    '''
    Count how often each developer occurs and return the 10 most common ones.

    The raw 'developer' strings have publisher and release-date text appended
    (e.g. 'Pixelz GamesPublisher:Crytivo...'); everything from the first
    'Publisher' marker (or, failing that, the first 'Release' marker) onwards is
    cut off. A string with neither marker is counted under its full text.
    Non-string entries (missing developers) are counted under the key None.

    :df_with_all_ratio: a DataFrame with a 'developer' column

    :return: DataFrame with columns 'word' (developer name) and 'count', holding
    at most 10 rows ordered from most to least frequent
    '''
    assert isinstance(df_with_all_ratio, pd.DataFrame)

    developers = [
        _developer_name_only(raw) if isinstance(raw, str) else None
        for raw in df_with_all_ratio.developer
    ]

    top_ten = Counter(developers).most_common(10)

    return pd.DataFrame.from_dict({
        'word': [name for name, _ in top_ten],
        'count': [count for _, count in top_ten],
    })

In [4]:
def parseDeveloper_100(df_all_apps,fname='top_100_file.csv'):
    '''
    Count the developers of the games listed in a top-games CSV file and return
    the 10 most common ones.

    Each entry of the CSV's 'Game' column is looked up in df_all_apps.app_name;
    the first matching row supplies the raw developer string. Publisher and
    release-date text is cut off at the first 'Publisher' marker (or, failing
    that, the first 'Release' marker); a string with neither marker is counted
    under its full text. Games that are not in df_all_apps, or whose developer
    is missing (None / NaN), are skipped. Any other non-string developer value
    is counted under the key None.

    :df_all_apps: a DataFrame with 'app_name' and 'developer' columns
    :fname: str name of a .csv file with a 'Game' column (defaults to 'top_100_file.csv')

    :return: DataFrame with columns 'word' (developer name) and 'count', holding
    at most 10 rows ordered from most to least frequent
    '''
    # Advisory check only: a message is printed and processing continues.
    if not (isinstance(fname, str) and '.csv' in fname):
        print("fname is not a valid str name of a .csv file containing data!")

    assert isinstance(df_all_apps, pd.DataFrame)

    df_most_100 = pd.read_csv(fname)

    # Map each app name to the developer of its FIRST occurrence. Pairing the
    # two columns positionally keeps the lookup correct for any DataFrame index,
    # and one pass replaces rebuilding and scanning the name list per game.
    first_developer = {}
    for app_name, developer in zip(df_all_apps.app_name, df_all_apps.developer):
        first_developer.setdefault(app_name, developer)

    list_all_Developer = []
    for game in df_most_100.Game:
        if game not in first_developer:
            continue
        developer = first_developer[game]

        if isinstance(developer, str):
            # 'Publisher' takes priority over 'Release'; with neither marker
            # the whole string is the developer name.
            cut = developer.find('Publisher')
            if cut == -1:
                cut = developer.find('Release')
            list_all_Developer.append(developer if cut == -1 else developer[:cut])
        elif developer is None or (isinstance(developer, float) and developer != developer):
            # Missing developer (None, or NaN where pandas converted it): skip the game.
            continue
        else:
            list_all_Developer.append(None)

    top_ten = Counter(list_all_Developer).most_common(10)

    return pd.DataFrame.from_dict({
        'word': [name for name, _ in top_ten],
        'count': [count for _, count in top_ten],
    })

In [5]:
import pandas as pd
import json
from IPython.display import display, HTML
from collections import Counter

def main():
    '''
    Run the whole developer-count pipeline.

    Writes three CSV files and displays the matching DataFrames:
    'products_developer_counts.csv' (the parsed raw data),
    'df_developercount_all.csv' (top developers over all apps that have an
    all_ratio) and 'df_developercount_100_most.csv' (top developers over the
    games listed in the top-games file).
    '''
    def export(frame, csv_name):
        # Save first, then show, for every table this notebook produces.
        frame.to_csv(csv_name)
        display(frame)

    raw_frame = convertJLtoDataFrame()

    # Assert that a pandas DataFrame was returned
    assert isinstance(raw_frame, pd.DataFrame)

    # NOTE(review): the message names products_all.csv, while the default input
    # of convertJLtoDataFrame is 'products_all.jl' - confirm the intended wording.
    print('DataFrame of data parsed from products_all.csv')
    export(raw_frame, 'products_developer_counts.csv')

    # Only the filtered frame is needed here; the list of ratios is discarded.
    rated_apps, _ = parseRatioAndReviews(raw_frame)
    export(parseDeveloper_all(rated_apps), 'df_developercount_all.csv')

    # The top-games lookup needs app names, so the data is read again with a
    # different set of features.
    named_apps = convertJLtoDataFrame(key_list_used=['app_name','developer','genres'])
    export(parseDeveloper_100(named_apps), 'df_developercount_100_most.csv')

In [6]:
# Entry point: main() runs only when __name__ is "__main__", i.e. not when
# these definitions are imported from another module.
if __name__ == "__main__":
    main()

DataFrame of data parsed from products_all.csv


Unnamed: 0,all_ratio,developer,genres
0,No user reviews,Daily Magic ProductionsPublisher:Big Fish Game...,"[Adventure, Casual]"
1,78% of the 19 user reviews for this game are p...,Pixelz GamesPublisher:CrytivoRelease Date: Apr...,"[Indie, Simulation, Strategy]"
2,No user reviews,EalsoftPublisher:EalsoftRelease Date: FALL 2018,"[Adventure, Casual, Indie]"
3,Need more user reviews to generate a score,,
4,No user reviews,Mojiken StudioPublisher:Toge ProductionsReleas...,"[Adventure, Casual, Indie]"
5,No user reviews,"Magic Pixel Kft.Release Date: Nov 9, 2018","[Action, Casual, Free to Play, Indie, Simulati..."
6,No user reviews,"Magic Pixel Kft.Release Date: Nov 9, 2018","[Action, Casual, Free to Play, Indie, Simulati..."
7,No user reviews,Nyan_FortPublisher:Nyan_FortRelease Date: Nov ...,"[Adventure, Indie]"
8,No user reviews,"Eren Aydin, Serhat YucekayaPublisher:Eren Aydi...","[Action, Adventure, Casual]"
9,No user reviews,Platonic Game StudioPublisher:Something2Releas...,"[Adventure, Indie, RPG]"


Unnamed: 0,word,count
0,,1570
1,Dovetail Games,160
2,Paradox Development Studio,107
3,SCS Software,73
4,"KOEI TECMO GAMES CO., LTD.",62
5,Choice of Games,49
6,Milestone S.r.l.,35
7,Arc System Works,35
8,Rebellion,34
9,Haemimont Games,30


Unnamed: 0,word,count
0,Paradox Development Studio,4
1,Sports Interactive,2
2,Visual Concepts,2
3,Klei Entertainment,2
4,Valve,2
5,Wargaming Group Limited,2
6,Konami Digital Entertainment,2
7,PUBG Corporation,1
8,"Studio Wildcard, Instinct Games, Efecto Studio...",1
9,"CAPCOM Co., Ltd.",1
