This notebook has a single task: build a DataFrame with `all_ratio` and `developer` as its columns.

The notebook writes two CSV files: 'products_developer_counts.csv' (the raw parsed data) and 'df_all_ratio_vs_developer.csv'. Josh can then use 'df_all_ratio_vs_developer.csv' to plot all_ratio vs. developer.

In [1]:
def convertJLtoDataFrame(fname='products_all.jl', key_list_used=['all_ratio','developer','genres']):
    '''
    Convert the raw data in .jl (one JSON object per line) format to a DataFrame
    
    :fname: str name of a .jl file with data (defaults to 'products_all.jl' which is our data)
    
    :key_list_used: a list of strings that are the names of features in the data 
    (Defaults to the three features 'all_ratio, developer, genres' as a demo for our analysis)
    
    :returns: a DataFrame with one column per entry of key_list_used and one row
    per JSON record in the file. A feature that is missing from a record is
    stored as None. An empty file yields an empty DataFrame with those columns.
    '''
    
    # Validation is advisory only: a warning is printed and processing continues,
    # so callers that pass e.g. a tuple of keys keep working. Plain `if` checks
    # are used instead of assert so the warnings survive `python -O`.
    if not (isinstance(fname, str) and '.jl' in fname):
        print("fname is not a valid str name of a .jl file containing data!")
        
    if not isinstance(key_list_used, list):
        print("key_list_used is not a valid list of genres")
        
    # Parse each non-blank line as one JSON record; blank lines (such as a
    # trailing empty line) are skipped rather than crashing json.loads.
    with open(fname,'r',encoding='utf8') as f:
        data_list = [json.loads(line) for line in f if line.strip()]

    # One column per requested feature; dict.get returns None for features
    # a record does not have. key_list_used is only read, never mutated.
    dic_used = {
        key: [record.get(key) for record in data_list]
        for key in key_list_used
    }

    # Return dataframe of the data from the .jl file
    return pd.DataFrame(dic_used)

In [2]:
def parseRatioAndReviews(df):
    '''
    return a DataFrame with gaming apps which have an all_ratio
    
    :df: DataFrame with an 'all_ratio' column of review-summary strings
    
    :returns: the rows of df whose all_ratio text contains 'positive';
    the original index labels and all columns are kept
    '''
    
    # na=False treats rows with a missing all_ratio (None/NaN) as "no match".
    # Without it the mask contains NaN and boolean indexing raises ValueError.
    has_ratio = df.all_ratio.str.contains('positive', na=False)
    df_with_all_ratio = df[has_ratio]
    
    assert isinstance(df_with_all_ratio, pd.DataFrame)

    return df_with_all_ratio

In [3]:
def parseRatio_NreviewsAndDeveloper(df_with_all_ratio,fname='df_wordcount_100_most.csv'):
    '''return a DataFrame containing all_ratio and developer only
    
    :df_with_all_ratio: DataFrame with an 'all_ratio' column of review-summary
    strings and a 'developer' column
    
    :fname: unused; kept so existing callers that pass it keep working
    
    :returns: a new DataFrame (fresh 0..n-1 index) with two columns:
    'all_ratio' is the first whitespace-separated token of each all_ratio string,
    'developer' is the developer text before the first 'Publisher' marker
    (or, failing that, before 'Release'); non-string developers become None
    '''
    
    assert isinstance(df_with_all_ratio, pd.DataFrame)
    
    # The first token of the summary string is the ratio, e.g. '78%'.
    list_all_ratio = [ratio.split()[0] for ratio in df_with_all_ratio.all_ratio]
    
    list_all_Developer = []
    for raw_developer in df_with_all_ratio.developer:
        if not isinstance(raw_developer, str):
            # Missing developer (None/NaN) stays missing.
            list_all_Developer.append(None)
            continue
        
        # 'Publisher' takes precedence over 'Release', matching the scraped
        # layout '<developer>Publisher:<publisher>Release Date: <date>'.
        for marker in ('Publisher', 'Release'):
            cut = raw_developer.find(marker)
            if cut != -1:
                list_all_Developer.append(raw_developer[:cut])
                break
        else:
            # No marker at all: keep the whole string so every row contributes
            # exactly one entry and both columns stay the same length.
            list_all_Developer.append(raw_developer)
            
    return pd.DataFrame({'all_ratio': list_all_ratio, 'developer': list_all_Developer})
    

In [4]:
import pandas as pd
import json
from IPython.display import display, HTML
from collections import Counter

def main():
    '''
    Run the whole pipeline: load the default .jl data, save and display it,
    keep the rows that have a ratio, then save and display the
    all_ratio/developer table.
    
    Writes 'products_developer_counts.csv' and 'df_all_ratio_vs_developer.csv'
    to the current working directory.
    '''
    df_used = convertJLtoDataFrame()
    
    # Assert that a pandas DataFrame was returned
    assert isinstance(df_used, pd.DataFrame)
    
    # The data is loaded from the loader's default file, products_all.jl
    print('DataFrame of data parsed from products_all.jl')
    df_used.to_csv('products_developer_counts.csv')
    display(df_used)
    
    df_with_all_ratio = parseRatioAndReviews(df_used)
    
    df_all_ratio_developer = parseRatio_NreviewsAndDeveloper(df_with_all_ratio)
    df_all_ratio_developer.to_csv('df_all_ratio_vs_developer.csv')
    
    display(df_all_ratio_developer)
    

In [5]:
# Run the pipeline only when this code is executed directly (for example as a
# notebook cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    main()

DataFrame of data parsed from products_all.csv


Unnamed: 0,all_ratio,developer,genres
0,No user reviews,Daily Magic ProductionsPublisher:Big Fish Game...,"[Adventure, Casual]"
1,78% of the 19 user reviews for this game are p...,Pixelz GamesPublisher:CrytivoRelease Date: Apr...,"[Indie, Simulation, Strategy]"
2,No user reviews,EalsoftPublisher:EalsoftRelease Date: FALL 2018,"[Adventure, Casual, Indie]"
3,Need more user reviews to generate a score,,
4,No user reviews,Mojiken StudioPublisher:Toge ProductionsReleas...,"[Adventure, Casual, Indie]"
5,No user reviews,"Magic Pixel Kft.Release Date: Nov 9, 2018","[Action, Casual, Free to Play, Indie, Simulati..."
6,No user reviews,"Magic Pixel Kft.Release Date: Nov 9, 2018","[Action, Casual, Free to Play, Indie, Simulati..."
7,No user reviews,Nyan_FortPublisher:Nyan_FortRelease Date: Nov ...,"[Adventure, Indie]"
8,No user reviews,"Eren Aydin, Serhat YucekayaPublisher:Eren Aydi...","[Action, Adventure, Casual]"
9,No user reviews,Platonic Game StudioPublisher:Something2Releas...,"[Adventure, Indie, RPG]"


Unnamed: 0,all_ratio,developer
0,78%,Pixelz Games
1,83%,Outerlight Ltd.
2,79%,Egosoft
3,73%,Egosoft
4,57%,Strategy First
5,90%,Introversion Software
6,75%,Techland
7,100%,"PopCap Games, Inc."
8,85%,"PopCap Games, Inc."
9,77%,"PopCap Games, Inc."
