In [7]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn
from sklearn import metrics
import re

In [8]:
# Data source: https://www.kaggle.com/datasets/gauthamp10/google-playstore-apps
path = os.getcwd() + "/data/Google-Playstore.csv"
df = pd.read_csv(path)

In [9]:
df = df.drop(['Scraped Time', 'Released', 'Developer Id', 'Installs', 'Currency', 'Minimum Installs',  'Size',
    'Developer Website', 'Developer Email', 'Privacy Policy', 'App Name', 'Minimum Android', 'Content Rating', 
    'Editors Choice'], axis=1)

In [10]:
cat_name = 'Super Category'
df[cat_name] = df['Category']
df[cat_name] = df[cat_name].replace(['Adventure', 'Racing', 'Puzzle', 'Entertainment', 'Arcade', 'Photography', 'Sports', 'Card', 'Trivia', 'Strategy', 'Action', 'Simulation', 'Casino', 'Comics'], 'Entertainment')
df[cat_name] = df[cat_name].replace(['Communication', 'Social', 'Events', 'Dating', 'Role Playing', 'Lifestyle', 'Personalization', 'Medical', 'Health & Fitness', 'Beauty', 'Parenting'], 'Personal & Social')
df[cat_name] = df[cat_name].replace(['Productivity', 'Books & Reference', 'Education', 'Business', 'Educational', 'Finance'], 'Development')
df[cat_name] = df[cat_name].replace(['Casual', 'Tools', 'Libraries & Demo', 'Maps & Navigation', 'Travel & Local', 'Food & Drink', 'Music', 'Auto & Vehicles','Shopping', 'Board', 'Music & Audio', 'News & Magazines', 'Art & Design', 'House & Home', 'Weather', 'Word', 'Video Players & Editors'], 'Other')
df[cat_name] = df[cat_name].astype('category')
df[cat_name].value_counts()

Other                736774
Development          668061
Personal & Social    460998
Entertainment        447111
Name: Super Category, dtype: int64

In [11]:
df['Rating Bin'] = pd.cut(df['Rating'],bins=[0,3.8,4.4,5], labels=[0,1,2])
# labels=['Bad App Yo', 'Moderate', 'Superb']

In [12]:
date_of_dataset = 2021
df['Time since last update'] = pd.to_datetime(df['Last Updated']).apply(lambda date: date_of_dataset - int(date.year))

In [13]:
# df = df.drop(['Last Updated', 'Rating', 'Category'], axis=1)

In [14]:
df.isnull().sum()

App Id                    0
Category                  0
Rating                22883
Rating Count          22883
Maximum Installs          0
Free                      0
Price                     0
Last Updated              0
Ad Supported              0
In App Purchases          0
Super Category            0
Rating Bin          1082645
dtype: int64

In [15]:
df = df.dropna()

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1230299 entries, 1 to 2312943
Data columns (total 12 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   App Id            1230299 non-null  object  
 1   Category          1230299 non-null  object  
 2   Rating            1230299 non-null  float64 
 3   Rating Count      1230299 non-null  float64 
 4   Maximum Installs  1230299 non-null  int64   
 5   Free              1230299 non-null  bool    
 6   Price             1230299 non-null  float64 
 7   Last Updated      1230299 non-null  object  
 8   Ad Supported      1230299 non-null  bool    
 9   In App Purchases  1230299 non-null  bool    
 10  Super Category    1230299 non-null  category
 11  Rating Bin        1230299 non-null  category
dtypes: bool(3), category(2), float64(3), int64(1), object(3)
memory usage: 81.0+ MB


In [17]:
# minder dan 100.000 installs en minstens 10 ratings
df = df[(df['Maximum Installs'] < 100000) & (df['Rating Count'] >= 10)]

In [18]:
df['Rating Bin'].value_counts()

1    292342
2    280900
0    232780
Name: Rating Bin, dtype: int64

**Code below is still in development**

In [19]:
def prepare_inputs(encoder_type, feature):
    try:
        ohe = encoder_type
        return df.join(pd.DataFrame(data=ohe.fit_transform(df[[feature]]).toarray(), dtype=np.int8, 
            columns=list(df[feature].unique())))
    except:
        print("Get right column")

# reset index to be able to perform the join
df = df.reset_index(drop=True)

# one hot encoder
# df = prepare_inputs(sklearn.preprocessing.OneHotEncoder(), 'Rating Bin')
df = prepare_inputs(sklearn.preprocessing.OneHotEncoder(), 'Super Category')

In [20]:
df.columns

Index(['App Id', 'Category', 'Rating', 'Rating Count', 'Maximum Installs',
       'Free', 'Price', 'Last Updated', 'Ad Supported', 'In App Purchases',
       'Super Category', 'Rating Bin', 'Other', 'Personal & Social',
       'Entertainment', 'Development'],
      dtype='object')

In [21]:
df = df.drop(['Last Updated', 'Rating', 'Category', 'Super Category', 'Other'], axis=1)
# 'Rating Bin'

In [22]:
df.head()

Unnamed: 0,App Id,Rating Count,Maximum Installs,Free,Price,Ad Supported,In App Purchases,Rating Bin,Personal & Social,Entertainment,Development
0,com.webserveis.batteryinfo,64.0,7662,True,0.0,True,False,1,0,1,0
1,getfreedata.superfatiza.unlimitedjiodataprank,12.0,2567,True,0.0,True,False,2,0,1,0
2,com.mozaix.simoneboard,39.0,702,True,0.0,False,False,0,0,0,1
3,com.ikeyboard.theme.neon_3d.iron.tech,820.0,62433,True,0.0,True,False,2,0,0,1
4,com.MrScratchEnterprises.CarDogeGame,55.0,329,True,0.0,False,False,2,1,0,0


In [23]:
df['Rating Count'] = df['Rating Count'].astype('int')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806022 entries, 0 to 806021
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   App Id             806022 non-null  object  
 1   Rating Count       806022 non-null  int32   
 2   Maximum Installs   806022 non-null  int64   
 3   Free               806022 non-null  bool    
 4   Price              806022 non-null  float64 
 5   Ad Supported       806022 non-null  bool    
 6   In App Purchases   806022 non-null  bool    
 7   Rating Bin         806022 non-null  category
 8   Personal & Social  806022 non-null  int8    
 9   Entertainment      806022 non-null  int8    
 10  Development        806022 non-null  int8    
dtypes: bool(3), category(1), float64(1), int32(1), int64(1), int8(3), object(1)
memory usage: 26.9+ MB


In [25]:
# df.to_parquet('data\Google-Playstore-Modified.parquet')