In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn
from sklearn import metrics
import re

In [None]:
#Data source: https://www.kaggle.com/datasets/gauthamp10/google-playstore-apps
# Paths are assembled with os.path.join so the separators are correct on every OS.
# The trailing "" component keeps a trailing separator on data_folder, so any
# `data_folder + filename` concatenation keeps working.
data_folder = os.path.join(os.getcwd(), os.pardir, "data", "")
csv_name = "Google-Playstore.csv"
data_path = os.path.join(data_folder, csv_name)

In [None]:
# Load the raw Play Store export
df = pd.read_csv(data_path)

#Feature engineering (eng) starts
# These columns are left out of the engineered frame
eng_df = df.drop(
    columns=[
        'Installs',
        'Currency',
        'Minimum Installs',
        'Developer Website',
        'Developer Email',
        'Privacy Policy',
        'App Name',
    ]
)

# Take fixed-seed random samples of n rows from the dataframe
eng_df_s = eng_df.sample(n=10_000, random_state=123) #small
eng_df_t = eng_df.sample(n=1_000, random_state=123) #tiny
eng_df_tt = eng_df.sample(n=200, random_state=123) #teeny-tiny

#Select one for further use
# Note: this rebinds eng_df to the same object as eng_df_t (no copy is made)
eng_df = eng_df_t

In [None]:
#What kind of Size values exist, going by the text part of the string
# The pattern captures the trailing run of letters/whitespace of each value (possibly empty).
# It is a raw string because '\s' in a plain literal is an invalid escape sequence
# that triggers a warning on current Python versions; the compiled pattern is unchanged.
# The larger sample (eng_df_s) is inspected here, independent of the sample bound to eng_df.
size_cats = eng_df_s['Size'].apply(lambda x: re.findall(r'([\sA-Za-z]*$)', str(x))[0])
print(size_cats.value_counts())

#Knowing there are three kinds of known sizes (kB, MB, GB), we convert these to MB.
#But because there are also missing values or OS-dependent sizes, we store this categorically in an extra column
#(known sizes will be categorized as 'Known')
def process_Size(value):
    """Convert a raw Size value into a size in megabytes plus a category label.

    Parameters
    ----------
    value : object
        Raw cell value. It is converted with ``str`` first, so a missing
        value (NaN) is handled as the string ``'nan'``.

    Returns
    -------
    tuple
        ``(size, category)``. When the text ends in ``k``, ``M`` or ``G`` and
        the part before it parses as a number, ``size`` is that number scaled
        to megabytes (float) and ``category`` is ``'Known'``. In every other
        case ``size`` is ``None`` and ``category`` is the stringified value.
    """
    value = str(value)
    conversion = {'k': 0.001, 'M': 1.0, 'G': 1000.0}
    # Slice instead of index: an empty string yields '' here rather than IndexError
    unit = value[-1:]
    if unit in conversion:
        try:
            # Thousands separators (e.g. '1,018k') must be removed before float() can parse
            size = float(value[:-1].replace(',', '')) * conversion[unit]
        except ValueError:
            # Text that merely ends in a unit letter is not a size; keep it as a category
            return (None, value)
        return (size, 'Known')
    return (None, value)


# Guard on the derived column so that re-running this cell is a no-op. Without it, a second
# run would feed the already-converted floats back through process_Size: 10.0 becomes
# '10.0', no unit suffix is found, and every size is overwritten with None.
if 'Size_category' not in eng_df.columns:
    # Built on eng_df's own index so the column assignments below align row by row
    temp_df = pd.DataFrame(
        [process_Size(raw_size) for raw_size in eng_df['Size']],
        columns=['Size', 'Size_category'],
        index=eng_df.index,
    )
    eng_df['Size'] = temp_df['Size']
    eng_df['Size_category'] = temp_df['Size_category']
# temp_df is intentionally left bound in case another cell reads it

In [None]:
#Separate quantitative columns into a quantitative dataframe
q_df = eng_df[['Rating', 'Rating Count', 'Maximum Installs', 'Price', 'Size']]
# Outlier filter: a row is kept only if every column lies within this many standard
# deviations of its column mean. Comparisons against NaN evaluate to False, so rows with
# a missing value in any of these columns are dropped as well.
standard_deviations = 3
q_df = q_df[q_df.apply(lambda x: np.abs(x - x.mean()) / x.std() < standard_deviations)
   .all(axis=1)]

#Log-transform
#Needs to be edited so it's only applied to some columns (e.g. installs but not rating)
# Non-positive values are masked to NaN before the logarithm is taken. Calling
# np.log10(x, where=0<x) without an `out=` array leaves the entries where the condition
# is False uninitialized, i.e. they would contain arbitrary memory.
transf_q_df = q_df.apply(lambda x: np.log10(x.where(x > 0)))
sns.pairplot(transf_q_df, kind="kde")
#One by one alternative: sns.kdeplot / sns.histplot per column (sns.distplot is deprecated)

In [None]:
# Pairwise plots of the untransformed quantitative columns in q_df
#Only run this with sufficiently small sample
sns.pairplot(data=q_df)