### Import Libraries

In [2]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from rake_nltk import Rake

In [3]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [4]:
df = pd.read_csv("data/clean_appstore_games.csv")
df.head(3)

Unnamed: 0,url,id,name,subtitle,icon_url,average_user_rating,user_rating_count,price,in-app_purchases,description,developer,age_rating,languages,size,primary_genre,genres,original_release_date,current_version_release_date
0,https://apps.apple.com/us/app/sudoku/id284921427,284921427,Sudoku,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,4.0,3553.0,2.99,0.0,"Join over 21,000,000 of our fans and download ...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",15853568.0,Games,"Games, Strategy, Puzzle",11/07/2008,30/05/2017
1,https://apps.apple.com/us/app/reversi/id284926400,284926400,Reversi,,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,3.5,284.0,1.99,0.0,"The classic game of Reversi, also known as Oth...",Kiss The Machine,4+,EN,12328960.0,Games,"Games, Strategy, Board",11/07/2008,17/05/2018
2,https://apps.apple.com/us/app/morocco/id284946595,284946595,Morocco,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,3.0,8376.0,0.0,0.0,Play the classic strategy game Othello (also k...,Bayou Games,4+,EN,674816.0,Games,"Games, Board, Strategy",11/07/2008,5/09/2017


In [5]:
df.columns

Index(['url', 'id', 'name', 'subtitle', 'icon_url', 'average_user_rating',
       'user_rating_count', 'price', 'in-app_purchases', 'description',
       'developer', 'age_rating', 'languages', 'size', 'primary_genre',
       'genres', 'original_release_date', 'current_version_release_date'],
      dtype='object')

### Creating a combined text column
This combined text is created for future tokenizing and later used as the base of the app evaluator.

In [6]:
df["combo_text"] = df["name"] + " " + df["subtitle"] + " " + df["description"]

In [7]:
df.sample(3)["combo_text"]

3232     Dominion Card Randomiser   "Dominion is a deck...
6594     Farm Mania 3: Hot Vacation Cool village farmin...
10503    Backgammon Free with Friends: Online Live Game...
Name: combo_text, dtype: object

### Distiguish Free and Paid Apps

In [8]:
df["is_it_free"] = df["price"].map(lambda x: 1 if x > 0.0 else 0)

In [9]:
df.sample(5)[["price", "is_it_free"]]

Unnamed: 0,price,is_it_free
2097,0.0,0
15866,0.0,0
16410,0.0,0
10838,0.0,0
11959,0.0,0


### Create in-app puerchases sum column

In [10]:
def sum_in_app_purch(the_str):
    x = the_str.split(",")
    sum_i = 0
    for i in range(len(x)):
        sum_i += float(x[i])
    return sum_i

In [11]:
df["in-app_purch_sum"] = df["in-app_purchases"].map(sum_in_app_purch)

In [12]:
df.sample(5)[["in-app_purchases", "in-app_purch_sum"]]

Unnamed: 0,in-app_purchases,in-app_purch_sum
8710,0.00,0.0
5876,"0.99, 19.99, 4.99, 9.99",35.96
14175,0.00,0.0
7584,0.00,0.0
9123,"4.99, 2.99, 4.99, 2.99, 9.99, 4.99, 4.99, 2.99...",49.9


### Distinguish apps with/without in-app purchases

In [13]:
df["has_in-app_purch"] = df["in-app_purch_sum"].map(lambda x: 1 if x != 0.00 else 0)
df.sample(5)[["in-app_purch_sum", "has_in-app_purch"]]

Unnamed: 0,in-app_purch_sum,has_in-app_purch
3918,23.95,1
6435,0.0,0
7460,9.9,1
15555,2.99,1
7044,0.0,0


Finding the 3 apps that had a sum of 0 for in-app purchases but had a different value than just "0.00" to be corrected in "1. Exploring Data"

In [14]:
df.loc[(df["has_in-app_purch"] == 0) & (df["in-app_purchases"] != "0.00")]

Unnamed: 0,url,id,name,subtitle,icon_url,average_user_rating,user_rating_count,price,in-app_purchases,description,...,languages,size,primary_genre,genres,original_release_date,current_version_release_date,combo_text,is_it_free,in-app_purch_sum,has_in-app_purch


### Define whether an app is successful or not
The apps with 4 or more in average user rating will be classified as successful (1) and the ones lower as not succesfull (0)

In [15]:
df["successful_app"] = df["average_user_rating"].map(lambda x: 1 if x >= 4.0 else 0)

In [16]:
df.sample(5)[["average_user_rating", "successful_app"]]

Unnamed: 0,average_user_rating,successful_app
16769,0.0,0
6500,4.0,1
11020,5.0,1
8341,4.5,1
9305,3.5,0


### Combo Text to lowercase

In [17]:
def lowering(text):
    return text.lower()

In [18]:
df["combo_text"] = df["combo_text"].map(lowering)
df.sample(3)["combo_text"]

7267     unite lol   "connect with nearby league of leg...
16212    bubble shooter - baby angel new game bubble sh...
15243    merge bakery serve everyone tasty cakes! ranke...
Name: combo_text, dtype: object

### Tokenize

In [19]:
tokenizer = RegexpTokenizer(r'\w+')
def tokens(combo):
    return tokenizer.tokenize(combo)

In [20]:
df["tokens"] = df["combo_text"].map(tokens)
df.sample(5)[["combo_text", "tokens"]]

Unnamed: 0,combo_text,tokens
1320,jewel world skull edition amazing match 3 game...,"[jewel, world, skull, edition, amazing, match,..."
13788,math tile - aim 10240 aim: 10240 math tile is ...,"[math, tile, aim, 10240, aim, 10240, math, til..."
2930,"flippix travel - spain ""take a peek at our f...","[flippix, travel, spain, take, a, peek, at, ou..."
7933,"wordgrabtv ""if you're a fan of word games, y...","[wordgrabtv, if, you, re, a, fan, of, word, ga..."
13465,"animaze! ""bring rival animals together in ha...","[animaze, bring, rival, animals, together, in,..."


### Stemming

In [21]:
stemmer = PorterStemmer()
def stemming(tokens):
    return [stemmer.stem(token) for token in tokens]

In [22]:
df["stems"] = df["tokens"].progress_map(stemming)
df.sample(5)[["tokens", "stems"]]

HBox(children=(IntProgress(value=0, max=16995), HTML(value='')))




Unnamed: 0,tokens,stems
4942,"[magic, cats, journey, arcade, match, 3, game,...","[magic, cat, journey, arcad, match, 3, game, a..."
10961,"[flappy, chains, free, tap, flap, fly, through...","[flappi, chain, free, tap, flap, fli, through,..."
13358,"[security, wall, construction, sim, security, ...","[secur, wall, construct, sim, secur, wall, is,..."
6584,"[eisblock, welt, herr, fisch, f, xe4ngt, die, ...","[eisblock, welt, herr, fisch, f, xe4ngt, die, ..."
9705,"[don, t, feed, the, fat, chicken, funny, game,...","[don, t, feed, the, fat, chicken, funni, game,..."


### Lemmatizing

In [23]:
lemmatizer = WordNetLemmatizer()
def lemmatizing(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [24]:
df["lems"] = df["tokens"].progress_map(lemmatizing)
df.sample(5)[["tokens", "lems"]]

HBox(children=(IntProgress(value=0, max=16995), HTML(value='')))




Unnamed: 0,tokens,lems
13723,"[heroes, of, magic, card, battle, do, you, lik...","[hero, of, magic, card, battle, do, you, like,..."
10568,"[clickipedia, clickipedia, wikipedia, race, ga...","[clickipedia, clickipedia, wikipedia, race, ga..."
15412,"[kwar, uff1a, u5947, u6a02, u661f, u969b, u623...","[kwar, uff1a, u5947, u6a02, u661f, u969b, u623..."
12990,"[10, dotz, logic, dot, puzzle, simple, relaxin...","[10, dotz, logic, dot, puzzle, simple, relaxin..."
1634,"[tik, tak, attack, there, are, four, in, a, ro...","[tik, tak, attack, there, are, four, in, a, ro..."


### Keyword Creation

In [25]:
df["keywords"] = ""

In [26]:
def keyword_func(combo_text):
    r = Rake()
    r.extract_keywords_from_text(combo_text)
    key_words_dict = r.get_word_degrees()
    return list(key_words_dict.keys())

In [27]:
df["keywords"] = df["combo_text"].progress_map(keyword_func)

HBox(children=(IntProgress(value=0, max=16995), HTML(value='')))




https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iterrows.html \
https://pypi.org/project/rake-nltk/

### Category List Creation

In [28]:
def create_cat_list(genres):
    categ = genres.split(",")
    return categ

In [29]:
df["categories"] = df["genres"].map(create_cat_list)
df.sample(5)[["genres", "categories"]]

Unnamed: 0,genres,categories
12352,"Games, Strategy","[Games, Strategy]"
14876,"Games, Strategy, Entertainment, Action","[Games, Strategy, Entertainment, Action]"
2299,"Games, Entertainment, Strategy, Adventure","[Games, Entertainment, Strategy, Adventure]"
2715,"Games, Adventure, Entertainment, Strategy","[Games, Adventure, Entertainment, Strategy]"
3163,"Games, Casual, Strategy, Entertainment","[Games, Casual, Strategy, Entertainment]"


### Save Dataframe to CSV file

In [30]:
df.to_csv("data/features_appstore_games.csv", index = False)