In [95]:
import pandas as pd # import for dataframe handle
from bs4 import BeautifulSoup
import requests
import math

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Functions to use in the cleaning process

In [96]:
def get_name(url):
    """Scrape a title from the page at ``url``.

    The first ``<h2 class="pageheader">`` element is preferred; when the page
    has none, the first ``<div class="apphub_AppName">`` is used instead.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of the matched element, or ``float('nan')`` when
        neither element is present.
    """
    html = requests.get(url).text
    page = BeautifulSoup(html, "html.parser")
    # Selectors are tried in order; the second lookup only runs if the first
    # one finds nothing.
    candidates = (('h2', 'pageheader'), ('div', 'apphub_AppName'))
    for tag_name, css_class in candidates:
        node = page.find(tag_name, class_=css_class)
        if node is not None:
            return node.text.strip()
    return float('nan')

def get_tags(url):
    """Scrape the tag links from the page at ``url``.

    Downloads the page, collects every ``<a class="app_tag">`` element and
    joins their stripped texts with single spaces.

    Args:
        url: Address of the page to download.

    Returns:
        A space-separated string of tag texts, or ``float('nan')`` when the
        page contains no such element.
    """
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html.parser")
    # find_all returns an empty list when nothing matches (it does not raise
    # AttributeError), so the "nothing found" case has to be checked
    # explicitly; otherwise an empty string would be returned instead of NaN.
    tags = soup.find_all('a', class_='app_tag')
    if not tags:
        return float('nan')
    return ' '.join(tag.text.strip() for tag in tags)


def get_genre(url):
    """Scrape the text of the first ``<div class="lable">`` on the page.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of that element, or ``float('nan')`` when the page
        has no such element.
    """
    html = requests.get(url).text
    page = BeautifulSoup(html, "html.parser")
    # NOTE(review): the class name 'lable' looks like a typo for 'label' --
    # confirm against the actual page markup before changing it.
    node = page.find('div', class_='lable')
    if node is None:
        return float('nan')
    return node.text.strip()


def get_dev(url):
    """Scrape the developer entry from the page at ``url``.

    Takes the first ``<div class="dev_row">``, strips its text, splits it on
    newlines and returns the third piece.

    Args:
        url: Address of the page to download.

    Returns:
        The third newline-separated piece of the element's text, or
        ``float('nan')`` when the element is missing or its text has fewer
        than three lines.
    """
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html.parser")
    try:
        return soup.find('div', class_='dev_row').text.strip().split('\n')[2]
    except (AttributeError, IndexError):
        # AttributeError: find() returned None (no such element).
        # IndexError: the element's text has fewer than three lines.
        return float('nan')


def get_review(url):
    """Scrape the review summary from the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The first line of the stripped text of the first
        ``<span class="game_review_summary">``, or ``float('nan')`` when the
        page has no such element.
    """
    html = requests.get(url).text
    page = BeautifulSoup(html, "html.parser")
    summary = page.find('span', class_='game_review_summary')
    if summary is None:
        return float('nan')
    # Keep only what precedes the first newline.
    first_line, _, _ = summary.text.strip().partition('\n')
    return first_line


def get_lang(url):
    """Scrape the language list from the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        A list obtained by splitting the stripped text of the first
        ``<span class="language_list">`` on commas (the individual items are
        not stripped), or ``float('nan')`` when the page has no such element.
    """
    html = requests.get(url).text
    page = BeautifulSoup(html, "html.parser")
    languages = page.find('span', class_='language_list')
    if languages is None:
        return float('nan')
    return languages.text.strip().split(',')


def operating_system(requirements):
    """Detect which operating systems a requirements text mentions.

    The input is converted with ``str()`` and lower-cased, then searched for
    keywords by plain substring matching (so e.g. 'mac' also matches inside
    'machine').

    Args:
        requirements: Requirements text; any value is accepted because it is
            passed through ``str()``.

    Returns:
        A space-separated string containing, in this order, 'windows',
        'macOS' and 'Linux' for each family with at least one keyword hit;
        an empty string when nothing matches.
    """
    text = str(requirements).lower()
    # (label, keywords) pairs; the order here fixes the order of the output.
    keyword_groups = (
        ('windows', ('windows',)),
        ('macOS', ('macos', 'osx', 'os x', 'mac')),
        ('Linux', ('ubuntu', 'linux', 'red hat', 'fedora', 'mint',
                   'steamos', 'chrome os')),
    )
    detected = [
        label
        for label, keywords in keyword_groups
        if any(keyword in text for keyword in keywords)
    ]
    return " ".join(detected)

# Load Dataset

In [97]:
# Read the combined raw dataset, then display the number of missing values
# in each column.
full_df = pd.read_csv('../raw_data/combine_df.csv')
full_df.isna().sum()

url                             0
types                           0
name                           14
desc_snippet                13219
recent_reviews              35315
all_reviews                  9551
release_date                  367
developer                     300
publisher                    5000
popular_tags                  133
game_details                  475
languages                      14
achievements                25827
genre                         396
game_description              101
mature_content              35124
minimum_requirements        16952
recommended_requirements    16946
original_price               3022
discount_price              26275
clean_reviews                   0
clean_date                    441
dtype: int64

# Drop unused features

In [98]:
# Columns that are removed before the rest of the cleaning.
unused_columns = [
    'recent_reviews',
    'all_reviews',
    'types',
    'release_date',
    'desc_snippet',
    'discount_price',
    'game_details',
]
full_df = full_df.drop(columns=unused_columns)

# Drop expansions and hardware

In [99]:
# Flag rows whose description marks them as add-on content or hardware
# (case-insensitive match).
# NOTE(review): rows with a missing description are flagged too (na=True) and
# therefore dropped, exactly as the `== False` comparison did -- confirm that
# this is intended.
description = full_df['game_description']
is_content = description.str.contains('About This Content', case=False, na=True)
is_hardware = description.str.contains('About This Hardware', case=False, na=True)

full_df = full_df[~(is_content | is_hardware)]

# Find the missing developers

In [100]:
# Where the developer is missing, fall back to the publisher of the same row,
# then drop the publisher column.
full_df = (
    full_df
    .assign(developer=full_df['developer'].fillna(full_df['publisher']))
    .drop(columns='publisher')
)

# Rows that still have no developer: scrape it from the page. get_dev is the
# developer scraper; get_name returns the page title, which is not a developer.
for idx in full_df.index[full_df['developer'].isnull()]:
    try:
        full_df.loc[idx, 'developer'] = get_dev(full_df.loc[idx, 'url'])
    except IndexError:
        # The page markup did not match the layout the scraper indexes into;
        # leave the value missing rather than aborting the whole cell.
        continue

# Create column for the operating systems

In [101]:
# Use the recommended requirements wherever the minimum ones are missing.
requirements = full_df['minimum_requirements'].fillna(
    full_df['recommended_requirements']
)

# Both raw requirement columns are replaced by the derived op_sys column.
full_df = full_df.drop(
    columns=['minimum_requirements', 'recommended_requirements']
)
full_df['op_sys'] = requirements.apply(operating_system)

# Get Missing names

In [102]:
# Scrape the name for every row where it is missing.
missing_name = full_df['name'].isnull()
for idx, url in full_df.loc[missing_name, 'url'].items():
    full_df.loc[idx, 'name'] = get_name(url)

# Rename Columns

In [103]:
# Old column name -> new column name.
column_renames = {
    'popular_tags': 'tags',
    'clean_reviews': 'reviews',
    'clean_date': 'date',
    'original_price': 'price',
}
full_df = full_df.rename(columns=column_renames)

# Fill missing languages with the most common value (English)

In [104]:
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form acts on an intermediate object
# and is not guaranteed to update full_df (it does not under Copy-on-Write).
full_df['languages'] = full_df['languages'].fillna('English')

# Find missing tags and genre

In [105]:
# Scrape the tags for every row where they are missing. get_tags is the tag
# scraper; get_name would store the page title in the tags column.
for idx in full_df.index[full_df['tags'].isnull()]:
    full_df.loc[idx, 'tags'] = get_tags(full_df.loc[idx, 'url'])

In [106]:
# Scrape the genre for every row where it is missing. get_genre is the genre
# scraper; get_name would store the page title in the genre column.
for idx in full_df.index[full_df['genre'].isnull()]:
    full_df.loc[idx, 'genre'] = get_genre(full_df.loc[idx, 'url'])

# Encode achievements

In [107]:
# Flag rows that have an achievements value: 1.0 when present, 0.0 when
# missing. Computed on the whole column at once instead of materialising every
# row in a Python loop; the float dtype matches what the cell-by-cell
# assignment into a new column produced.
full_df['achievement_cat'] = full_df['achievements'].notnull().astype(float)

In [108]:
# Remove the raw achievements column.
full_df = full_df.drop('achievements', axis='columns')

In [109]:
# Give the encoded flag the name 'achievements'.
full_df = full_df.rename(columns={'achievement_cat': 'achievements'})

# Encode mature content

In [110]:
# 1 when the cell holds a string, 0 for anything else (including missing values).
full_df['mature_content'] = full_df['mature_content'].map(
    lambda value: int(isinstance(value, str))
)

# Remove demos and trials, and convert price to €

In [111]:
# Cast price to str so the text checks below can call string methods on it.
full_df = full_df.astype({'price': str})

In [112]:
# True where the price text contains 'demo' (case-insensitive).
full_df['demo'] = full_df['price'].apply(lambda price: 'demo' in price.lower())

In [113]:
# Keep only the rows that are not flagged as demos.
full_df = full_df.loc[full_df['demo'].eq(False)]

In [114]:
# True where the price text contains 'trial' (case-insensitive).
full_df['trial'] = full_df['price'].apply(lambda price: 'trial' in price.lower())

In [115]:
# Keep only the rows that are not flagged as trials.
full_df = full_df.loc[full_df['trial'].eq(False)]

In [116]:
# Replace prices equal to the number 0 with the string '0'.
# NOTE(review): price was cast to str in an earlier cell, so a comparison with
# the integer 0 presumably never matches and this is a no-op -- confirm intent.
zero_priced = full_df['price'] == 0
full_df.loc[zero_priced, 'price'] = '0'

In [117]:
def price_change(value, rate=0.90):
    """Convert a dollar price string to a number scaled by ``rate``.

    Args:
        value: Price text such as ``'$9.99'``; leading and trailing ``$``
            characters are stripped before parsing.
        rate: Multiplier applied to the parsed amount. Defaults to 0.90, the
            conversion factor this notebook uses.

    Returns:
        The parsed amount multiplied by ``rate``, or ``float('NaN')`` when
        ``value`` is not a string or cannot be parsed as a number.
    """
    try:
        return float(value.strip('$')) * rate
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str (e.g. None or a float).
        # ValueError: text that is not a number (e.g. 'Free to Play').
        # Only these are caught, so KeyboardInterrupt and real bugs still surface.
        return float('NaN')

In [118]:
# Turn every price string into its converted numeric value.
full_df['price'] = full_df['price'].map(price_change)

In [119]:
# The helper flags are no longer needed once the rows have been filtered.
full_df = full_df.drop(['demo', 'trial'], axis='columns')

# Fill missing operating system info

In [131]:
# Rows where no operating system was detected hold an empty string; give them
# an explicit placeholder.
no_os_detected = full_df['op_sys'] == ''
full_df['op_sys'] = full_df['op_sys'].mask(no_os_detected, 'no info available')

# Save df to csv

In [121]:
# Write the cleaned frame to the working directory without the index column.
full_df.to_csv(path_or_buf='clean_df.csv', index=False)