# Extract data

This extracts the data for the user OilyBurger

In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('../src'))

import extract as ex
from dotenv import load_dotenv

load_dotenv()

owned_games = ex.get_owned_games("76561198235391392")
games = ex.get_games_info(owned_games)

games_data = []
for game in games:
    web_page = ex.get_html(game["id"])
    game_genres = ex.extract_genre(web_page)
    games_data.append({
        'Game_Id': game['id'],
        'Game_Name': game['name'],
        'Game_Genres': game_genres,
        'Playtime_Minutes': game['playtime']
        })
games_data.append({
        'Game_Id': 1111111,
        'Game_Name': "Dragon's poop",
        'Game_Genres': "NULL",
        'Playtime_Minutes': 100
        })

ModuleNotFoundError: No module named 'src'

# Cleaning the data

The data is converted into a data frame and then cleaned.

## Flattening the data

The data is first flattened so the list of genres occupy separate rows for each game

In [2]:
import pandas as pd

data = pd.DataFrame(games_data)
# Flatten the data
def flatten_data(data: pd.DataFrame):
    data = data.explode("Game_Genres")
    data.reset_index(drop=True)
    return data
    
data = flatten_data(data)
data

Unnamed: 0,Game_Id,Game_Name,Game_Genres,Playtime_Minutes
0,105600,Terraria,Action,34
0,105600,Terraria,Adventure,34
0,105600,Terraria,Indie,34
0,105600,Terraria,RPG,34
1,218680,Scribblenauts Unlimited,Adventure,90
...,...,...,...,...
61,2073850,THE FINALS,Free to Play,6767
62,1938090,Call of Duty®,Action,1792
63,1364780,Street Fighter™ 6,Action,1168
63,1364780,Street Fighter™ 6,Adventure,1168


## Removing weird values
After flattening, we now need to look at any wacky data.

In [3]:
# From this, you can see that '\n\nUnder £7\n\n\nUnder £4\n\n' is clearly an anomaly so we remove it.
# There is also a "NULL" for one of the genres so we remove that too.
data["Game_Genres"].unique()

array(['Action', 'Adventure', 'Indie', 'RPG', 'Casual', 'Strategy',
       'Free to Play', 'Massively Multiplayer', 'Simulation',
       'Animation & Modeling', 'Design & Illustration',
       'Video Production', '\n\nUnder £7\n\n\nUnder £4\n\n',
       'Photo Editing', 'Utilities', 'Early Access', 'NULL'], dtype=object)

In [6]:
data = data.drop(data[data["Game_Genres"] == "\n\nUnder £7\n\n\nUnder £4\n\n"].index)
data = data.drop(data[data["Game_Genres"] == "NULL"].index)
data["Game_Genres"].unique()


array(['Action', 'Adventure', 'Indie', 'RPG', 'Casual', 'Strategy',
       'Free to Play', 'Massively Multiplayer', 'Simulation',
       'Animation & Modeling', 'Design & Illustration',
       'Video Production', 'Photo Editing', 'Utilities', 'Early Access'],
      dtype=object)


The game genres are now clean but now we need to remove games that have 0 minutes of playtime.

In [17]:
df = data[data["Playtime_Minutes"] == 0]
df

Unnamed: 0,Game_Id,Game_Name,Game_Genres,Playtime_Minutes
17,407530,ARK: Survival Of The Fittest,Action,0
17,407530,ARK: Survival Of The Fittest,Adventure,0
17,407530,ARK: Survival Of The Fittest,Indie,0
17,407530,ARK: Survival Of The Fittest,Massively Multiplayer,0
17,407530,ARK: Survival Of The Fittest,RPG,0
17,407530,ARK: Survival Of The Fittest,Strategy,0
18,365670,Blender,Animation & Modeling,0
18,365670,Blender,Design & Illustration,0
18,365670,Blender,Video Production,0
21,261570,Ori and the Blind Forest,Action,0


In [18]:
data = data.loc[~((data["Playtime_Minutes"] == 0))].reset_index(drop=True)
df = data[data["Playtime_Minutes"] == 0]
df

Unnamed: 0,Game_Id,Game_Name,Game_Genres,Playtime_Minutes
