# Scraped Data Processing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the scraped dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="test_data.csv")
df.head(n=5)

Unnamed: 0,name,genres,bundle_id
0,Nioh 2 – The Complete Edition,"Action, RPG",-1
1,Rise of the Ronin,"Action, Adventure, RPG",-1
2,NINJA GAIDEN 2 Black,"Action, Adventure",-1
3,The Surge,"Action, RPG",-3
4,The Surge 2,"Action, RPG",-3


In [5]:
# Turn each comma-separated genre string into a list of genre names.
# The isinstance guard keeps this cell idempotent: on a re-run the values are
# already lists (which the `.str` accessor would turn into NaN), so they are
# passed through untouched, as are missing values.
# A blank field still becomes [""], exactly as str.split does on an empty string.
df["genres"] = df["genres"].map(
    lambda raw: raw.strip().split(", ") if isinstance(raw, str) else raw
)
df

Unnamed: 0,name,genres,bundle_id
0,Nioh 2 – The Complete Edition,"[Action, RPG]",-1
1,Rise of the Ronin,"[Action, Adventure, RPG]",-1
2,NINJA GAIDEN 2 Black,"[Action, Adventure]",-1
3,The Surge,"[Action, RPG]",-3
4,The Surge 2,"[Action, RPG]",-3
...,...,...,...
11678,Inscryption,"[Adventure, Indie, Strategy]",-2876
11679,Pony Island,[Indie],-2876
11680,The Hex,[Indie],-2876
11681,Inscryption,"[Adventure, Indie, Strategy]",-2877


In [None]:
# Optional export of the processed frame; left commented out, so nothing is written.
# NOTE(review): `genres` holds Python lists by this point, so the CSV would store
# their string representation — confirm that is intended before re-enabling.
# df.to_csv("new_bundle.csv", index=False)

In [9]:
import plotly.express as px

## Genres

In [36]:
# Flatten the per-item genre lists into one long list of genre labels.
# `explode` does this in a single linear pass; summing the lists instead
# concatenates them pairwise, rebuilding an ever-growing list for every row.
# `dropna` discards the NaN placeholder that `explode` emits for an empty list,
# so the result matches plain list concatenation.
genres = df["genres"].explode().dropna().tolist()

# One bar per genre label; an item with several genres is counted once per genre.
fig = px.histogram(genres, nbins=100)
fig.update_layout(
    # Order the bars from the most to the least frequent genre.
    xaxis={
        "categoryorder": "total descending"
    },
    title="Games by Genre<br><sup>Items may include no genre (e.g. music) or multiple genres</sup>",
    xaxis_title="Genre",
    yaxis_title="Number of Items",
    showlegend=False
)
fig.show()

## Genre-less Soundtracks

In [49]:
# Keep only the rows whose genre list contains the empty string, i.e. items
# whose genre field was blank before splitting.
df.loc[df["genres"].map(lambda tags: "" in tags)]

Unnamed: 0,name,genres,bundle_id
984,Sniper Ghost Warrior Contracts 2 Soundtrack,[],-56
1129,ELEX II Soundtrack,[],-90
1142,ELEX II Soundtrack,[],-93
1174,Valheim Soundtrack,[],-106
1191,ARK: Survival Evolved Original Soundtrack,[],-112
...,...,...,...
11599,Return of the Obra Dinn - Soundtrack,[],-2852
11649,Darkest Dungeon® II: The Soundtrack,[],-2866
11655,Darkest Dungeon® II: The Soundtrack,[],-2869
11667,Darkest Dungeon® II: The Soundtrack,[],-2870


## Distribution of Genres per Item

In [68]:
# Work on a copy so the genre lists stored in `df` are left as they are.
item_df = df.copy()
# Any list containing the blank "" entry is replaced by an empty list, so such
# items count as having zero genres.
item_df["genres"] = item_df["genres"].map(lambda tags: [] if "" in tags else tags)
# Percentage of all items for each distinct number of genres.
num_item_genres = item_df["genres"].map(len).value_counts() * 100 / len(item_df)

fig = px.bar(num_item_genres)
fig.update_layout(
    title="Genres per Item",
    # Linear tick mode gives evenly spaced ticks along the genre-count axis.
    xaxis={"title": "Number of Genres", "tickmode": "linear"},
    yaxis_title="% of items",
    showlegend=False,
)
fig.show()