In [None]:
# Upload file

from google.colab import files

# Keep the value returned by files.upload() (a dict keyed by file name whose
# values are the uploaded file contents) in a variable, so the raw contents
# are not echoed as the cell output; display only the uploaded file names.
uploaded = files.upload()
list(uploaded)

In [None]:
# Dataframe: load the raw CSV and build the cleaned working frame
import pandas as pd

file_data = "/content/vgames2.csv"

# Rows lacking a Year or a Genre are discarded (missing values in other
# columns are kept), the "Unnamed: 0" column is removed and the index is
# renumbered from 0.
data = (
    pd.read_csv(file_data)
    .dropna(subset=["Year", "Genre"])
    .drop(columns="Unnamed: 0")
    .reset_index(drop=True)
)

data.head()

In [None]:
# Feature: individual columns pulled out of the cleaned frame

name = data.Name
platform = data.Platform
year = data.Year.astype(int)
genre = data.Genre
publisher = data.Publisher
# Take an independent copy of the sales columns (everything from "NA_Sales"
# to the last column). Later cells assign into `sales`; doing that on a bare
# slice of `data` triggers pandas' SettingWithCopyWarning.
sales = data.loc[:, "NA_Sales":].copy()

In [None]:
# Data preprocessing

def unit_to_num(val):
  """Convert a sales figure that may carry a K/M suffix into a float.

  Args:
    val: A string such as "0.04", "480K" or "1.5M" (the suffix may be in
      either case and surrounded by whitespace), or a value that is already
      numeric.

  Returns:
    The value as a float, multiplied by 10**3 for "K" and by 10**6 for "M".
    Strings without a suffix and numeric inputs are returned unscaled.

  Raises:
    ValueError: If the text left after removing the suffix is not a number.
  """
  # NOTE(review): un-suffixed values pass through unscaled, so they are
  # assumed to be in the same unit as the expanded K/M values -- confirm
  # against the dataset.
  if not isinstance(val, str):
    # Already numeric (e.g. a column that pandas parsed as float); the
    # substring tests below only work on strings.
    return float(val)

  text = val.strip().upper()
  scale = 1
  if 'K' in text:
    text = text.replace('K', '')
    scale = pow(10, 3)
  elif 'M' in text:
    text = text.replace('M', '')
    scale = pow(10, 6)
  return float(text) * scale

def num_to_year(year):
  """Expand an abbreviated year into a full year.

  Values of 1000 or more are returned unchanged. Smaller values are shifted
  into a century: 1900 is added when the last two digits are greater than 70,
  otherwise 2000 is added.
  """
  if not year < 1000:
    return year

  century = 1900 if (year % 100) > 70 else 2000
  return century + year

def group_platform(platform):
  """Map a platform code to its console family.

  Args:
    platform: A platform code such as "PS4", "X360" or "SNES".

  Returns:
    "Nintendo", "PlayStation" or "Xbox" for codes listed under that family,
    the input itself when it equals "PC", and "Etc" for anything else.
  """
  families = {
      # SNES is listed alongside NES and N64 so that it is not counted as "Etc".
      "Nintendo": ('DS', 'Wii', 'GBA', 'GC', '3DS', 'GB', 'NES', 'SNES', 'N64',
                   'WiiU'),
      "PlayStation": ('PSP', 'PS3', 'PS', 'PS4', 'PS2', 'PSV'),
      "Xbox": ('XB', 'X360', 'XOne'),
  }

  for family, members in families.items():
    if platform in members:
      return family

  # PC is kept as its own group; every remaining platform is pooled.
  return platform if platform == "PC" else "Etc"


# Both Series are derived from `data` rather than from their own previous
# value, so re-running this cell gives the same result. Grouping an already
# grouped platform Series would turn every family name into "Etc".

# Normalise every year to four digits.
year = data.Year.astype(int).apply(num_to_year)

# Collapse individual platforms into their console families.
platform = data.Platform.apply(group_platform)

In [None]:
# df = pd.concat([name, year, sales], axis = 1)
# df.iloc[72]

In [None]:
# Preferred genres by region

# Convert the sales strings to numbers in a new frame instead of overwriting
# `sales`, so the cell can be re-run from the same inputs.
sales_numeric = sales.apply(lambda column: column.map(unit_to_num))
df = pd.concat([genre, sales_numeric], axis=1)

# Sum of sales per genre. sort=False keeps the genres in order of first
# appearance. Using groupby also avoids a loop variable named `genre`, which
# would replace the `genre` Series with a plain string.
df_res = df.groupby("Genre", sort=False).sum().reset_index()

# List of genres, in the same order as the rows of df_res.
list_genre = df_res["Genre"].tolist()

df_res

In [None]:
# Plotting a grouped horizontal bar chart: sales per genre, one bar per region

from matplotlib import pyplot as plt
import numpy as np

ind = np.arange(len(df_res))
width = 0.2

# (column in df_res, bar colour, legend label) for each region, in draw order.
regions = [
    ("NA_Sales", "navy", "NA"),
    ("EU_Sales", "khaki", "EU"),
    ("JP_Sales", "red", "JP"),
    ("Other_Sales", "pink", "Other"),
]

fig, ax = plt.subplots(figsize=(6, 8))
for position, (column, color, label) in enumerate(regions):
  # Shift each region by one bar width so the bars of a genre sit side by side.
  ax.barh(ind + width * position, df_res[column], width, color=color, label=label)

# The y axis lists genres; the regions are identified by the legend.
ax.set_ylabel("Genre")
ax.set_xlabel("Sales(1 = 10 Billion)")
# Put each tick at the centre of its group of bars.
ax.set_yticks(ind + width * (len(regions) - 1) / 2)
ax.set_yticklabels(np.array(df_res.Genre))
ax.set_title("Sales per genre by region")
ax.legend()

plt.show()

In [None]:
# Trend analysis of games by year

genre_2 = data.Genre

# Total sales per game: convert every sales column to numbers, then add the
# columns up row by row. The conversion builds a new frame, so nothing is
# assigned into a slice of `data`.
sales_2 = (
    data.loc[:, "NA_Sales":]
    .apply(lambda column: column.map(unit_to_num))
    .sum(axis=1)
)

df_2 = pd.concat([year, platform, genre_2, sales_2], axis=1)
df_2.columns = ["Year", "Platform", "Genre", "Sales"]
df_2

In [None]:
# 'Year'를 연도별로 묶음

def group_years(year):
  """Return the decade label (e.g. "1990s") that a year belongs to.

  Args:
    year: A four-digit year as an int (or a value accepted by int()).

  Returns:
    The decade as a string: the year rounded down to a multiple of ten,
    followed by "s".
  """
  # Floor division yields the correct decade for any year. A chain of
  # `year % <decade start> < 10` tests only covers the decades it lists and
  # puts every other year (e.g. 1975) into its final catch-all label.
  decade = (int(year) // 10) * 10
  return "{}s".format(decade)

# Derive the decade labels from the numeric `year` Series (which shares its
# index with df_2) rather than from df_2.Year itself, so re-running this cell
# does not feed already-converted labels back into group_years.
df_2["Year"] = year.apply(group_years)
df_2

In [None]:
# Counting 'platform' and 'genre' per decade (distribution)
count_p = pd.crosstab(df_2.Year, df_2.Platform)
count_g = pd.crosstab(df_2.Year, df_2.Genre)

# Exclude the "2020s" row from both tables. errors="ignore" keeps the cell
# working when the data contains no such row.
count_p = count_p.drop(index="2020s", errors="ignore")
count_g = count_g.drop(index="2020s", errors="ignore")

count_p, count_g

In [None]:
# Plotting line charts (Platform)

# Create the figure before drawing. Calling plt.figure() after the plot calls
# opens a second, empty figure and leaves the chart at the default size.
fig, ax = plt.subplots(figsize=(8, 6))
for PF in count_p.columns:
  # One line per platform group, using every row of the table as x positions.
  ax.plot(count_p.index, count_p[PF], marker='o', label=PF)
ax.set_xlabel("Decade")
ax.set_ylabel("Number of games")
ax.set_title("Number of games per platform group by decade")
ax.legend(loc='upper left')
plt.show()

In [None]:
# Plotting line charts (Genre)

# Create the figure before drawing. Calling plt.figure() after the plot calls
# opens a second, empty figure and leaves the chart at the default size.
fig, ax = plt.subplots(figsize=(8, 6))
for G in count_g.columns:
  # One line per genre, using every row of the table as x positions.
  ax.plot(count_g.index, count_g[G], marker='o', label=G)
ax.set_xlabel("Decade")
ax.set_ylabel("Number of games")
ax.set_title("Number of games per genre by decade")
ax.legend(loc='upper left')
plt.show()

In [None]:
# Analysis of the games with the highest sales

# One row per game, ordered from the highest to the lowest total sales.
df_3 = pd.concat(
    [name, year, platform, genre_2, sales_2],
    axis=1,
    keys=["Name", "Year", "Platform", "Genre", "Sales"],
).sort_values(by="Sales", ascending=False)

# Replace each year by its decade label.
df_3["Year"] = df_3["Year"].map(group_years)

# Pick the 50 best-selling games released in the 2000s or the 2010s.
condition = df_3["Year"].isin(["2000s", "2010s"])
top_games = df_3.loc[condition].head(50)
top_games

In [None]:
# Number of top-50 games per (decade, platform group) row and genre column.
# Every input is taken from top_games, so the table does not rely on index
# alignment against the full df_2 frame, and no margins are computed only to
# be dropped again.
top_21th = pd.crosstab([top_games.Year, top_games.Platform], top_games.Genre)
top_21th

In [None]:
import seaborn as sns

# Size the figure up front and draw the heatmap onto its axes. Changing
# seaborn's rc figure size after drawing does not resize an existing figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(top_21th, annot=True, fmt='d', ax=ax)
ax.set_title("Heatmap of Game Sales of Top 50 in 2000 ~ 2019")
# plt.show must be called; the bare attribute `plt.show` does nothing.
plt.show()