<a href="https://colab.research.google.com/github/Lemonfry/ds-section1-project/blob/main/section1-project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

게임 개발을 위한 데이터 분석

---


목표 : 제일 잘 팔리는 게임 만들기

가진 데이터 : 무슨 장르의 게임이 어느 년도에 어떤 기종(platform)으로 출시되었으며, 만든 회사는 어디고 북미/유럽/일본/기타 지역에서의 출고 실적(

제일 잘 팔리는 게임 기준 설정하기 :

1. 각 지역마다, 제일 많이 출고된 장르
2. 각 장르마다, 제일 많이 출고된 지역
3. 기종마다, 제일 많이 출고된 지역
4. 연도마다, 제일 많이 출고된 장르
5. 연도마다, 제일 많이 출고된 게임 기종

(분석을 위해, 출고량이 높으면 판매량도 많다고 생각하겠습니다)

진행 과정에 따른 설명은 코드를 참조해 주세요
1. 데이터 전처리(EDA)

In [1]:
# 개인 노트북에 저장한 파일 불러오기
from google.colab import files
myfile = files.upload()

Saving vgames2.csv to vgames2 (1).csv


In [2]:
import io
import pandas as pd
game_data = pd.read_csv(io.BytesIO(myfile['vgames2.csv']))

In [3]:
# 데이터셋 확인(잘 불러와졌는지)
game_data

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,1,Candace Kane's Candy Factory,DS,2008.0,Action,Destineer,0.04,0,0,0
1,2,The Munchables,Wii,2009.0,Action,Namco Bandai Games,0.17,0,0,0.01
2,3,Otome wa Oanesama Boku ni Koi Shiteru Portable,PSP,2010.0,Adventure,Alchemist,0,0,0.02,0
3,4,Deal or No Deal: Special Edition,DS,2010.0,Misc,Zoo Games,0.04,0,0,0
4,5,Ben 10 Ultimate Alien: Cosmic Destruction,PS3,2010.0,Platform,D3Publisher,0.12,0.09,0,0.04
...,...,...,...,...,...,...,...,...,...,...
16593,16594,Ice Age 2: The Meltdown,GC,2006.0,Platform,Vivendi Games,0.15,0.04,0,0.01
16594,16595,Rainbow Islands: Revolution,PSP,2005.0,Action,Rising Star Games,0.01,0,0,0
16595,16596,NBA 2K16,PS3,2015.0,Sports,Take-Two Interactive,0.44,0.19,0.03,0.13
16596,16597,Toukiden: The Age of Demons,PSV,2013.0,Action,Tecmo Koei,0.05,0.05,0.25,0.03


In [4]:
# 현재 주어진 표에서, 게임 이름이나 게임 출판사와 출고량간에 관계가 있는지는 '다음 분기 게임 설계'를 위한 분석을 할 때는 별 필요가 없다고 판단됩니다.
# (EA 등의 소위 덩치가 큰 출판사가 당연히 더 많은 게임을 출고했겠으나, 게임을 만드는 입장에선 딱히 쓸모가 없는 정보입니다.)
# unamed:0와 사용하지 않을 게임 이름(name)과 출판사(publisher) column을 제거합니다
game_data = game_data.drop(['Unnamed: 0','Name','Publisher'], axis = 1)

In [5]:
game_data

Unnamed: 0,Platform,Year,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,DS,2008.0,Action,0.04,0,0,0
1,Wii,2009.0,Action,0.17,0,0,0.01
2,PSP,2010.0,Adventure,0,0,0.02,0
3,DS,2010.0,Misc,0.04,0,0,0
4,PS3,2010.0,Platform,0.12,0.09,0,0.04
...,...,...,...,...,...,...,...
16593,GC,2006.0,Platform,0.15,0.04,0,0.01
16594,PSP,2005.0,Action,0.01,0,0,0
16595,PS3,2015.0,Sports,0.44,0.19,0.03,0.13
16596,PSV,2013.0,Action,0.05,0.05,0.25,0.03


In [6]:
# Sales 항목의 수를 M(백만) 단위로 맞추기 : 여러 게임 관련 기사를 볼 때, 출고량의 기본 단위는 M이며 단위가 누락된 게임들도 M 단위 출고량에 더 가까운 것으로 보입니다.
# (단위 없는 판매량 중 1985 슈퍼마리오의 출고량이 41.49인데, 가디언지 온라인 기사에서 4000만장(40M) 팔렸단 기사로 볼때 M으로 계산 가능할 것으로 생각됨)
# https://www.theguardian.com/technology/gamesblog/2010/sep/13/games-gameculture, 10번항목
# 결론 : 단위 없으면 M으로 계산, K 단위는 M으로 변환합니다.

In [7]:
# M단위 통일을 위해 M단위는 문자만 없애고, K단위 발견시 K는 없애고 0.001을 곱합니다.
game_data.NA_Sales = (game_data.NA_Sales.replace(r'[KM]+$', '', regex=True).astype(float)*game_data.NA_Sales.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [0.001, 1]).astype(float))
game_data.EU_Sales = (game_data.EU_Sales.replace(r'[KM]+$', '', regex=True).astype(float)*game_data.EU_Sales.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [0.001, 1]).astype(float))
game_data.JP_Sales = (game_data.JP_Sales.replace(r'[KM]+$', '', regex=True).astype(float)*game_data.JP_Sales.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [0.001, 1]).astype(float))
game_data.Other_Sales = (game_data.Other_Sales.replace(r'[KM]+$', '', regex=True).astype(float)*game_data.Other_Sales.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [0.001, 1]).astype(float))

In [8]:
game_data

Unnamed: 0,Platform,Year,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,DS,2008.0,Action,0.04,0.00,0.00,0.00
1,Wii,2009.0,Action,0.17,0.00,0.00,0.01
2,PSP,2010.0,Adventure,0.00,0.00,0.02,0.00
3,DS,2010.0,Misc,0.04,0.00,0.00,0.00
4,PS3,2010.0,Platform,0.12,0.09,0.00,0.04
...,...,...,...,...,...,...,...
16593,GC,2006.0,Platform,0.15,0.04,0.00,0.01
16594,PSP,2005.0,Action,0.01,0.00,0.00,0.00
16595,PS3,2015.0,Sports,0.44,0.19,0.03,0.13
16596,PSV,2013.0,Action,0.05,0.05,0.25,0.03


2. 항목분석

(1) 지역별 선호 게임 장르

In [None]:
# 장르 항목의 중복을 제거하고 몇종류가 있는지를 파악하고, 각 장르마다 각 지역에서 얼마나 출고되었는지를 확인합니다.

In [9]:
# 중복되지 않는 장르 목록을 확인합니다.
print(game_data['Genre'].unique())

['Action' 'Adventure' 'Misc' 'Platform' 'Sports' 'Simulation' 'Racing'
 'Role-Playing' 'Puzzle' 'Strategy' 'Fighting' 'Shooter' nan]


In [14]:
# 장르별로 4개 지역에서의 출고량을 확인합니다.
#NA_Sales_Action = game_data.loc[game_data['Genre'] == 'Action', 'NA_Sales'].sum()
#NA_Sales_Adventure = game_data.loc[game_data['Genre'] == 'Adventure', 'NA_Sales'].sum()
#NA_Sales_Misc = game_data.loc[game_data['Genre'] == 'Misc', 'NA_Sales'].sum()
#NA_Sales_Platform = game_data.loc[game_data['Genre'] == 'Platform', 'NA_Sales'].sum()
#NA_Sales_Sports = game_data.loc[game_data['Genre'] == 'Sports', 'NA_Sales'].sum()
#NA_Sales_Simulation = game_data.loc[game_data['Genre'] == 'Simulation', 'NA_Sales'].sum()
#NA_Sales_Racing = game_data.loc[game_data['Genre'] == 'Racing', 'NA_Sales'].sum()
#NA_Sales_Role_Playing = game_data.loc[game_data['Genre'] == 'Role-Playing', 'NA_Sales'].sum()
#NA_Sales_Puzzle = game_data.loc[game_data['Genre'] == 'Puzzle', 'NA_Sales'].sum()
#NA_Sales_Strategy = game_data.loc[game_data['Genre'] == 'Strategy', 'NA_Sales'].sum()
#NA_Sales_Fighting = game_data.loc[game_data['Genre'] == 'Fighting', 'NA_Sales'].sum()
#NA_Sales_Shooter = game_data.loc[game_data['Genre'] == 'Shooter', 'NA_Sales'].sum()

In [20]:
import numpy as np

Genre_list = game_data['Genre'].unique()
Genre_list = Genre_list[~np.isnan(Genre_list)]
Genre_list

TypeError: ignored