## **데이터 불러오기**

In [18]:
import pandas as pd
url = 'https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/datasets/vgames2.csv'
df = pd.read_csv(url)

In [19]:
df = df.fillna(0)
df.isnull().sum()
df.drop(columns='Unnamed: 0',inplace = True)

In [20]:
df.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,Candace Kane's Candy Factory,DS,2008.0,Action,Destineer,0.04,0.0,0.0,0.0
1,The Munchables,Wii,2009.0,Action,Namco Bandai Games,0.17,0.0,0.0,0.01
2,Otome wa Oanesama Boku ni Koi Shiteru Portable,PSP,2010.0,Adventure,Alchemist,0.0,0.0,0.02,0.0
3,Deal or No Deal: Special Edition,DS,2010.0,Misc,Zoo Games,0.04,0.0,0.0,0.0
4,Ben 10 Ultimate Alien: Cosmic Destruction,PS3,2010.0,Platform,D3Publisher,0.12,0.09,0.0,0.04


## **데이터 전처리(지역값 단위 통일)**

단위 : 백만(million)

Type : Float

In [21]:
# '지역에 따라서 선호하는 게임 장르가 다를까'
country_sales = df.iloc[:,5:]
country_sales

Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,0.04,0,0,0
1,0.17,0,0,0.01
2,0,0,0.02,0
3,0.04,0,0,0
4,0.12,0.09,0,0.04
...,...,...,...,...
16593,0.15,0.04,0,0.01
16594,0.01,0,0,0
16595,0.44,0.19,0.03,0.13
16596,0.05,0.05,0.25,0.03


In [22]:
lst = []
def toFloat(column) :
  for i in range(len(column)) :
    data = column[i].lower()
    if data.find('k') != -1 : 
      column[i] = float(data.rstrip('k')) * 0.001
    elif data.find('m') != -1 :
      column[i] = data.rstrip('m')
  return column.astype(float)

def toCount(data) :
  return data*1000000
country_sales = (country_sales
                 .apply(toFloat)
                 .apply(toCount)
                 .astype(int)
)

In [23]:
country_sales.dtypes

NA_Sales       int64
EU_Sales       int64
JP_Sales       int64
Other_Sales    int64
dtype: object

## **데이터 전처리(연도)**

In [24]:
year = df['Year']
def reset_year(data) :
  if data < 1000 :
    if data == 0 : data
    elif data < 21 : data += 2000
    elif data < 100 : data += 1900
  return int(data)
year = year.apply(reset_year)

## **데이터 합치기(전처리 데이터들 종합)**

In [25]:
df['Year'] = year
df = df.drop(['NA_Sales','EU_Sales','JP_Sales','Other_Sales'],axis=1)
df = pd.concat([df, country_sales], axis=1)
df

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,Candace Kane's Candy Factory,DS,2008,Action,Destineer,40000,0,0,0
1,The Munchables,Wii,2009,Action,Namco Bandai Games,170000,0,0,10000
2,Otome wa Oanesama Boku ni Koi Shiteru Portable,PSP,2010,Adventure,Alchemist,0,0,20000,0
3,Deal or No Deal: Special Edition,DS,2010,Misc,Zoo Games,40000,0,0,0
4,Ben 10 Ultimate Alien: Cosmic Destruction,PS3,2010,Platform,D3Publisher,120000,90000,0,40000
...,...,...,...,...,...,...,...,...,...
16593,Ice Age 2: The Meltdown,GC,2006,Platform,Vivendi Games,150000,40000,0,10000
16594,Rainbow Islands: Revolution,PSP,2005,Action,Rising Star Games,10000,0,0,0
16595,NBA 2K16,PS3,2015,Sports,Take-Two Interactive,440000,190000,30000,130000
16596,Toukiden: The Age of Demons,PSV,2013,Action,Tecmo Koei,50000,50000,250000,30000


In [26]:
# 지역에 따라서 선호하는 게임 장르가 다를까 ==> 장르와 지역은 상관이 없다
# [['Genre'],['NA_Sales'],['EU_Sales'],['JP_Sales'],['Other_Sales']]
df_groupby_Genre  = (df
                     .loc[:, ['Genre', 'NA_Sales','EU_Sales','JP_Sales','Other_Sales']]
                     .groupby(by=['Genre'])
                     .sum()
                     .dropna()
)
df_groupby_Genre = df_groupby_Genre[1:].T
df_groupby_Genre

Genre,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
NA_Sales,874809996,105660000,223199998,408049999,446439999,123349999,357189997,319539999,581919997,182959998,680519996,68700000
EU_Sales,522159997,64050000,101220000,212360000,201609999,50730000,237099999,183579998,313049996,113350000,376479999,45320000
JP_Sales,157900000,51950000,87340000,105270000,130649999,56870000,56400000,344949999,38280000,63700000,135369999,49460000
Other_Sales,186449998,16810000,36640000,74039999,51530000,12520000,77100000,56960000,102650000,31480000,134669999,11360000


In [28]:
from scipy.stats import chi2_contingency
df_Relation_GenreCountry = chi2_contingency(df_groupby_Genre, correction = False)
df_Relation_GenreCountry

(696795972.6335288,
 0.0,
 33,
 array([[8.58968946e+08, 1.17633936e+08, 2.21189486e+08, 3.94490758e+08,
         4.09540917e+08, 1.20100366e+08, 3.59008688e+08, 4.46438716e+08,
         5.10995067e+08, 1.93116574e+08, 6.54610382e+08, 8.62461416e+07],
        [4.75620013e+08, 6.51351303e+07, 1.22474912e+08, 2.18433624e+08,
         2.26767053e+08, 6.65008182e+07, 1.98786834e+08, 2.47197747e+08,
         2.82943268e+08, 1.06930650e+08, 3.62464557e+08, 4.77553830e+07],
        [2.51097255e+08, 3.43872250e+07, 6.46589996e+07, 1.15319124e+08,
         1.19718647e+08, 3.51082218e+07, 1.04946863e+08, 1.30504760e+08,
         1.49376132e+08, 5.64526132e+07, 1.91358338e+08, 2.52118188e+07],
        [1.55633777e+08, 2.13137086e+07, 4.00766003e+07, 7.14764917e+07,
         7.42033807e+07, 2.17605930e+07, 6.50476113e+07, 8.08887725e+07,
         9.25855267e+07, 3.49901611e+07, 1.18606716e+08, 1.56266566e+07]]))

## **데이터 전처리**