## **데이터 불러오기**

In [118]:
import pandas as pd
# Location of the video game sales CSV (vgames2.csv) in an S3 bucket.
url = 'https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/datasets/vgames2.csv'
# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(url)

In [119]:
# Replace every missing value with 0, across all columns.
# NOTE(review): this also writes a numeric 0 into text columns (e.g. Genre,
# Publisher) wherever they had missing values - confirm that downstream code
# treats 0 as "missing".
df = df.fillna(0)
# NOTE(review): this is not the last expression of the cell, so a notebook
# does not display it; after fillna(0) every count would be 0 anyway.
df.isnull().sum()
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
df.drop(columns='Unnamed: 0',inplace = True)

In [120]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,Candace Kane's Candy Factory,DS,2008.0,Action,Destineer,0.04,0.0,0.0,0.0
1,The Munchables,Wii,2009.0,Action,Namco Bandai Games,0.17,0.0,0.0,0.01
2,Otome wa Oanesama Boku ni Koi Shiteru Portable,PSP,2010.0,Adventure,Alchemist,0.0,0.0,0.02,0.0
3,Deal or No Deal: Special Edition,DS,2010.0,Misc,Zoo Games,0.04,0.0,0.0,0.0
4,Ben 10 Ultimate Alien: Cosmic Destruction,PS3,2010.0,Platform,D3Publisher,0.12,0.09,0.0,0.04


## **데이터 전처리(지역값 단위 통일)**

단위 : 백만(million)

Type : Float

In [121]:
# 'Do preferred game genres differ by region?'
# Select the regional sales columns by name instead of by position, so the
# selection does not depend on how many columns precede them, and take a
# copy so later conversions never write through to `df`.
country_sales = df.loc[:, ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].copy()
country_sales

Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,0.04,0,0,0
1,0.17,0,0,0.01
2,0,0,0.02,0
3,0.04,0,0,0
4,0.12,0.09,0,0.04
...,...,...,...,...
16593,0.15,0.04,0,0.01
16594,0.01,0,0,0
16595,0.44,0.19,0.03,0.13
16596,0.05,0.05,0.25,0.03


In [122]:
# NOTE(review): not referenced anywhere in the visible cells - confirm
# whether later code uses it.
lst = []
def toFloat(column) :
  """Convert a column of sales figures to floats expressed in millions.

  Each entry may be a plain number, or a string with an optional
  case-insensitive unit suffix:

  * ``'480K'`` - thousands, scaled by 0.001 to millions
  * ``'1.5M'`` - millions, suffix removed
  * ``'0.04'`` - no suffix, taken as-is

  Values that are already numeric are passed through ``float``.

  Args:
    column: A pandas Series of sales values.

  Returns:
    A new float Series with the same index as ``column``. The input is
    not modified.

  Raises:
    ValueError: If an entry cannot be parsed as a number.
  """
  def _to_millions(value) :
    # Non-string cells (already numeric) have no suffix to strip.
    if not isinstance(value, str) :
      return float(value)
    text = value.strip().lower()
    if text.endswith('k') :
      return float(text.rstrip('k')) * 0.001
    if text.endswith('m') :
      return float(text.rstrip('m'))
    return float(text)

  # map() works element-wise regardless of the index labels, unlike
  # label-based column[i] lookups which require a default RangeIndex.
  return column.map(_to_millions).astype(float)
# Run toFloat over each regional sales column, then cast the whole frame
# to float.
country_sales = country_sales.apply(toFloat).astype(float)

In [123]:
# Check the dtype of each sales column after the conversion.
country_sales.dtypes

NA_Sales       float64
EU_Sales       float64
JP_Sales       float64
Other_Sales    float64
dtype: object

## **데이터 전처리(연도)**

In [124]:
# Pull out the Year column so it can be cleaned separately.
year = df['Year']
def reset_year(data) :
  """Expand an abbreviated year to four digits and return it as an int.

  * 0 is returned unchanged (presumably the placeholder for a missing year).
  * Other values below 21 get 2000 added (e.g. 8 -> 2008).
  * Values from 21 up to, but not including, 100 get 1900 added
    (e.g. 97 -> 1997).
  * Values of 100 or more are returned unchanged.

  Args:
    data: A numeric year value.

  Returns:
    The year as an ``int``.
  """
  # Nothing to expand: the zero placeholder or a value with 3+ digits.
  if data == 0 or data >= 100 :
    return int(data)
  century = 2000 if data < 21 else 1900
  return int(data + century)
# Apply reset_year to each entry of the Year column.
year = year.apply(reset_year)

## **데이터 합치기(전처리 데이터들 종합)**

In [125]:
# Write the cleaned years back, then swap the raw sales columns for the
# float-converted ones: drop the originals and append country_sales
# column-wise.
df['Year'] = year
df = pd.concat(
    [df.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']), country_sales],
    axis=1,
)
df

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,Candace Kane's Candy Factory,DS,2008,Action,Destineer,0.04,0.00,0.00,0.00
1,The Munchables,Wii,2009,Action,Namco Bandai Games,0.17,0.00,0.00,0.01
2,Otome wa Oanesama Boku ni Koi Shiteru Portable,PSP,2010,Adventure,Alchemist,0.00,0.00,0.02,0.00
3,Deal or No Deal: Special Edition,DS,2010,Misc,Zoo Games,0.04,0.00,0.00,0.00
4,Ben 10 Ultimate Alien: Cosmic Destruction,PS3,2010,Platform,D3Publisher,0.12,0.09,0.00,0.04
...,...,...,...,...,...,...,...,...,...
16593,Ice Age 2: The Meltdown,GC,2006,Platform,Vivendi Games,0.15,0.04,0.00,0.01
16594,Rainbow Islands: Revolution,PSP,2005,Action,Rising Star Games,0.01,0.00,0.00,0.00
16595,NBA 2K16,PS3,2015,Sports,Take-Two Interactive,0.44,0.19,0.03,0.13
16596,Toukiden: The Age of Demons,PSV,2013,Action,Tecmo Koei,0.05,0.05,0.25,0.03


In [126]:
# Do preferred game genres differ by region?
# ==> hypothesis to test (presumably the null): genre and region are unrelated.
# Sum the sales of each region per genre.
df_groupby_Genre  = (df
                     .loc[:, ['Genre', 'NA_Sales','EU_Sales','JP_Sales','Other_Sales']]
                     .groupby(by=['Genre'])
                     .sum()
                     .dropna()
)
# Missing genres were filled with 0 earlier, so they form a group labelled 0.
# Drop that group by label rather than by position ([1:]), so a real genre
# is never discarded when no such placeholder group exists.
# Then transpose: rows become regions, columns become genres.
df_groupby_Genre = df_groupby_Genre.drop(index=0, errors='ignore').T
df_groupby_Genre

Genre,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
NA_Sales,874.81,105.66,223.2,408.05,446.44,123.35,357.19,319.54,581.92,182.96,680.52,68.7
EU_Sales,522.16,64.05,101.22,212.36,201.61,50.73,237.1,183.58,313.05,113.35,376.48,45.32
JP_Sales,157.9,51.95,87.34,105.27,130.65,56.87,56.4,344.95,38.28,63.7,135.37,49.46
Other_Sales,186.45,16.81,36.64,74.04,51.53,12.52,77.1,56.96,102.65,31.48,134.67,11.36


In [152]:
from scipy.stats import chi2_contingency
# Chi-square test of independence on the region x genre table of summed sales.
# The result shown below is (statistic, p-value, degrees of freedom,
# expected values). Per the SciPy docs, `correction` (Yates' continuity
# correction) is only applied when the degrees of freedom is 1, so it has
# no effect on this table (dof = 33).
# NOTE(review): the table holds sales sums (in millions), not observation
# counts; the statistic therefore depends on the unit used - confirm that
# this is the intended input for the test.
df_Relation_GenreCountry = chi2_contingency(df_groupby_Genre, correction = True)
df_Relation_GenreCountry

(696.7959761665393,
 2.508335032354097e-125,
 33,
 array([[858.96895075, 117.63393614, 221.18948701, 394.49075948,
         409.54091837, 120.10036664, 359.00869034, 446.43871862,
         510.99507046, 193.11657509, 654.61038546,  86.24614163],
        [475.6200157 ,  65.13513033, 122.47491273, 218.43362447,
         226.76705352,  66.50081847, 198.78683483, 247.19774815,
         282.94326963, 106.93065028, 362.46455886,  47.75538301],
        [251.09725564,  34.38722495,  64.65899974, 115.31912416,
         119.71864709,  35.10822183, 104.94686312, 130.50476034,
         149.37613254,  56.45261331, 191.35833857,  25.21181872],
        [155.6337779 ,  21.31370858,  40.07660052,  71.47649189,
          74.20338101,  21.76059306,  65.04761171,  80.88877289,
          92.58552738,  34.99016132, 118.6067171 ,  15.62665663]]))

## **데이터 전처리**