전처리

In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mplcursors # Used to create a cursor-interactive plot together with "%matplotlib notebook"
from sklearn.decomposition import NMF # Used for training Non-negative Matrix Factorization
from sklearn.utils.extmath import randomized_svd # Used for computing a Singular Value Decomposition
from sklearn.manifold import TSNE # Used for training a t-SNE manifold embedding

plt.style.use('ggplot') # Any other matplotlib style can be used here as well

# For static plots only (quick checks), enable this option instead
# %matplotlib inline

# For interactive plots.
# With this option a plot stays at the position where it was first drawn
%matplotlib notebook

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings('ignore')

예제 데이터셋 불러오기
데이터셋 출처 https://www.kaggle.com/datasets/deepcontractor/top-video-games-19952021-metacritic
데이터셋 1995년부터 2021년까지의 자료
all_games.csv의 칼럼
name, platform, release_date, summary(게임 요약), meta_score(전문가 스코어, 100점 만점), user_review(유저 스코어, 10점 만점)


In [10]:
# Folder holding the Kaggle Metacritic export.
# NOTE: `dir` shadows the Python built-in of the same name; the name is kept
# because it is a notebook-level variable.
dir = './dataset/'

# Read only the columns listed in `usecols` from all_games.csv.
df_games = pd.read_csv(
    dir + 'all_games.csv',
    usecols=['name', 'platform', 'release_date', 'meta_score', 'user_review'],
)

게임수 파악
전체 18,800개 row 중 고유한 게임 이름은 12,254개이며, 나머지 6,546개 row는 이름이 같은 게임(플랫폼만 다른 동일 게임)임

In [11]:
# n_games: number of rows in the name column.
n_games = df_games['name'].size
# n_game_same_name: number of distinct names (a missing value counts as one
# distinct entry, exactly like len(Series.unique())).
n_game_same_name = df_games['name'].nunique(dropna=False)
# Show (rows, distinct names, rows whose name already appeared elsewhere).
(n_games, n_game_same_name, n_games - n_game_same_name)

(18800, 12254, 6546)

이름이 없는 row는 존재하지 않음

In [12]:
# Count the rows whose name is missing.
# len(df_games) and len(df_games['name']) are always equal, so comparing them
# can never reveal a missing name; the null check below actually does.
int(df_games['name'].isna().sum())

0

칼럼 타입 확인하기

In [13]:
# Show the dtype of every loaded column.
df_games.dtypes

name            object
platform        object
release_date    object
meta_score       int64
user_review     object
dtype: object

user_review 칼럼에 tbd(미정) 값이 존재하는지 확인

In [6]:
# Display the frame with its text columns cast to the pandas 'string' dtype.
# The converted frame is only displayed, not assigned: df_games is unchanged.
df_games.astype(
    dict.fromkeys(['name', 'platform', 'release_date', 'user_review'], 'string')
)

Unnamed: 0,name,platform,release_date,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",98,7.9
...,...,...,...,...,...
18795,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",22,1.3
18796,Drake of the 99 Dragons,Xbox,"November 3, 2003",22,1.7
18797,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015",21,2.9
18798,Infestation: Survivor Stories (The War Z),PC,"October 15, 2012",20,1.7


user_review 칼럼에서 tbd 값이 포함된 row를 어떻게 처리할 것인가?

일단 0.0점으로 변경 (사용할 알고리즘에 따라 달라질 듯)

In [7]:
# Overwrite the 'tbd' placeholder in user_review with the string '0.0'
# (modifies df_games in place).
df_games.loc[df_games['user_review'].eq('tbd'), 'user_review'] = '0.0'

tbd 값 처리 후 데이터 타입을 float으로 변경

In [8]:
# DataFrame.astype returns a new frame instead of modifying df_games, so the
# result has to be assigned back; otherwise user_review keeps its object dtype.
df_games = df_games.astype({'user_review': 'float'})
# Display the converted frame.
df_games

Unnamed: 0,name,platform,release_date,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",98,7.9
...,...,...,...,...,...
18795,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",22,1.3
18796,Drake of the 99 Dragons,Xbox,"November 3, 2003",22,1.7
18797,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015",21,2.9
18798,Infestation: Survivor Stories (The War Z),PC,"October 15, 2012",20,1.7


크롤링한 데이터 전처리

In [48]:
def datecut(x):
    """Return the last four characters of a release-date string (the year).

    Parameters
    ----------
    x : str
        Release date text ending in a four-digit year,
        e.g. "November 23, 1998".

    Returns
    -------
    The last four characters of ``x``. Values that cannot be sliced
    (for example the float NaN or None that pandas uses for a missing date)
    are returned unchanged instead of raising ``TypeError``.
    """
    try:
        return x[-4:]
    except TypeError:
        # Missing release date (NaN / None): keep it as a missing value.
        return x
# Folder with the crawled per-platform CSV files and the platforms to merge.
dir2="../data/"
pl=["ps4","switch","pc","ps5","xboxone","xbox-series-x"]

platform_frames=[]
for x in pl:
    df_games2=pd.read_csv(dir2+x+'.csv')
    # Convert the score to float first. 'tbd', 'None' and blank cells all become
    # NaN here, including the ones read_csv already parsed as missing values
    # (those never matched the string comparison used before and stayed unfilled).
    userscore=pd.to_numeric(df_games2['userscore'],errors='coerce').astype('float')
    # Impute missing scores with the mean score of this platform's rated games.
    df_games2['userscore']=userscore.fillna(userscore.mean())
    df_games2['platform']=x
    # Number of scores still missing after imputation (0 is expected). Comparing
    # the float column with 'tbd' would always report 0, even if values were missing.
    print(int(df_games2['userscore'].isna().sum()))
    platform_frames.append(df_games2)
    # NOTE(review): this overwrites the crawled input file with the imputed scores
    # and the added platform column, so the raw data is not kept on disk.
    df_games2.to_csv(dir2+x+'.csv',sep=',',float_format='%.2f',index=False)

# Concatenate once instead of growing the frame inside the loop.
df_crawl_game=pd.concat(platform_frames)
# Keep the columns of interest; .copy() makes the column assignment below act on
# an independent frame rather than on a slice of the concatenated one.
df_crawl_game=df_crawl_game[['title','metascore','userscore','genres','publisher','developer','release_date','summary','rating','platform']].copy()
# Reduce the release date to its last four characters (the year).
df_crawl_game['release_date']=df_crawl_game['release_date'].apply(datecut)

0
0
0
0
0
0


최종 합쳐진 데이터셋 확인

In [49]:
# Display the frame that combines all platform files.
df_crawl_game

Unnamed: 0,title,metascore,userscore,genres,publisher,developer,release_date,summary,rating,platform
0,Surgeon Simulator: Experience Reality,43,4.40,Simulation|Virtual|Career|,Bossa Studios,Bossa Studios,2016,This game is ridiculously fun once you've plac...,T,ps4
1,Transformers: Rise of the Dark Spark,43,4.20,Action Adventure|Sci-Fi|General|,Activision,Edge of Reality,2014,Fight your way through both Earth and Cybertro...,T,ps4
2,NBA Live 14,43,2.60,Arcade|Sports|Traditional|Team|Basketball|Arcade|,Electronic Arts,EA Sports,2013,BounceTek -- Revolutionary physics-based dribb...,E,ps4
3,WWE 2K20,43,1.60,Sports|Individual|Combat|Wrestling|,2K Games,Visual Concepts,2019,F the haters. This game is still fun. And it’...,T,ps4
4,VR Karts,42,5.60,Racing|Arcade|Other|,Viewpoint Games,Viewpoint Games,2017,The checkered flag is about to drop on a new V...,E,ps4
...,...,...,...,...,...,...,...,...,...,...
209,Road 96,78,6.70,Adventure|3D|First-Person|,Merge Games,Digixart Entertainment,2022,"<span class=""blurb blurb_collapsed"">Summer 199...",T,xbox-series-x
210,WRC 9 FIA World Rally Championship,78,6.00,Racing|Simulation|Automobile|,Nacon,KT Racing,2020,"<span class=""blurb blurb_collapsed"">The game r...",E,xbox-series-x
211,Shadow Warrior 3,78,6.00,Action|First-Person|Shooter|Arcade|,Devolver Digital,Flying Wild Hog,2022,Shadow Warrior 3 launches the offbeat first-pe...,,xbox-series-x
212,Blast Brigade vs. the Evil Legion of Dr. Cread,78,6.53,Action|Platformer|2D|Metroidvania|,MY.GAMES,Allods Team Arcade,2022,"<span class=""blurb blurb_collapsed"">Blast Brig...",E10+,xbox-series-x


summary 칼럼에 HTML 태그가 남아 있는 문제

In [50]:
# Replace every <...> tag in the summary text with a single space.
df_crawl_game['summary'] = (
    df_crawl_game['summary']
    .str.replace(r'<[^>]*>', r' ', regex=True)
)

In [51]:
# Display the frame again to inspect the summary column.
df_crawl_game

Unnamed: 0,title,metascore,userscore,genres,publisher,developer,release_date,summary,rating,platform
0,Surgeon Simulator: Experience Reality,43,4.40,Simulation|Virtual|Career|,Bossa Studios,Bossa Studios,2016,This game is ridiculously fun once you've plac...,T,ps4
1,Transformers: Rise of the Dark Spark,43,4.20,Action Adventure|Sci-Fi|General|,Activision,Edge of Reality,2014,Fight your way through both Earth and Cybertro...,T,ps4
2,NBA Live 14,43,2.60,Arcade|Sports|Traditional|Team|Basketball|Arcade|,Electronic Arts,EA Sports,2013,BounceTek -- Revolutionary physics-based dribb...,E,ps4
3,WWE 2K20,43,1.60,Sports|Individual|Combat|Wrestling|,2K Games,Visual Concepts,2019,F the haters. This game is still fun. And it’...,T,ps4
4,VR Karts,42,5.60,Racing|Arcade|Other|,Viewpoint Games,Viewpoint Games,2017,The checkered flag is about to drop on a new V...,E,ps4
...,...,...,...,...,...,...,...,...,...,...
209,Road 96,78,6.70,Adventure|3D|First-Person|,Merge Games,Digixart Entertainment,2022,Summer 1996. Today is the day! You hit the ro...,T,xbox-series-x
210,WRC 9 FIA World Rally Championship,78,6.00,Racing|Simulation|Automobile|,Nacon,KT Racing,2020,The game runs nicely 60fps xbox series S- The...,E,xbox-series-x
211,Shadow Warrior 3,78,6.00,Action|First-Person|Shooter|Arcade|,Devolver Digital,Flying Wild Hog,2022,Shadow Warrior 3 launches the offbeat first-pe...,,xbox-series-x
212,Blast Brigade vs. the Evil Legion of Dr. Cread,78,6.53,Action|Platformer|2D|Metroidvania|,MY.GAMES,Allods Team Arcade,2022,Blast Brigade vs. The Evil Legion of Dr. Crea...,E10+,xbox-series-x


특수문자 제거

In [52]:
# Replace every non-word character (anything not matched by \w) with a space.
# Each character is replaced one by one, so runs of punctuation become runs of spaces.
df_crawl_game['summary'] = (
    df_crawl_game['summary']
    .str.replace(r'[^\w]', r' ', regex=True)
)
# Display the cleaned frame.
df_crawl_game

Unnamed: 0,title,metascore,userscore,genres,publisher,developer,release_date,summary,rating,platform
0,Surgeon Simulator: Experience Reality,43,4.40,Simulation|Virtual|Career|,Bossa Studios,Bossa Studios,2016,This game is ridiculously fun once you ve plac...,T,ps4
1,Transformers: Rise of the Dark Spark,43,4.20,Action Adventure|Sci-Fi|General|,Activision,Edge of Reality,2014,Fight your way through both Earth and Cybertro...,T,ps4
2,NBA Live 14,43,2.60,Arcade|Sports|Traditional|Team|Basketball|Arcade|,Electronic Arts,EA Sports,2013,BounceTek Revolutionary physics based dribb...,E,ps4
3,WWE 2K20,43,1.60,Sports|Individual|Combat|Wrestling|,2K Games,Visual Concepts,2019,F the haters This game is still fun And it ...,T,ps4
4,VR Karts,42,5.60,Racing|Arcade|Other|,Viewpoint Games,Viewpoint Games,2017,The checkered flag is about to drop on a new V...,E,ps4
...,...,...,...,...,...,...,...,...,...,...
209,Road 96,78,6.70,Adventure|3D|First-Person|,Merge Games,Digixart Entertainment,2022,Summer 1996 Today is the day You hit the ro...,T,xbox-series-x
210,WRC 9 FIA World Rally Championship,78,6.00,Racing|Simulation|Automobile|,Nacon,KT Racing,2020,The game runs nicely 60fps xbox series S The...,E,xbox-series-x
211,Shadow Warrior 3,78,6.00,Action|First-Person|Shooter|Arcade|,Devolver Digital,Flying Wild Hog,2022,Shadow Warrior 3 launches the offbeat first pe...,,xbox-series-x
212,Blast Brigade vs. the Evil Legion of Dr. Cread,78,6.53,Action|Platformer|2D|Metroidvania|,MY.GAMES,Allods Team Arcade,2022,Blast Brigade vs The Evil Legion of Dr Crea...,E10+,xbox-series-x


In [53]:
# Write the merged, cleaned frame to all_game.csv in the crawl data folder.
df_crawl_game.to_csv(
    dir2+'all_game.csv',
    sep=',',
    float_format='%.2f',  # floats are written with two decimals
    index=False,          # the row index is not written
)