### Importing Modules

In [1]:
import re
import string 
import pandas as pd
from sklearn import linear_model
import seaborn as sns
import numpy as np
import re
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.inspection import partial_dependence
from sklearn.inspection import PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.pipeline import Pipeline
import requests
from bs4 import BeautifulSoup

### Import Video Game Sale Dataset

In [2]:
# publicly available from kaggle

df = pd.read_csv('Data/vgsales.csv')

### Checking Missing Values

In [3]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# get the number of missing data points per column
missing_values_count = df.isnull().sum()

# look at the # of missing points in the first ten columns
missing_values_count

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [15]:
# how many total missing values do we have?
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print('Percent of data missing: ',str(percent_missing), '%')

329
Percent of data missing:  0.18019695691704368 %


Although percent of data missing is insignificant and could be ignored, these values may be able to obtained through web scraping

### Web Scraper to Impute Missing Values

In [6]:
# web scraping from metacritic
# 'Year' and 'Publisher' values can be scraped from metacritic

In [16]:
remove = string.punctuation
remove = remove.replace("-", "").replace("&","") 
pattern = r"[{}]".format(remove)

In [8]:
# check if web scraper can connect to metacritic site

game = 'resident-evil-village'
mc_url = 'https://www.metacritic.com/game/{}/'.format(game)
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36)'}

response = requests.get(mc_url, headers=headers)

if response.status_code == 200:
    print('Successfully connected to Metacritic')
else:
    print('Could not connect')

Successfully connected to Metacritic


In [17]:
missing_info_list = []


for index, row in df.iterrows():
    try:
        if pd.isna(row.Year) or pd.isna(row.Publisher) or row.Publisher == 'Unknown':
            
            game = re.sub(pattern, "", str(row.Name)).replace(" ", "-").replace("---", "-").replace("&","and").lower()
            print(row.Name)
        
            mc_url = 'https://www.metacritic.com/game/{}/'.format(game)
            print(mc_url)
            
            response = requests.get(mc_url, headers=headers)
        
            soup = BeautifulSoup(response.text, 'html.parser')
        
        
        if pd.isna(row.Year):
            df.at[index,'Year']=int((soup.find('div', {'class': "c-gameDetails_ReleaseDate u-flexbox u-flexbox-row"})).text[-4:])

        if pd.isna(row.Publisher) or row.Publisher == 'Unknown':
            df.at[index,'Publisher']=(soup.find('div', {'class': "c-gameDetails_Distributor u-flexbox u-flexbox-row"})).text[11:]

    
    except AttributeError as err1:
        missing_info_list.append(str(row.Name))

Adventure
https://www.metacritic.com/game/adventure/
Shrek / Shrek 2 2-in-1 Gameboy Advance Video
https://www.metacritic.com/game/shrek--shrek-2-2-in-1-gameboy-advance-video/
Donkey Kong Land III
https://www.metacritic.com/game/donkey-kong-land-iii/
Air-Sea Battle
https://www.metacritic.com/game/air-sea-battle/
PES 2009: Pro Evolution Soccer
https://www.metacritic.com/game/pes-2009-pro-evolution-soccer/
RIFT
https://www.metacritic.com/game/rift/
Nicktoons Collection: Game Boy Advance Video Volume 1
https://www.metacritic.com/game/nicktoons-collection-game-boy-advance-video-volume-1/
SpongeBob SquarePants: Game Boy Advance Video Volume 1
https://www.metacritic.com/game/spongebob-squarepants-game-boy-advance-video-volume-1/
Monster Hunter 2
https://www.metacritic.com/game/monster-hunter-2/
SpongeBob SquarePants: Game Boy Advance Video Volume 2
https://www.metacritic.com/game/spongebob-squarepants-game-boy-advance-video-volume-2/
Fishing Derby
https://www.metacritic.com/game/fishing-derby

In [21]:
# how many missing values do we have after imputing?
total_cells = np.product(df.shape)
total_missing = len(missing_info_list)

# percent of data that is still missing
percent_missing = (total_missing/total_cells) * 100
print('Percent of data missing: ',str(percent_missing), '%')

Percent of data missing:  0.04765086702669544 %
