#### Feature Engineering  

Dataset: 
- _videogames_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-05-29

# Feature engineering – Videogames Dataset

## __1. Libraries__

In [2]:
from pathlib import Path
import sys

# Resolve the project root dynamically: take the current working directory and move one level up
# (assumes the kernel's working directory is the folder that contains this notebook)
project_root = Path.cwd().parent

# Add the project root to sys.path (if it is not already there) so that the `src` package can be imported
if str(project_root) not in sys.path:

    sys.path.append(str(project_root))

# Wildcard import: brings every public name exposed by `src` into the notebook namespace.
# NOTE(review): the helpers called below (load_dataset_from_csv, format_notebook, cast_datatypes)
# presumably come from here; importing them by name would make their origin explicit.
from src import *

from functools import partial
from IPython.display import display, HTML
import numpy as np
import os
import pandas as pd

## __2. Path to Data file__

In [3]:
# Point at the folder holding the cleaned data and load the videogames CSV from it
data_file_path = project_root.joinpath("data", "processed", "clean")
df_vg = load_dataset_from_csv(
    data_file_path,
    "videogames_clean.csv",
    sep=',',
    header='infer',
)

In [4]:
# Format notebook output by calling the project's format_notebook() helper
# (the helper is defined outside this notebook; the exact display options it sets are not visible here)
format_notebook()

## __3. Functions__

In [5]:
# Function for calculating ...

## __4. Casting to data types__

In [6]:
# Cast the columns to their working dtypes with the project's cast_datatypes helper
# (per the original note it lives in features.py); the returned frame replaces df_vg.
# The resulting dtypes can be checked with df_vg.info() further down.
df_vg = cast_datatypes(df_vg)

In [7]:
# Rescale the regional sales figures for nicer plots: the source values are expressed in
# millions, so multiplying by 1000 expresses them in thousands.
# NOTE: this overwrites the columns of df_vg, so running the cell again multiplies by 1000 once more.
sales_columns = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales']
df_vg[sales_columns] = df_vg[sales_columns].mul(1000)

## 5. Feature Engineering

### 5.1 Datasets

#### 5.1.1 Videogames dataset

In [8]:
# Preview the prepared dataset (the rendered table is truncated to its first and last rows)
df_vg

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
0,wii_sports,wii,2006.0,sports,41360.0,28960.0,3770.0,8450.0,76.0,8.00,E
1,super_mario_bros,nes,1985.0,platform,29080.0,3580.0,6810.0,770.0,,,
2,mario_kart_wii,wii,2008.0,racing,15680.0,12760.0,3790.0,3290.0,82.0,8.30,E
3,wii_sports_resort,wii,2009.0,sports,15610.0,10930.0,3280.0,2950.0,80.0,8.00,E
4,pokemon_red_pokemon_blue,gb,1996.0,role_playing,11270.0,8890.0,10220.0,1000.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
16710,samurai_warriors_sanada_maru,ps3,2016.0,action,0.0,0.0,10.0,0.0,73.0,6.95,M
16711,lma_manager_2007,x360,2006.0,sports,0.0,10.0,0.0,0.0,72.0,7.30,E
16712,haitaka_no_psychedelica,psv,2016.0,adventure,0.0,0.0,10.0,0.0,72.0,7.80,M
16713,spirits_spells,gba,2003.0,platform,10.0,0.0,0.0,0.0,69.0,7.85,E


In [9]:
# Summarize the columns: dtype, non-null count and memory usage
df_vg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   name             16715 non-null  string  
 1   platform         16715 non-null  category
 2   year_of_release  16713 non-null  float64 
 3   genre            16715 non-null  category
 4   na_sales         16715 non-null  float64 
 5   eu_sales         16715 non-null  float64 
 6   jp_sales         16715 non-null  float64 
 7   other_sales      16715 non-null  float64 
 8   critic_score     15360 non-null  float32 
 9   user_score       15185 non-null  float32 
 10  rating           15390 non-null  object  
dtypes: category(2), float32(2), float64(5), object(1), string(1)
memory usage: 1.1+ MB


In [10]:
# Total sales per platform, broken down by region.
# 'platform' is categorical and observed=False is passed, so every platform category gets a row
# (a category with no games would show 0 in every region).
df_vg_platform_sales = (
    df_vg
    .groupby('platform', observed=False)[['na_sales', 'eu_sales', 'jp_sales', 'other_sales']]
    .sum()
    .reset_index()
)
df_vg_platform_sales

Unnamed: 0,platform,na_sales,eu_sales,jp_sales,other_sales
0,2600,90600.0,5470.0,0.0,910.0
1,3do,0.0,0.0,100.0,0.0
2,3ds,83490.0,61480.0,100670.0,13360.0
3,dc,5430.0,1690.0,8560.0,270.0
4,ds,382400.0,188890.0,175570.0,59260.0
...,...,...,...,...,...
26,wiiu,38100.0,25130.0,13010.0,5950.0
27,ws,0.0,0.0,1420.0,0.0
28,x360,602470.0,270760.0,12430.0,85760.0
29,xb,186690.0,60950.0,1380.0,8720.0


In [11]:
# Add the worldwide total per platform as the row-wise sum of the four regional columns
df_vg_platform_sales['total_sales'] = (
    df_vg_platform_sales[['na_sales', 'eu_sales', 'jp_sales', 'other_sales']]
    .sum(axis='columns')
)
df_vg_platform_sales

Unnamed: 0,platform,na_sales,eu_sales,jp_sales,other_sales,total_sales
0,2600,90600.0,5470.0,0.0,910.0,96980.0
1,3do,0.0,0.0,100.0,0.0,100.0
2,3ds,83490.0,61480.0,100670.0,13360.0,259000.0
3,dc,5430.0,1690.0,8560.0,270.0,15950.0
4,ds,382400.0,188890.0,175570.0,59260.0,806120.0
...,...,...,...,...,...,...
26,wiiu,38100.0,25130.0,13010.0,5950.0,82190.0
27,ws,0.0,0.0,1420.0,0.0,1420.0
28,x360,602470.0,270760.0,12430.0,85760.0,971420.0
29,xb,186690.0,60950.0,1380.0,8720.0,257740.0


In [12]:
# Persist the per-platform sales feature table as CSV (without the index column).
# project_root is the value computed in the Libraries cell at the top of the notebook.
processed_path = project_root / "data" / "processed" / "feature" / "vg_platform_sales_feature.csv"

# Create the output folder on first run; to_csv does not create missing directories
processed_path.parent.mkdir(parents=True, exist_ok=True)

df_vg_platform_sales.to_csv(processed_path, index=False)

In [13]:
# Regional sales per platform and release year.
cols = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales']

# 'platform' is categorical and observed=False is passed, so platform/year combinations
# without any game are also emitted, with 0 in every region.
df_vg_platform_top_sales_year = (
    df_vg
    .groupby(['platform', 'year_of_release'], observed=False)[cols]
    .sum()
    .reset_index()
)

# Keep only the platform/year rows that have non-zero sales in every region.
# NOTE(review): `.any(axis=1)` discards a row as soon as ONE region is zero (e.g. a platform with no
# sales in Japan that year). If the goal is only to drop the empty combinations, `.all(axis=1)` would
# be the filter to use -- confirm the intent before changing it.
# `.copy()` makes the result an independent frame so columns can be added to it safely afterwards.
df_vg_platform_top_sales_year = df_vg_platform_top_sales_year.loc[
    ~(df_vg_platform_top_sales_year[cols] == 0).any(axis=1)
].copy()
df_vg_platform_top_sales_year

Unnamed: 0,platform,year_of_release,na_sales,eu_sales,jp_sales,other_sales
124,3ds,2011.0,27640.0,18840.0,12830.0,4450.0
125,3ds,2012.0,17110.0,11680.0,20030.0,2590.0
126,3ds,2013.0,15560.0,14840.0,23570.0,2600.0
127,3ds,2014.0,12330.0,8730.0,20690.0,2010.0
128,3ds,2015.0,6170.0,5060.0,15520.0,1030.0
...,...,...,...,...,...,...
1333,xb,2005.0,35110.0,12560.0,20.0,1740.0
1386,xone,2013.0,11850.0,5340.0,20.0,1750.0
1387,xone,2014.0,30970.0,18150.0,140.0,4810.0
1388,xone,2015.0,36030.0,18470.0,170.0,5470.0


In [14]:
# Add the worldwide total per platform/year as the row-wise sum of the four regional columns.
# `.assign` builds a new frame instead of setting a column on a frame obtained by filtering,
# which avoids pandas' SettingWithCopyWarning.
df_vg_platform_top_sales_year = df_vg_platform_top_sales_year.assign(
    total_sales=df_vg_platform_top_sales_year[['na_sales', 'eu_sales', 'jp_sales', 'other_sales']].sum(axis=1)
)
df_vg_platform_top_sales_year

Unnamed: 0,platform,year_of_release,na_sales,eu_sales,jp_sales,other_sales,total_sales
124,3ds,2011.0,27640.0,18840.0,12830.0,4450.0,63760.0
125,3ds,2012.0,17110.0,11680.0,20030.0,2590.0,51410.0
126,3ds,2013.0,15560.0,14840.0,23570.0,2600.0,56570.0
127,3ds,2014.0,12330.0,8730.0,20690.0,2010.0,43760.0
128,3ds,2015.0,6170.0,5060.0,15520.0,1030.0,27780.0
...,...,...,...,...,...,...,...
1333,xb,2005.0,35110.0,12560.0,20.0,1740.0,49430.0
1386,xone,2013.0,11850.0,5340.0,20.0,1750.0,18960.0
1387,xone,2014.0,30970.0,18150.0,140.0,4810.0,54070.0
1388,xone,2015.0,36030.0,18470.0,170.0,5470.0,60140.0


In [15]:
# Persist the per-platform, per-year sales feature table as CSV (without the index column).
# project_root is the value computed in the Libraries cell at the top of the notebook.
processed_path = project_root / "data" / "processed" / "feature" / "vg_platform_top_sales_year_feature.csv"

# Create the output folder on first run; to_csv does not create missing directories
processed_path.parent.mkdir(parents=True, exist_ok=True)

df_vg_platform_top_sales_year.to_csv(processed_path, index=False)

In [16]:
# List the columns available in df_vg before building the next feature
df_vg.columns

Index(['name', 'platform', 'year_of_release', 'genre', 'na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'critic_score', 'user_score', 'rating'], dtype='object')

In [None]:
# Platforms that used to be popular but are now losing sales
# Start from an independent copy: a plain assignment would only alias df_vg, so any later
# in-place change to df_popular_loss would also modify the source dataset.
df_popular_loss = df_vg.copy()