Script usado para la limpieza de datos, con el fin de cargar el dataset por defecto en formato CSV,
sin datos nulos, duplicados, etc., para la lista doblemente enlazada de C++.

In [6]:
import pandas as pd
import re

In [7]:
# Load the raw video-game sales dataset (comma-separated CSV).
df: pd.DataFrame = pd.read_csv("origVGsales.csv", sep=",")

# Allow pandas to display as many rows as the dataset has, so the frames
# shown by later cells are not truncated.
# The canonical option name "display.max_rows" is used on purpose: any other
# spelling is resolved by a regex search over all registered options and
# breaks as soon as it matches zero or more than one of them.
pd.set_option("display.max_rows", df.shape[0])

In [8]:
# Sales figures are not needed for this project: use a regular expression
# (case-insensitive, matched from the start of each column name) to collect
# every sales column, report the selection and drop those columns.
toDelete: list[str] = df.columns[
    df.columns.str.match(r"\w*\_?sales", flags=re.IGNORECASE)
].tolist()
print(f"These columns will be deleted: {toDelete}")
df.drop(columns=toDelete, inplace=True)

These columns will be deleted: ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']


In [9]:
# Game names must be unique: keep a copy of every row whose 'Name' already
# appeared in an earlier row (the first occurrence is the one that stays),
# then remove those rows from the dataset.
duplicates: pd.DataFrame = df.loc[df.duplicated(subset=['Name'], keep='first')]
df.drop_duplicates(subset=['Name'], keep='first', inplace=True)

# Remove every remaining row that has at least one null field.
# NOTE(review): duplicates are dropped before nulls, so when the first
# occurrence of a name has a null field the later (possibly complete) rows
# are already gone and the game disappears entirely - confirm this is intended.
df.dropna(inplace=True)

duplicates  # Rows removed because their name duplicated an earlier row

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher
16,41,Call of Duty: Black Ops,PS3,2010.0,Shooter,Activision
43,104,Battlefield 3,PS3,2011.0,Shooter,Electronic Arts
54,121,Call of Duty 4: Modern Warfare,PS3,2007.0,Shooter,Activision
65,135,Red Dead Redemption,X360,2010.0,Action,Take-Two Interactive
72,156,Tetris,NES,1988.0,Puzzle,Nintendo
109,695,Dead Island,PS3,2011.0,Action,Deep Silver


In [10]:
# Longest text value accepted by the imposed record format.
# NOTE(review): the lengths below count characters, not encoded bytes; if the
# consumer stores these fields in fixed-size byte buffers, non-ASCII names may
# still overflow - confirm against the consumer's format.
MAX_TEXT_LEN = 32

# Helper columns holding the length of each text field (vectorised string
# accessor instead of a per-row Python lambda).
df['NameSize'] = df['Name'].str.len()
df['PublisherSize'] = df['Publisher'].str.len()

# Rows where either text field is longer than the format allows.
notAllow = df[(df['NameSize'] > MAX_TEXT_LEN) | (df['PublisherSize'] > MAX_TEXT_LEN)]

# Remove the oversized rows from the dataset.
df.drop(index=notAllow.index, inplace=True)

notAllow  # Rows removed because a text field exceeded the size limit

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NameSize,PublisherSize
37,95,The Legend of Zelda: Ocarina of Time,N64,1998.0,Action,Nintendo,36,8
38,96,Crash Bandicoot 2: Cortex Strikes Back,PS,1997.0,Platform,Sony Computer Entertainment,38,27
41,101,The Legend of Zelda: Twilight Princess,Wii,2006.0,Action,Nintendo,38,8
68,147,The Last of Us,PS3,2013.0,Action,Sony Computer Entertainment Europe,14,34
74,160,Batman: Arkham City,PS3,2011.0,Action,Warner Bros. Interactive Entertainment,19,38
84,250,Winning Eleven: Pro Evolution Soccer 2007,PS2,2006.0,Sports,Konami Digital Entertainment,41,28
97,407,Star Wars Episode III: Revenge of the Sith,PS2,2005.0,Action,LucasArts,42,9
103,509,Mortal Kombat,PS3,2011.0,Fighting,Warner Bros. Interactive Entertainment,13,38
105,554,Mortal Kombat X,PS4,2015.0,Fighting,Warner Bros. Interactive Entertainment,15,38
123,1124,Until Dawn,PS4,2015.0,Adventure,Sony Computer Entertainment Europe,10,34


In [11]:
# Keep only the columns the required format uses, in the order it expects,
# renumber the rows from 0, and convert the Year column (float values such
# as 2010.0 at this point) to a 32-bit integer.
df = (
    df.loc[:, ['Name', 'Genre', 'Platform', 'Publisher', 'Year']]
    .reset_index(drop=True)
    .astype({'Year': "int32"})
)

# Summary of the resulting columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       108 non-null    object
 1   Genre      108 non-null    object
 2   Platform   108 non-null    object
 3   Publisher  108 non-null    object
 4   Year       108 non-null    int32 
dtypes: int32(1), object(4)
memory usage: 3.9+ KB


In [12]:
# Export the cleaned dataset as a semicolon-separated file, without writing
# the row index as a column.
df.to_csv("preloaded.csv", index=False, sep=";")

# Display the dataset that was just exported.
df

Unnamed: 0,Name,Genre,Platform,Publisher,Year
0,Wii Sports,Sports,Wii,Nintendo,2006
1,Super Mario Bros.,Platform,NES,Nintendo,1985
2,Mario Kart Wii,Racing,Wii,Nintendo,2008
3,Tetris,Puzzle,GB,Nintendo,1989
4,Mario Kart DS,Racing,DS,Nintendo,2005
5,Grand Theft Auto V,Action,PS3,Take-Two Interactive,2013
6,Grand Theft Auto: San Andreas,Action,PS2,Take-Two Interactive,2004
7,Super Mario World,Platform,SNES,Nintendo,1990
8,Super Mario Bros. 3,Platform,NES,Nintendo,1988
9,Grand Theft Auto: Vice City,Action,PS2,Take-Two Interactive,2002
