In [1]:
import pandas as pd
import tensorflow
import sklearn

In [28]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [29]:
# Load the raw games table from 'games.csv' and show it as the cell output.
base = pd.read_csv('games.csv')
base

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,


In [30]:
# Drop three columns from 'base' in a single call:
#   - 'Other_Sales' and 'Global_Sales': sales figures that are not predicted here;
#     presumably removed so output-like values are not used as input features.
#   - 'Developer': presumably removed for relevance / dimensionality reasons.
base = base.drop(columns=['Other_Sales', 'Global_Sales', 'Developer'])


In [31]:
# (rows, columns) of 'base' at this point in the notebook.
base.shape

(16719, 13)

In [32]:
# Number of missing values per column.
base.isnull().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Rating             6769
dtype: int64

In [33]:
# Keep only the rows that have no missing value in any column.
base = base.dropna(axis='index', how='any')

In [34]:
# (rows, columns) of 'base' after the rows with missing values were removed.
base.shape

(6825, 13)

In [35]:
# Re-count missing values per column to confirm none remain.
base.isnull().sum()

Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Critic_Score       0
Critic_Count       0
User_Score         0
User_Count         0
Rating             0
dtype: int64

In [36]:
# How many rows share each game title.
base['Name'].value_counts()

Name
Need for Speed: Most Wanted                  8
Madden NFL 07                                8
LEGO Star Wars II: The Original Trilogy      8
The Sims 2                                   7
Terraria                                     7
                                            ..
Castlevania: Portrait of Ruin                1
Suzuki TT Superbikes                         1
Rumble Roses                                 1
Sherlock Holmes: The Mystery of the Mummy    1
STORM: Frontline Nation                      1
Name: count, Length: 4377, dtype: int64

In [37]:
# Remove the 'Name' column from 'base'.
base = base.drop(columns=['Name'])

In [38]:
# (rows, columns) of 'base' after removing 'Name'.
base.shape

(6825, 12)

In [39]:
# Remaining column labels, in positional order.
base.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [40]:
# Build the feature matrix as a NumPy array: every column of 'base' except the
# three regional sales columns, which are the prediction targets. Column order is
# preserved, so the result holds Platform, Year_of_Release, Genre, Publisher,
# Critic_Score, Critic_Count, User_Score, User_Count, Rating (positions 0..8).
X = base.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales']).values


array([['Wii', 2006.0, 'Sports', ..., '8', 322.0, 'E'],
       ['Wii', 2008.0, 'Racing', ..., '8.3', 709.0, 'E'],
       ['Wii', 2009.0, 'Sports', ..., '8', 192.0, 'E'],
       ...,
       ['PC', 2014.0, 'Action', ..., '7.6', 412.0, 'M'],
       ['PC', 2011.0, 'Shooter', ..., '5.8', 43.0, 'T'],
       ['PC', 2011.0, 'Strategy', ..., '7.2', 13.0, 'E10+']], dtype=object)

In [41]:
# Pull the three regression targets out of 'base' as NumPy arrays:
# North America, Europe and Japan sales (column positions 4, 5 and 6).
y_na, y_eu, y_jp = (
    base[coluna].to_numpy() for coluna in ('NA_Sales', 'EU_Sales', 'JP_Sales')
)


In [42]:
# Preview the North America sales target array.
y_na

array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
       0.000e+00])

In [43]:
# Preview the Europe sales target array.
y_eu

array([2.896e+01, 1.276e+01, 1.093e+01, ..., 1.000e-02, 0.000e+00,
       1.000e-02])

In [44]:
# Preview the Japan sales target array.
y_jp

array([3.77, 3.79, 3.28, ..., 0.  , 0.  , 0.  ])

In [45]:
# Sketch of one-hot encoding: each platform gets its own 0/1 column, e.g.
# PS2  1 0 0 0 0 ...
# X360 0 1 0 0 0 ...
# Row counts per platform (i.e. the categories that would be encoded).
base['Platform'].value_counts()

Platform
PS2     1140
X360     858
PS3      769
PC       651
XB       565
Wii      479
DS       464
PSP      390
GC       348
PS4      239
GBA      237
XOne     159
3DS      155
PS       150
PSV      118
WiiU      89
DC        14
Name: count, dtype: int64

In [46]:
# Column labels of 'base', for reference when choosing which columns to encode.
base.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [47]:
# One-hot encode the categorical columns of X and pass the rest through unchanged.
# Positions 0, 2, 3 and 8 of X are Platform, Genre, Publisher and Rating.
# NOTE(review): the X preview shows User_Score held as strings ('8', '8.3');
# presumably they are coerced to numbers when stacked with the sparse one-hot
# block -- confirm before changing how the output is assembled.
onehotencoder = ColumnTransformer(
    transformers=[
        ("OneHot", OneHotEncoder(), [0, 2, 3, 8]),
    ],
    remainder='passthrough',
)
# fit_transform is expected to return a sparse matrix here; densify it for Keras.
X = onehotencoder.fit_transform(X).toarray()

In [48]:
# (rows, encoded feature count) of X after one-hot encoding.
X.shape

(6825, 303)

In [49]:
# Inspect the first encoded row.
X[0]

array([0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 

In [50]:
# Hidden-layer width heuristic: mean of the number of input features and the
# number of outputs (three sales targets). The feature count is read from X so
# the figure stays in step with the encoded matrix instead of a typed-in 303.
(X.shape[1] + 3) / 2

153.0

In [51]:
# Size the network from the encoded matrix instead of hard-coding 303 / 153, so
# this cell keeps working if the one-hot encoding yields a different column count.
n_entradas = X.shape[1]
# Heuristic width: mean of input feature count and number of outputs (3).
# For 303 features this is 153.
n_ocultas = (n_entradas + 3) // 2

# Input layer: one value per encoded feature.
camada_entrada = Input(shape=(n_entradas,))

# Two fully connected hidden layers with ReLU activation, shared by all outputs.
camada_oculta1 = Dense(units=n_ocultas, activation='relu')(camada_entrada)
camada_oculta2 = Dense(units=n_ocultas, activation='relu')(camada_oculta1)

# Three independent single-unit linear outputs, all fed by the second hidden
# layer; they are later paired, in order, with the NA, EU and JP sales targets.
camada_saida1 = Dense(units=1, activation='linear')(camada_oculta2)
camada_saida2 = Dense(units=1, activation='linear')(camada_oculta2)
camada_saida3 = Dense(units=1, activation='linear')(camada_oculta2)


In [52]:
# Build the model from the input layer and the three output layers defined earlier.
regressor = Model(inputs=camada_entrada, outputs=[camada_saida1, camada_saida2, camada_saida3])


In [53]:
# Compile 'regressor' with the Adam optimizer and mean squared error (MSE) loss.
# The training log reports one loss per output plus their combined 'loss'.
regressor.compile(optimizer='adam', loss='mse')


In [54]:
# Train 'regressor': X holds the encoded features and each target array is paired,
# in order, with one output layer (North America, Europe, Japan sales).
regressor.fit(
    x=X,
    y=[y_na, y_eu, y_jp],
    batch_size=100,  # samples processed before each weight update
    epochs=500,      # full passes over the whole dataset
)


Epoch 1/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - dense_2_loss: 3376.7698 - dense_3_loss: 2962.0457 - dense_4_loss: 478.6888 - loss: 6818.1924
Epoch 2/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - dense_2_loss: 10.8267 - dense_3_loss: 4.7413 - dense_4_loss: 2.7201 - loss: 18.2908
Epoch 3/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - dense_2_loss: 2.8858 - dense_3_loss: 1.8030 - dense_4_loss: 1.1239 - loss: 5.8139
Epoch 4/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_2_loss: 2.1035 - dense_3_loss: 2.7349 - dense_4_loss: 0.8870 - loss: 5.7271
Epoch 5/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_2_loss: 2.1491 - dense_3_loss: 4.3010 - dense_4_loss: 4.5308 - loss: 10.9646
Epoch 6/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_2_loss: 3.7640 - dense_3_loss: 9.0927 - dense_4_

<keras.src.callbacks.history.History at 0x1b1498220c0>

In [55]:
# Predict the three sales outputs (NA, EU, JP order, matching the fit targets).
# NOTE: X is the same matrix that was passed to fit, so these are in-sample predictions.
previsao_na, previsao_eu, previsao_jp = regressor.predict(X)

[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 883us/step


In [56]:
# Predicted North America sales and their mean.
previsao_na, previsao_na.mean()

(array([[ 5.2841196 ],
        [ 3.1556387 ],
        [ 5.795487  ],
        ...,
        [ 0.12970215],
        [-0.18780264],
        [-0.3072293 ]], dtype=float32),
 0.26723275)

In [57]:
# Actual North America sales and their mean, for comparison with the predictions.
y_na, y_na.mean()

(array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
        0.000e+00]),
 0.3944835164835165)

In [58]:
from sklearn.metrics import mean_absolute_error

In [59]:
# Mean absolute error for the North America output.
# NOTE: the predictions come from the same X used for training, so this is an
# in-sample error, not an estimate of performance on unseen games.
mean_absolute_error(y_na, previsao_na)

0.27365705962329556

In [60]:
# Predicted Europe sales and their mean.
previsao_eu, previsao_eu.mean()

(array([[ 3.6501012 ],
        [ 2.0271475 ],
        [ 4.06038   ],
        ...,
        [ 0.1512219 ],
        [-0.08327144],
        [-0.15896392]], dtype=float32),
 0.16042905)

In [61]:
# Actual Europe sales and their mean, for comparison with the predictions.
y_eu, y_eu.mean()

(array([2.896e+01, 1.276e+01, 1.093e+01, ..., 1.000e-02, 0.000e+00,
        1.000e-02]),
 0.23608937728937732)

In [62]:
# Mean absolute error for the Europe output.
# NOTE: the predictions come from the same X used for training, so this is an
# in-sample error, not an estimate of performance on unseen games.
mean_absolute_error(y_eu, previsao_eu)

0.18438145178517143

In [63]:
# Actual Japan sales and their mean, for comparison with the predictions.
y_jp, y_jp.mean()

(array([3.77, 3.79, 3.28, ..., 0.  , 0.  , 0.  ]), 0.06415824175824175)

In [64]:
# Mean absolute error for the Japan output.
# NOTE: the predictions come from the same X used for training, so this is an
# in-sample error, not an estimate of performance on unseen games.
mean_absolute_error(y_jp, previsao_jp)

0.0794067589135397