In [8]:
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np

### Data loading and preprocessing
Load the data from the Excel file, drop unused columns, replace missing-value indicators (such as `tbd`) with NaN,
and convert column types from object to numeric where necessary.

In [9]:
# Columns removed from the frame right after loading.
columns_to_drop = ["Other_Sales", "Critic_Count", "User_Count", "Rating", "Developer", "Publisher"]

# The first spreadsheet column becomes the row index.
data = pd.read_excel("../data/games_sales_2016_modified.xlsx", index_col=0)
data = data.drop(columns=columns_to_drop)
# 'tbd' marks a missing value. Use `np.nan`: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
data = data.replace({'tbd': np.nan})
# With the placeholder gone, make sure both score columns have a numeric dtype.
data["Critic_Score"] = pd.to_numeric(data["Critic_Score"])
data["User_Score"] = pd.to_numeric(data["User_Score"])

### Handling missing values
There are no missing values in EU_Sales, JP_Sales, NA_Sales and Global_Sales columns.
Some columns should be filled manually, while others can be generated (for example some
critic and user scores).

Remove entries with a missing release year or a missing name.

In [10]:
# Discard rows whose release year or name is missing. `dropna` filters rows
# directly instead of dropping by index label, so it stays correct even if the
# index loaded from the spreadsheet contains repeated labels (a label-based
# `drop` would remove every row sharing a label with a matching row).
data = data.dropna(subset=["Year_of_Release", "Name"])

Remove games that are missing a Critic Score or a User Score and whose global sales are below 0.2m.

In [11]:
# A row is removed when at least one of the two scores is missing AND its
# global sales are below 0.2.
score_missing = data["Critic_Score"].isna() | data["User_Score"].isna()
low_sales = data["Global_Sales"] < 0.2
rows_to_remove = data.loc[score_missing & low_sales].index
data = data.drop(index=rows_to_remove)

Impute the missing User Score and Critic Score values with a k-nearest-neighbours imputer (10 neighbours) fitted on the numeric columns.

In [12]:
# Renumber the rows 0..n-1 first: the imputed frame built below gets a default
# integer index, and the column assignment aligns on index labels.
data = data.reset_index(drop=True)

# Numeric columns handed to the imputer.
columns_to_use = [
    "Global_Sales", "Year_of_Release", "User_Score", "Critic_Score",
    "EU_Sales", "NA_Sales", "JP_Sales",
]
missing_values_generator_df = data[columns_to_use]

# NOTE(review): the columns are passed to the imputer unscaled, so columns with
# larger numeric ranges presumably dominate the neighbour distance — confirm
# whether standardising first is wanted.
missing_values_handler = KNNImputer(n_neighbors=10)
imputed_values = missing_values_handler.fit_transform(missing_values_generator_df)
data[columns_to_use] = pd.DataFrame(imputed_values, columns=columns_to_use)

# Remaining missing values per column.
print(data.isna().sum())

Name               0
Platform           0
Year_of_Release    0
Genre              0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Global_Sales       0
Critic_Score       0
User_Score         0
dtype: int64


Save modified dataset to file

In [13]:
# Show the final frame, then write it (index included) to the preprocessed workbook.
output_path = "../data/games_sales_2016_preprocessed.xlsx"
print(data)
data.to_excel(output_path)

                                            Name Platform  Year_of_Release  \
0                                     Wii Sports      Wii           2006.0   
1                              Super Mario Bros.      NES           1985.0   
2                                 Mario Kart Wii      Wii           2008.0   
3                              Wii Sports Resort      Wii           2009.0   
4                       Pokemon Red/Pokemon Blue       GB           1996.0   
...                                          ...      ...              ...   
10447  Greg Hastings' Tournament Paintball Max'd      PS2           2006.0   
10448                                    Deus Ex       PC           2000.0   
10449                  Monster Rancher Advance 2      GBA           2002.0   
10450                              Karnaaj Rally      GBA           2003.0   
10451                Wade Hixton's Counter Punch      GBA           2004.0   

              Genre  NA_Sales  EU_Sales  JP_Sales  Global_Sales

# Pytania:
- zapytać o standaryzację
- zapytać o uzupełnianie brakujących critic score/user score i czy brać pod uwagę jeszcze rok

# TODO:
 - macierz korelacji między kolumnami liczbowymi - J
 - heatmapa (genre, platforma, sprzedaż) - J
 - klasteryzacja - A
 - popróbować inne parametry do outliersów i dodać zapisywanie do pliku - A
 - analiza: które elementy to outliersy, powtórzyć rysowanie po odrzuceniu, inne pomysły - A