In [132]:
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np

### Constants and lambdas
Variables and lambdas used throughout the preprocessing.

In [133]:
# Columns removed from the raw dataset right after it is loaded.
columns_to_drop = [
    "Other_Sales",
    "Critic_Count",
    "User_Count",
    "Rating",
    "Developer",
    "Publisher",
]


def needed_samples_lambda(x):
    """Return ``30 / x + 1`` rounded with the built-in ``round``."""
    return round((30. / x) + 1)


def range_for_samples_lambda(x):
    """Return 4% of ``x``, falling back to 0.01 when that is not above 0.01."""
    scaled = x * 0.04
    if scaled > 0.01:
        return scaled
    return 0.01

def standardize(column):
    """Return the z-score standardization of a numeric pandas Series.

    Prints the column's maximum and minimum, then returns
    ``(column - mean) / std`` computed with ``Series.mean`` and
    ``Series.std`` (pandas' default sample standard deviation).

    Parameters
    ----------
    column : pandas.Series
        Numeric values to standardize.

    Returns
    -------
    pandas.Series
        Standardized values with the same index as ``column``. When the
        standard deviation is exactly 0 (constant column) the values are
        only centred, giving zeros instead of the NaNs that 0/0 would
        produce.
    """
    print("Max: {} Min: {}".format(column.max(), column.min()))
    stddev = column.std()
    mean = column.mean()
    # Vectorized arithmetic replaces the per-element Python lambda.
    centered = column - mean
    if stddev == 0:
        # Constant column: dividing by zero would turn every value into NaN.
        return centered
    return centered / stddev

### Data loading and preprocessing
Load the data from the Excel file, drop unused columns, replace missing-value indicators (such as `tbd`) with NaN,
and convert column types from object to numeric where necessary.

In [134]:
# Load the raw dataset and discard the columns listed in columns_to_drop.
data = pd.read_excel("../data/games_sales_2016_modified.xlsx")
data = data.drop(columns=columns_to_drop)
# Turn the 'tbd' placeholder into a real missing value. np.nan is used
# because the np.NaN alias was removed in NumPy 2.0.
data = data.replace({'tbd': np.nan})
# Convert both score columns to numeric dtype; pd.to_numeric raises if any
# non-numeric value is still present.
data["Critic_Score"] = pd.to_numeric(data["Critic_Score"])
data["User_Score"] = pd.to_numeric(data["User_Score"])

### Handling missing values
There are no missing values in EU_Sales, JP_Sales, NA_Sales and Global_Sales columns.
Some columns should be filled manually, while others can be generated (for example some
critic and user scores).

Remove entries without release date or without name

In [135]:
# Keep only the rows that have both a release year and a name.
data = data.dropna(subset=["Year_of_Release", "Name"])

Remove games that are missing a Critic Score or a User Score and whose global sales are below 0.2 million.

In [136]:
# Rows where at least one of the two scores is missing ...
score_missing = data["Critic_Score"].isna() | data["User_Score"].isna()
# ... and whose global sales fall below the 0.2 threshold are dropped.
low_global_sales = data["Global_Sales"] < 0.2
data = data.drop(data.index[score_missing & low_global_sales])

Generate missing User Score and Critic Score values.

In [137]:
# Renumber the rows so the imputed frame built below (which gets a fresh
# 0..n-1 index) lines up with `data` on assignment.
# NOTE(review): reset_index() without drop=True keeps the old index as an
# extra column, which then ends up in the saved file - confirm this is wanted.
data = data.reset_index()

# Numeric columns handed to the imputer.
columns_to_use = [
    "Global_Sales",
    "Year_of_Release",
    "User_Score",
    "Critic_Score",
    "EU_Sales",
    "NA_Sales",
    "JP_Sales",
]
missing_values_generator_df = data[columns_to_use]

# Fill missing values in the selected columns with a 10-neighbour KNN imputer.
missing_values_handler = KNNImputer(n_neighbors=10)
imputed_values = missing_values_handler.fit_transform(missing_values_generator_df)
data[columns_to_use] = pd.DataFrame(imputed_values, columns=columns_to_use)

Normalization & Standardization

In [138]:
# Divide Critic_Score by 100 and User_Score by 10
# (presumably putting both on a 0-1 scale - confirm the raw score ranges).
data["Critic_Score"] = data["Critic_Score"].div(100.)
data["User_Score"] = data["User_Score"].div(10.)

# Standardization of the sales columns is currently disabled:
# data["Global_Sales"] = standardize(data["Global_Sales"])
# data["EU_Sales"] = standardize(data["EU_Sales"])
# data["JP_Sales"] = standardize(data["JP_Sales"])
# data["NA_Sales"] = standardize(data["NA_Sales"])
print(data["Critic_Score"])

0        0.76
1        0.83
2        0.82
3        0.80
4        0.88
         ... 
10447    0.70
10448    0.90
10449    0.79
10450    0.81
10451    0.78
Name: Critic_Score, Length: 10452, dtype: float64


Save modified dataset to file

In [139]:
# Write the preprocessed frame (including its index) to a new Excel file.
data.to_excel(excel_writer="../data/games_sales_2016_preprocessed.xlsx")

# Pytania:
- zapytać o standaryzację
- zapytać o uzupełnianie brakujących critic score/user score i czy brać pod uwagę jeszcze rok

# TODO:
 - macierz korelacji między kolumnami liczbowymi - J
 - heatmapa (genre, platforma, sprzedaż) - J
 - klasteryzacja - A
 - popróbować inne parametry do outliersów i dodać zapisywanie do pliku - A
 - analiza: które elementy to outliersy, powtórzyć rysowanie po odrzuceniu, inne pomysły - A