In [1]:
import pandas as pd
import numpy as np

<h2>Wczytywanie i obrabianie danych</h2>

In [2]:
# Load train.csv; the first CSV column is used as the DataFrame index.
df = pd.read_csv('train.csv', index_col=0)

In [3]:
# Clean the data so it is easier to encode: missing values in the
# multi-label columns are replaced with an empty string.
cols_to_encode = ['game_type', 'category', 'mechanic']
df[cols_to_encode] = df[cols_to_encode].fillna('')

In [4]:
# Build the lookup used for encoding: '<column>_<category>' -> column
# position in the one-hot matrix. Each cell holds '|'-separated categories,
# so all rows of a column are joined and split again to collect every value;
# np.unique returns them sorted and de-duplicated. Empty strings (rows that
# had no category) do not get a position.
feature_names = [
    col + '_' + val
    for col in cols_to_encode
    for val in np.unique(df[col].str.cat(sep='|').split('|'))
    if val != ''
]
mapping = {name: position for position, name in enumerate(feature_names)}
cnt = len(feature_names)  # total number of positions handed out

In [5]:
def make_encodings(df, mapping, cols_to_encode, cols_as_is):
    """Encode a DataFrame as a 2-D numeric feature matrix.

    The result has one row per row of ``df`` and ``len(mapping) +
    len(cols_as_is)`` columns: first the multi-hot encoding of the
    categorical columns, then the pass-through columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. Every cell of the columns in ``cols_to_encode`` must be a
        string of '|'-separated categories ('' for "no category"); callers
        are expected to have filled missing values beforehand.
    mapping : dict
        Maps '<column>_<category>' to the position of that category in the
        multi-hot part of the output.
    cols_to_encode : list of str
        Names of the '|'-separated categorical columns.
    cols_as_is : list of str
        Names of the columns appended unchanged, except that missing values
        are replaced with -1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(mapping) + len(cols_as_is))``.

    Notes
    -----
    Rows are addressed by position, not by index label, so the function works
    for any DataFrame index (non-zero-based, non-contiguous, non-integer).
    Categories that are absent from ``mapping`` (e.g. values that only occur
    in the test data) are ignored instead of raising ``KeyError``.
    """
    X = np.zeros((len(df), len(mapping)))

    for col in cols_to_encode:
        # enumerate() yields the row *position*; the index label returned by
        # iterrows() is not a valid array row unless the index is 0..n-1.
        for pos, cell in enumerate(df[col]):
            for cat in cell.split('|'):
                if cat == '':
                    continue
                target = mapping.get(col + '_' + cat)
                if target is not None:  # skip categories unseen when the mapping was built
                    X[pos, target] = 1

    # A missing value is treated as information in its own right and is
    # encoded with the number -1.
    passthrough = df[cols_as_is].fillna(-1).to_numpy()
    return np.concatenate((X, passthrough), axis=1)

In [6]:
# Encode the categorical columns; the columns listed in cols_as_is are
# appended to the feature matrix without encoding.
cols_as_is = ['year', 'min_players', 'max_players', 'min_age', 'min_age_rec',
              'min_time', 'max_time']

# Target values.
y = df['num_sold'].to_numpy()
# The reduced frame gets its own name instead of overwriting `df`: rebinding
# `df` made this cell fail with KeyError('num_sold') when it was run a second
# time, because the column had already been dropped.
train_features = df.drop(columns=['num_sold', 'name'])
X = make_encodings(train_features, mapping, cols_to_encode, cols_as_is)

In [7]:
# Split the dataset into a training part (90% of the rows) and a validation
# part; the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=40, train_size=0.9)

<h2>Trenowanie modeli</h2>

In [8]:
# Baseline to beat: predict the mean training target for every validation
# row and report the mean absolute error of that constant prediction.
preds_baseline = np.full(len(y_test), y_train.mean())
baseline_mae = np.mean(np.abs(preds_baseline - y_test))

print('Baseline MAE:', baseline_mae)

Baseline MAE: 284.70762384448767


In [9]:
# Train a k-nearest-neighbours regressor (default settings) and evaluate it
# on the validation split with the mean absolute error.
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor().fit(X_train, y_train)  # fit() returns the estimator itself

preds_knn = knn.predict(X_test)
knn_mae = np.mean(np.abs(preds_knn - y_test))

print('KNN MAE:', knn_mae)

KNN MAE: 156.63615635179153


<h2>Zapis wyniku modelu</h2>

In [10]:
# Load the test data and preprocess it the same way as the training data.
df_test = pd.read_csv('test.csv', index_col=0)

# Missing values in the multi-label columns become empty strings.
df_test[cols_to_encode] = df_test[cols_to_encode].fillna('')

X = make_encodings(df_test, mapping, cols_to_encode, cols_as_is)

In [11]:
# Predict with the previously trained model; X holds the encoded test data here.
preds = knn.predict(X)

In [12]:
# Write the predictions to a CSV file that will later be uploaded to Kaggle.
# The 'id' column is simply the row number 0..len(preds)-1.
submission = pd.DataFrame({'predicted': preds})
submission.insert(0, 'id', range(len(preds)))
submission.to_csv('sample_submission_2.csv', index=False)