In [61]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Working directory of the running kernel; the dataset path used when loading the CSV is built from it.
cwd = os.getcwd()

### Načítanie training datasetu

In [82]:
# Show full cell text instead of truncating long values.
# None is the supported "no limit" value; the old -1 sentinel is deprecated.
pd.set_option('display.max_colwidth', None)

# Columns are comma-separated (use sep='\t' for tab-separated files).
# Only the first 100000 records are read for this demo (nrows=100000);
# omit that parameter to load every row.
# on_bad_lines='skip' (pandas >= 1.3) replaces error_bad_lines=False,
# which was removed in pandas 2.0.
df = pd.read_csv(
    os.path.join(cwd, 'Datasets', 'Housing', 'melb_data.csv'),
    sep=',',
    on_bad_lines='skip',
    nrows=100000,
)
# Display the first 3 rows
print(df.head(3))

print(df.shape)

(13580, 21)


  pd.set_option('display.max_colwidth', -1)


### Príprava datasetu

In [45]:
# Column names of interest; note that this list is not applied to df in this cell.
col = ['Rooms', 'Distance', 'Price']

print(df.head())

# Target column.
y = df['Price']
# Features: every non-object column except the target.
# drop() returns a new frame instead of mutating df in place, so df keeps
# 'Price' and this cell can be re-run without failing on a missing column.
X = df.drop(columns=['Price']).select_dtypes(exclude=['object'])

# Hold out 20 % of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.20)

print(X_train.head())

       Suburb           Address  Rooms Type      Price Method SellerG  \
0  Abbotsford  85 Turner St      2      h    1480000.0  S      Biggin   
1  Abbotsford  25 Bloomburg St   2      h    1035000.0  S      Biggin   
2  Abbotsford  5 Charles St      3      h    1465000.0  SP     Biggin   
3  Abbotsford  40 Federation La  3      h    850000.0   PI     Biggin   
4  Abbotsford  55a Park St       4      h    1600000.0  VB     Nelson   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/12/2016  2.5       3067.0    ...  1.0       1.0  202.0    NaN             
1  4/02/2016  2.5       3067.0    ...  1.0       0.0  156.0     79.0           
2  4/03/2017  2.5       3067.0    ...  2.0       0.0  134.0     150.0          
3  4/03/2017  2.5       3067.0    ...  2.0       1.0  94.0     NaN             
4  4/06/2016  2.5       3067.0    ...  1.0       2.0  120.0     142.0          

   YearBuilt  CouncilArea Lattitude  Longtitude             Regionname  \
0 NaN 

### Sumarizácia chýbajúcich hodnôt po stĺpcoch

In [47]:
# Count of missing (null) entries in every training column
missing_val_count_by_column = X_train.isna().sum()
# Print only the columns that have at least one missing entry
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

Car             49  
BuildingArea    5156
YearBuilt       4307
dtype: int64


### Definovanie funkcie pre výpočet chybovosti (MAE)

In [50]:
# Helper used to compare the different missing-value strategies
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training data and return its MAE on the validation data.

    A RandomForestRegressor with 100 trees and random_state=0 is trained on
    (X_train, y_train); its predictions for X_valid are compared with y_valid
    using mean_absolute_error.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

### Vymazanie stĺpcov s chýbajúcimi hodnotami

In [76]:
# Names of the training columns that contain at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()
# Training features with those columns removed
reduced_X_train = X_train.drop(columns=cols_with_missing)
# Test features with the same columns removed
reduced_X_test = X_test.drop(columns=cols_with_missing)

### Kontrola MAE pre vymazanie stĺpcov

In [60]:
# MAE of the model trained on the reduced feature frames.
# NOTE(review): the label says rows ("riadkov"), but the reduced frames are built by dropping columns.
mae_reduced = score_dataset(reduced_X_train, reduced_X_test, y_train, y_test)
print("MAE skóre pre vymazanie riadkov: ", mae_reduced)

MAE skóre pre vymazanie riadkov:  175703.48185157913


### Imputation

In [77]:
# SimpleImputer with default settings (default strategy is 'mean' per the scikit-learn docs)
imputer = SimpleImputer()
# Fit the imputer on the training features and fill their gaps.
# fit_transform returns a bare array, so the column labels and the original
# row index are re-attached here; keeping the index leaves the frame aligned
# with y_train by label instead of silently resetting it to 0..n-1.
imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
# Apply the already-fitted imputer to the test features (no refit on test data)
imputed_X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

### Kontrola MAE pre Imputation

In [68]:
# MAE of the model trained on the imputed feature frames
mae_imputed = score_dataset(imputed_X_train, imputed_X_test, y_train, y_test)
print("MAE skóre pre imputation: ", mae_imputed)

MAE skóre pre imputation:  169237.0268668034


### Rozšírená imputácia

In [71]:
# SimpleImputer configured to fill gaps with the column median
imputer = SimpleImputer(strategy='median')
# Fit on the training features and fill their gaps.
# The column labels and original row index are re-attached at construction so
# the frame stays aligned with y_train by label instead of being reset to 0..n-1.
ext_imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
# Apply the already-fitted imputer to the test features (no refit on test data)
ext_imputed_X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Train the model on the median-imputed training data
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(ext_imputed_X_train, y_train)

# Predict on the median-imputed test data
preds_valid = model.predict(ext_imputed_X_test)

### Kontrola MAE pre rozšírenú imputáciu

In [74]:
# MAE of the median-imputation model's predictions against the held-out targets.
# NOTE(review): the printed label contains a stray ')' before the colon.
mae_ext_imputed = mean_absolute_error(y_test, preds_valid)
print("MAE skóre pre rozšírenú imputáciu): ", mae_ext_imputed)

MAE skóre pre rozšírenú imputáciu):  169749.0436038642
