In [80]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('dark_background')

In [81]:
df = pd.read_csv('data/house_price/train.csv', index_col=0)
val_df = pd.read_csv('data/house_price/test.csv', index_col=0)

# Data Processing

## Feature Engineering

In [82]:
def feature_engineering_log1p(df, col_name):
    df = df.copy()
    df[col_name] = df[col_name].fillna(df[col_name].median())
    df[col_name+"_log1p"] = np.log1p(df[col_name].fillna(df[col_name].median()))
    return df
df = feature_engineering_log1p(df, 'MasVnrArea')

In [83]:
def feature_engineering_fillna_random(df, col_name):
    df=df.copy()
    df[col_name] = df[col_name].to_frame().applymap(
        lambda l: 
        l if not np.isnan(l) 
        else np.random.choice(df[col_name].dropna()))
    return df

df = feature_engineering_fillna_random(df, "GarageYrBlt")

## Filling Na and Categorical One Hot Encoding

In [None]:
pd.get_dummies?

In [None]:
["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Condition1", "Condition2", 
"Neighborhood", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual",
"ExterCond", "Foundation", "BsmtQual", "BsmCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC",
"CentralAir", "Electrical", "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
"PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType", "SaleCondition"]

In [84]:

df.LotFrontage.fillna(df.LotFrontage.mean(), inplace=True)
df.Alley.fillna('Not precised', inplace=True)
df.BsmtQual.fillna('Not precised', inplace=True)
df.BsmtCond.fillna('Not precised', inplace=True)
df.BsmtExposure.fillna('Not precised', inplace=True)
df.PoolQC.fillna('None', inplace=True)
df.Fence.fillna('None', inplace=True)
df.MiscFeature.fillna('None', inplace=True)

df = pd.concat([df, pd.get_dummies(data=df.MSZoning, prefix='MSZoning')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Street, prefix='Street')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Alley, prefix='Alley')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.LotShape, prefix='LotShape')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.LandContour, prefix='LandContour')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Utilities, prefix='Utilities')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.LotConfig, prefix='LotConfig')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.LandSlope, prefix='LandSlope')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Condition1, prefix='Condition1')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Condition2, prefix='Condition2')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Neighborhood, prefix='Neighborhood')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.BldgType, prefix='BldgType')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.HouseStyle, prefix='HouseStyle')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.RoofStyle, prefix='RoofStyle')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.RoofMatl, prefix='RoofMatl')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Exterior1st, prefix='Exterior1st')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Exterior2nd, prefix='Exterior2nd')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.MasVnrType, prefix='MasVnrType')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.ExterQual, prefix='ExterQual')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.ExterCond, prefix='ExterCond')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Foundation, prefix='Foundation')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.BsmtQual, prefix='BsmtQual')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.BsmtCond, prefix='BsmtCond')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.BsmtExposure, prefix='BsmtExposure')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.BsmtFinType1, prefix='BsmtFinType1')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.BsmtFinType2, prefix='BsmtFinType2')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Heating, prefix='Heating')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.HeatingQC, prefix='HeatingQC')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.CentralAir, prefix='CentralAir')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Electrical, prefix='Electrical')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.KitchenQual, prefix='KitchenQual')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Functional, prefix='Functional')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.FireplaceQu, prefix='FireplaceQu')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.GarageType, prefix='GarageType')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.GarageFinish, prefix='GarageFinish')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.GarageQual, prefix='GarageQual')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.GarageCond, prefix='GarageCond')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.PavedDrive, prefix='PavedDrive')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.PoolQC, prefix='PoolQC')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.Fence, prefix='Fence')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.MiscFeature, prefix='MiscFeature')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.SaleType, prefix='SaleType')], axis=1)
df = pd.concat([df, pd.get_dummies(data=df.SaleCondition, prefix='SaleCondition')], axis=1)

df.drop(columns=['Street', 'MSZoning', 'Alley', 'LotShape', 'LandContour', 'Unique', 'Utilities', 'LotConfig',
                 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
                 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
                 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
                 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'
                 ], inplace=True, errors='ignore')

In [85]:
y = df.SalePrice
X = df.drop('SalePrice', axis=1)

In [86]:
VarianceThreshold?

[1;31mInit signature:[0m [0mVarianceThreshold[0m[1;33m([0m[0mthreshold[0m[1;33m=[0m[1;36m0.0[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the
desired outputs (y), and can thus be used for unsupervised learning.

Read more in the :ref:`User Guide <variance_threshold>`.

Parameters
----------
threshold : float, default=0
    Features with a training-set variance lower than this threshold will
    be removed. The default is to keep all features with non-zero variance,
    i.e. remove the features that have the same value in all samples.

Attributes
----------
variances_ : array, shape (n_features,)
    Variances of individual features.

n_features_in_ : int
    Number of features seen during :term:`fit`.

    .. versionadded:: 0.24

feature_names_in_ : ndarray of shape (`n_features_in_`,)
    Names of features seen during :term:`

In [90]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0.01))
sel.fit(X)
X = X[X.columns[sel.get_support(indices=True)]]

In [91]:
df.isnull().values.any()

False

# Modeling

## Decision Tree Regressor

In [100]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor 
dec_tree = DecisionTreeRegressor()
param_space = {
    'min_samples_leaf': range(10, 35, 5), 
    'max_depth': range(2, 8)
}
gs = GridSearchCV(dec_tree, param_space, verbose=1, cv=5, return_train_score=True)
gs.fit(X, y)
results = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score').iloc[1, 4:]                                                                         

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [101]:
results.mean_train_score, results.mean_test_score, results.params

(0.8280089378678722,
 0.7675846846678203,
 {'max_depth': 6, 'min_samples_leaf': 20})

## SVC

In [110]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

svr = SVR()
param_space = [
    {
        "kernel": ["linear"],
        "C": [1, 10, 100,],
    },
    {
        "kernel": ["rbf"],
        "C": [10, 100],
        "gamma": [0.001, 0.0001],
    },
]
gs = GridSearchCV(svr, param_space, verbose=1, cv=5, return_train_score=True)
gs.fit(X, y)
results = pd.DataFrame(gs.cv_results_).sort_values(by="rank_test_score").iloc[1, 4:]

Fitting 5 folds for each of 7 candidates, totalling 35 fits


KeyboardInterrupt: 

In [None]:
results.mean_train_score, results.mean_test_score, results.params

# Do Domu

- KNN
- Feature Engineering zrobić funkcje i w pętli przeiterować przez wszystkie kolumny

Do Przeczytania:
- CNN [Blog](https://towardsdatascience.com/a-comprehensive-guide-to-convolutional-neural-networks-the-eli5-way-3bd2b1164a53)
- DNN [Video](https://www.youtube.com/watch?v=aircAruvnKk)
- CNN [Video](https://www.youtube.com/watch?v=YRhxdVk_sIs)

# Następna lekcja

problem MNIST classification - ręcznie pisane cyfry 2 rozwiązania:

    - DNN - sieć gęsta
    - CNN - sieć splotowa (ang. convolutional neural network)