In [1]:
from typing import Union, List

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils

sns.set(style="darkgrid")
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('train.csv', index_col='Id').sample(frac=1)  # sample(frac=1) -> randomize values
target_column = 'SalePrice'
df.shape

(1460, 80)

In [3]:
# на графиках были видны силдьные выбросы у target_column
df = utils.delete_abroad_elements(df, target_column)
# т.к. SalePrice -> skewed data =>
print('SalePrice norm distribution is skewed', not utils.get_skewed_columns(df[[target_column]]).empty)
# НЕ ЗАБЫВТЬ произвести np.log1p к SalePrice, для преобразования в нормальное распределение

SalePrice norm distribution is skewed True


In [4]:
categorical_columns = [
    'MSZoning', 'LotShape', 'BldgType', 'HouseStyle', 'MasVnrType', 'ExterQual', 'Foundation',
    'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'GarageType',
    'GarageFinish', 'YrSold', 'SaleType',
]

numeric_columns = [
    'MSSubClass', 'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1',
    'BsmtUnfSF', 'TotalBsmtSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 
    'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'MoSold',
]

In [5]:
from sklearn.model_selection import train_test_split

X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    df[numeric_columns+categorical_columns], df[target_column], test_size=0.15,
)

In [6]:
# найдем колонки у которых среди значений есть выбросы
columns_with_abroad_elements = utils.get_columns_with_count_abroad_elements(X_train_df[numeric_columns])
print('количество выбросов: %s' % columns_with_abroad_elements)
# удалим выбросы
train_df = pd.concat([X_train_df, y_train_df], axis=1)
train_df = utils.delete_abroad_elements(train_df, list(columns_with_abroad_elements.keys()))
X_train_df, y_train_df = train_df[X_train_df.columns], train_df[target_column]

количество выбросов: {'LotArea': 8, 'BsmtFinSF1': 2, 'BsmtUnfSF': 1, 'TotalBsmtSF': 3, 'GrLivArea': 3, 'BsmtFullBath': 1, 'BedroomAbvGr': 1, 'TotRmsAbvGrd': 1, 'GarageArea': 3}


In [7]:
# # удалим корреляционные элементы

# delete_columns = {'YrSold'}
# variable_values = [v for v in variable_values if v not in delete_columns]
# categorical_values = [v for v in categorical_values if v not in delete_columns]

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # т.к. пропущенных данных не так много, заполним их медианой
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
skewed_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # т.к. пропущенных данных не так много, заполним их медианой
    ('skewer', utils.Log1Transformer()),
    ('scaler', StandardScaler()),
])

# отдельно будем обрабатывать перекошенные по распределнию данные
skewed_columns = list(utils.get_skewed_columns(X_train_df[numeric_columns]))
numeric_columns = [c for c in numeric_columns if c not in skewed_columns]

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, skewed_columns),
    ('categorical', categorical_transformer, categorical_columns),
    # преобразуем колонки со скошенными распределениями через log1p
    ('skewed', skewed_transformer, skewed_columns),
])
preprocessor.fit(X_train_df);

In [9]:
log1_transformer = utils.Log1Transformer()
X_train, y_train = preprocessor.transform(X_train_df), log1_transformer.transform(y_train_df)
X_test, y_test = preprocessor.transform(X_test_df), log1_transformer.transform(y_test_df)

### Пообучаем и найдем наилучшую модель

По заданию должны использовать root-mean-square-error

In [10]:
from sklearn.metrics import make_scorer, mean_squared_error

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


scoring_rmse = make_scorer(rmse)


def print_rmse_for_model(model):
    """
    Напечатаем rmse к нормированной SalePrice и skewed
    """
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    print('log Train RMSE', rmse(y_train, predicted_train))
    print('log Test RMSE', rmse(y_test, predicted_test))
    
    print('Train RMSE', rmse(y_train_df, log1_transformer.re_transform(predicted_train)))
    print('Train RMSE', rmse(y_test_df, log1_transformer.re_transform(predicted_test)))

In [11]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

print_rmse_for_model(regressor)
# print('Train RMSE', rmse(y_train, regressor.predict(X_train)))
# print('Test RMSE', rmse(y_test, regressor.predict(X_test)))

log Train RMSE 0.1687248517733512
log Test RMSE 0.1779041857907157
Train RMSE 30982.480926573742
Train RMSE 32525.840349995247


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

tree_params = {
    'max_depth': [5, 10, 13, 15], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}
tree_grid = GridSearchCV(DecisionTreeRegressor(random_state=17), tree_params, n_jobs=-1, cv=3, verbose=1)
tree_grid.fit(X_train, y_train)

print_rmse_for_model(tree_grid)
print('Best params', tree_grid.best_params_)
print('Best scores', tree_grid.best_score_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


log Train RMSE 0.14389909226125952
log Test RMSE 0.2600561866040972
Train RMSE 25256.635523703317
Train RMSE 45626.74636808797
Best params {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best scores 0.6142093804786899


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    1.9s finished


In [13]:
from sklearn.ensemble import RandomForestRegressor

randoms_trees_params = {
    'n_estimators': [10, 100],
    'max_features': [2, 15, 20],
    'max_depth': [10, 15, 20], 
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 5],
}
random_tree_grid = GridSearchCV(RandomForestRegressor(random_state=17), randoms_trees_params, 
                                n_jobs=-1, cv=3, verbose=1)
random_tree_grid.fit(X_train, y_train)

print('Best params', random_tree_grid.best_params_)
print('Best scores', random_tree_grid.best_score_)
print_rmse_for_model(random_tree_grid)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:   15.6s finished


Best params {'max_depth': 15, 'max_features': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best scores 0.76964185824589
log Train RMSE 0.07463230529033126
log Test RMSE 0.1879651255358773
Train RMSE 13997.036604489698
Train RMSE 35391.53975520931


In [14]:
from xgboost import XGBRegressor

xgb_params = {
    'min_child_weight': [3, 4],
    'n_estimators': [50, 100, 300],
    'learning_rate': [0.1, 0.01],
    'gamma': [0.1, 0, 1],
    'max_depth': [3, 5],
}
xgb_grid = GridSearchCV(XGBRegressor(random_state=17), xgb_params, 
                        cv=3, verbose=1, n_jobs=-1)
xgb_grid.fit(X_train, y_train)

print('Best params', xgb_grid.best_params_)
print('Best scores', xgb_grid.best_score_)
print_rmse_for_model(xgb_grid)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   28.3s


Best params {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100}
Best scores 0.7748619707226465
log Train RMSE 0.12050104861728715
log Test RMSE 0.1940652949180911
Train RMSE 22127.08295987925
Train RMSE 35092.881512650354


[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed:   43.0s finished


In [15]:
# from catboost import CatBoostRegressor

# count_columns = X_train.shape[1]
# cat_features = list(range(len([*numeric_columns, *skewed_columns])+1, count_columns))

# # cat.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False, use_best_model=True)

# cat_params = {
#     'depth': [4, 6, 8],
#     'learning_rate' : [0.01, 0.03, 0.1],
# }
# cat = CatBoostRegressor(iterations=700, cat_features=cat_features, random_seed=17)
# cat_grid = GridSearchCV(cat, cat_params, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
# print(type(X_train.toarray()), type(y_train))
# # X_train is scipy.sparse.csr.csr_matrix
# cat_grid.fit(X_train, y_train, verbose=0)

# print('Best params', cat_grid.best_params_)
# print('Best scores', cat_grid.best_score_)
# print_rmse_for_model(cat_grid)

In [16]:
# from keras.models import Sequential
# from keras.layers import Dense, Dropout

# count_columns = X_train.shape[1]

# keras_seq = Sequential()
# keras_seq.add(Dense(100, input_dim=count_columns, activation='relu'))
# keras_seq.add(Dropout(0.4))
# keras_seq.add(Dense(60, activation='relu'))
# keras_seq.add(Dropout(0.4))
# keras_seq.add(Dense(1))
# keras_seq.compile(loss='mean_squared_error', optimizer='adam')

# keras_seq.fit(X_train, y_train, epochs=100, batch_size=3, verbose=0)

# print_rmse_for_model(keras_seq)

### Отдадим kagglе-у ответ

In [22]:
test_df = pd.read_csv('test.csv', index_col='Id')
X_test = test_df[numeric_columns+skewed_columns+categorical_columns]
X_test = preprocessor.transform(X_test)

# BAD TODODOODODOD

def write_answer(model, file_name):
    predicted_test = model.predict(X_test)
    test_df['SalePrice'] = log1_transformer.re_transform(predicted_test)
    test_df.to_csv(file_name, columns=['SalePrice'], index_label='Id')
    
# write_answer(keras_seq, 'keras_answer.csv')
write_answer(xgb_grid, 'xgb_answer.csv')
write_answer(random_tree_grid, 'random_tree_answer.csv')

In [18]:
!head keras_answer.csv -n 5

Id,SalePrice
1461,22655.904
1462,25319.768
1463,29063.393
1464,31260.012


In [19]:
!head xgb_answer.csv -n 5

Id,SalePrice
1461,123126.6
1462,159749.08
1463,184490.95
1464,186443.83


In [20]:
!head random_tree_answer.csv -n 5

Id,SalePrice
1461,135041.1955175371
1462,178075.66364487834
1463,190699.6935160959
1464,192987.44268104836


In [21]:
!head answer.csv -n 5

Id,SalePrice
1461,127339.375
1462,164522.94
1463,184828.14
1464,182903.52
