In [1]:
from typing import Union, List

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils

# Seaborn dark-grid theme for every figure drawn in this notebook.
sns.set(style="darkgrid")
# Render figures inline. `%matplotlib inline` is used instead of `%pylab inline`:
# the latter star-imports numpy and matplotlib.pylab into the interactive
# namespace, while this notebook already imports `np` and `plt` explicitly.
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the training set keyed by 'Id' and shuffle the rows (sample(frac=1)).
# The shuffle is seeded with 17 -- the seed the models further down use -- so
# the row order is the same on every run.
df = pd.read_csv('train.csv', index_col='Id').sample(frac=1, random_state=17)
target_column = 'SalePrice'
df.shape

(1460, 80)

In [None]:
def delete_abroad_elements(df, columns):
    """Return ``df`` without the rows flagged in any of ``columns``.

    Each column is processed in turn with the same steps this notebook applies
    to the target column: ``utils.series_to_float`` followed by
    ``utils.get_abroad_values``, after which the flagged rows are dropped.
    NOTE(review): ``utils`` is a project module; this assumes
    ``get_abroad_values`` returns a boolean mask aligned with its input -- confirm.

    Args:
        df: source DataFrame; the caller's object is not modified.
        columns: one column name or an iterable of column names to check.

    Returns:
        A DataFrame with the flagged rows removed.
    """
    if isinstance(columns, str):
        columns = [columns]
    for column in columns:
        # The mask is recomputed on the already reduced frame, so it always
        # lines up with the rows that are still present.
        is_abroad = utils.get_abroad_values(utils.series_to_float(df[column]))
        df = df.drop(df[is_abroad].index, axis=0)
    return df

In [15]:
# Drop the rows whose sale price is flagged by utils.get_abroad_values
# (presumably outliers -- confirm against utils).
sale_price = utils.series_to_float(df[target_column])
abroad_sales = utils.get_abroad_values(sale_price)
df = df.drop(index=df.loc[abroad_sales].index)

# Repeat the check on the reduced frame and show the prices still flagged.
sale_price = utils.series_to_float(df[target_column])
abroad_sales = utils.get_abroad_values(sale_price)
df.loc[abroad_sales, target_column]

Id
1244    465000
Name: SalePrice, dtype: int64

In [None]:
# Keep only rows whose price lies strictly between the 10% and 99% quantiles.
# Both cut-offs are computed from the prices before any row is removed.
# NOTE(review): the lower cut-off (0.1) removes far more rows than the upper
# one (0.99) -- confirm that 0.01 was not intended.
prices = df[target_column]
bottom_quantile, top_quantile = prices.quantile(0.1), prices.quantile(0.99)
df = df[(prices < top_quantile) & (prices > bottom_quantile)]  # optional: may improve model quality

In [None]:
# Columns handled as categorical features (one-hot encoded by the
# preprocessing pipeline further down).
categorical_values = [
    'MSZoning', 'LotShape', 'BldgType', 'HouseStyle',
    'MasVnrType', 'ExterQual', 'Foundation', 'BsmtQual',
    'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'KitchenQual',
    'GarageType', 'GarageFinish', 'YrSold', 'SaleType',
]

# Columns handled as numeric features (imputed and scaled further down).
variable_values = [
    'MSSubClass', 'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'GrLivArea', 'BsmtFullBath',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'MoSold',
]

# Working frame limited to the selected features plus the target column.
base_df = df[[*variable_values, *categorical_values, target_column]]

In [None]:
# Remove correlated features from both feature lists (order is preserved).

delete_columns = {'YrSold'}
variable_values = list(filter(lambda name: name not in delete_columns, variable_values))
categorical_values = list(filter(lambda name: name not in delete_columns, categorical_values))

In [None]:
# Flag GrLivArea values with utils.get_abroad_values and display the ids of
# the flagged rows.
float_area = utils.series_to_float(df['GrLivArea'])
is_abroad_area = utils.get_abroad_values(float_area)

df.loc[is_abroad_area].index

In [None]:
# GrLivArea as a float column vector with one row per record.
df['GrLivArea'].map(float).to_numpy().reshape(-1, 1)

In [None]:
# Outliers are worth removing: list the GrLivArea values that lie more than
# `tresh_hold` standard deviations from the column mean.
from sklearn.preprocessing import StandardScaler

# StandardScaler returns an (n, 1) array for the single column; keep it 1-D.
scaled_area = StandardScaler().fit_transform(df[['GrLivArea']])[:, 0]
tresh_hold = 4
# |z| > threshold covers both the upper and the lower tail.
outer_areas_indexes = np.abs(scaled_area) > tresh_hold
print(df.loc[outer_areas_indexes, 'GrLivArea'])


In [None]:
# Scratch check: GrLivArea cast to float and viewed as a numpy matrix.
a = df['GrLivArea'].map(float)
np.matrix(a.to_numpy())

In [None]:
# NOTE(review): leftover inspection -- this only displays the bound `apply`
# method of the one-column frame; the method is never called.
df[['GrLivArea']].apply

In [None]:
# NOTE(review): leftover IPython help lookup for `df.loc`; it does not affect
# any variable used by the analysis.
df.loc?

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation. random_state is pinned (17, the seed
# the models further down use) so the same row order always yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df[variable_values + categorical_values], df[target_column],
    test_size=0.15, random_state=17,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own pipeline; fit on the training part only.
column_routes = [
    ('numeric', numeric_transformer, variable_values),
    ('categorical', categorical_transformer, categorical_values),
]
preprocessor = ColumnTransformer(transformers=column_routes)
preprocessor.fit(X_train);

In [None]:
# Replace both feature frames with their preprocessed matrices.
# NOTE: X_train / X_test are rebound here, so re-running this cell would feed
# already transformed data back into the preprocessor.
X_train, X_test = (preprocessor.transform(features) for features in (X_train, X_test))

### Пообучаем и найдем наилучшую модель

По условию задания качество модели оценивается метрикой RMSE (root-mean-square error).

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error


def rmse(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    Args:
        y_true: observed target values.
        y_pred: predicted target values.

    Returns:
        The square root of sklearn's ``mean_squared_error`` for the two inputs.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE is an error, so lower is better. greater_is_better=False makes the
# scorer return the negated RMSE; scikit-learn's model selection maximises the
# score, so without it a search would select the candidate with the HIGHEST error.
scoring_rmse = make_scorer(rmse, greater_is_better=False)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the preprocessed features.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

train_rmse = rmse(y_train, regressor.predict(X_train))
test_rmse = rmse(y_test, regressor.predict(X_test))
print('Train RMSE', train_rmse)
print('Test RMSE', test_rmse)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Hyper-parameter grid for a single decision tree.
tree_params = {
    'max_depth': [5, 10, 13, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
}
# Candidates are ranked by cross-validated MSE (the criterion the CatBoost
# search in this notebook uses) rather than the estimator's default R^2, so
# model selection follows the error metric the task asks for.
tree_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=17), tree_params,
    scoring='neg_mean_squared_error', n_jobs=-1, cv=3, verbose=1,
)
tree_grid.fit(X_train, y_train)

print('Train RMSE', rmse(y_train, tree_grid.predict(X_train)))
print('Test RMSE', rmse(y_test, tree_grid.predict(X_test)))
print('Best params', tree_grid.best_params_)
# best_score_ is the negated mean cross-validated MSE of the best candidate.
print('Best scores', tree_grid.best_score_)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid for the random forest.
randoms_trees_params = {
    'n_estimators': [10, 100],
    'max_features': [2, 15, 20],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 5],
}
# Candidates are ranked by cross-validated MSE (the criterion the CatBoost
# search in this notebook uses) rather than the estimator's default R^2, so
# model selection follows the error metric the task asks for.
random_tree_grid = GridSearchCV(
    RandomForestRegressor(random_state=17), randoms_trees_params,
    scoring='neg_mean_squared_error', n_jobs=-1, cv=3, verbose=1,
)
random_tree_grid.fit(X_train, y_train)

print('Train RMSE', rmse(y_train, random_tree_grid.predict(X_train)))
print('Test RMSE', rmse(y_test, random_tree_grid.predict(X_test)))
print('Best params', random_tree_grid.best_params_)
# best_score_ is the negated mean cross-validated MSE of the best candidate.
print('Best scores', random_tree_grid.best_score_)

In [None]:
from xgboost import XGBRegressor

# Hyper-parameter grid for gradient-boosted trees.
xgb_params = {
    'min_child_weight': [3, 4],
    'n_estimators': [100, 150, 500, 1000],
    'learning_rate': [0.1, 0.01],
    'gamma': [0.1, 0, 1],
    'max_depth': [3, 5],
}
# GridSearchCV has no `early_stopping_rounds` parameter -- passing it raised a
# TypeError before any model was fitted -- so it is not passed here.
# Candidates are ranked by cross-validated MSE, as in the CatBoost search.
xgb_grid = GridSearchCV(
    XGBRegressor(random_state=17), xgb_params,
    scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

print('Train RMSE', rmse(y_train, xgb_grid.predict(X_train)))
print('Test RMSE', rmse(y_test, xgb_grid.predict(X_test)))
print('Best params', xgb_grid.best_params_)
# best_score_ is the negated mean cross-validated MSE of the best candidate.
print('Best scores', xgb_grid.best_score_)

In [None]:
from catboost import CatBoostRegressor

# X_train is the output of `preprocessor`: scaled numeric columns followed by
# one-hot indicator columns, i.e. every column is already a float. No
# `cat_features` are declared: CatBoost accepts only integer or string values
# in categorical columns, and the previous index range also started one column
# past the end of the numeric block.
cat_params = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.03, 0.1],
}
cat = CatBoostRegressor(iterations=700, random_seed=17)
cat_grid = GridSearchCV(cat, cat_params, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
# verbose=0 is forwarded to CatBoost's fit to silence per-iteration logging.
cat_grid.fit(X_train, y_train, verbose=0)

print('Train RMSE', rmse(y_train, cat_grid.predict(X_train)))
print('Test RMSE', rmse(y_test, cat_grid.predict(X_test)))
print('Best params', cat_grid.best_params_)
# best_score_ is the negated mean cross-validated MSE of the best candidate.
print('Best scores', cat_grid.best_score_)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

count_columns = X_train.shape[1]

# Two hidden ReLU layers (50 and 60 units), each followed by a 0.4 dropout,
# and a single linear output unit for the predicted price.
keras_seq = Sequential([
    Dense(50, input_dim=count_columns, kernel_initializer='normal', activation='relu'),
    Dropout(0.4),
    Dense(60, kernel_initializer='normal', activation='relu'),
    Dropout(0.4),
    Dense(1, kernel_initializer='normal'),
])
keras_seq.compile(optimizer='adam', loss='mean_squared_error')

keras_seq.fit(X_train, y_train, epochs=100, batch_size=5, verbose=0)

train_rmse = rmse(y_train, keras_seq.predict(X_train))
print('Train RMSE', train_rmse)
test_rmse = rmse(y_test, keras_seq.predict(X_test))
print('Test RMSE', test_rmse)

### Отправим ответ на Kaggle

In [None]:
# Build the Kaggle submission: preprocess the competition test set with the
# already fitted `preprocessor` and predict with the tuned XGBoost search.
test_df = pd.read_csv('test.csv', index_col='Id')
# A separate name is used on purpose: rebinding X_test would replace the
# hold-out split that the evaluation cells above read.
X_submission = preprocessor.transform(test_df[variable_values + categorical_values])

# Alternative model: predicted_test = random_tree_grid.predict(X_submission)
predicted_test = xgb_grid.predict(X_submission)
test_df['SalePrice'] = predicted_test
# Only the Id index and the predicted SalePrice column are written.
test_df.to_csv('answer.csv', columns=['SalePrice'], index_label='Id')

In [None]:
# Preview the first lines of the submission file. The option comes before the
# file operand because not every `head` implementation accepts options after it.
!head -n 5 answer.csv