# House price prediction

In [1]:
import pandas as pd
import shap
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
import catboost as cat
import numpy as np
from scipy.stats import spearmanr, pearsonr
from sklearn.svm import OneClassSVM
from keybert import KeyBERT
from sklearn.manifold import TSNE
from catboost import CatBoostRegressor
from catboost import Pool, metrics, cv
from sklearn.impute import KNNImputer
from sklearn.cluster import DBSCAN
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tqdm import tqdm
%matplotlib inline

ModuleNotFoundError: ignored

## EDA

In [None]:
df = pd.read_csv("Train.csv")

In [None]:
desc = df['description']

In [None]:
df['amenities']

In [None]:
# Convert currency strings such as "$1,250.00" to floats.
# The pattern is a raw string so the backslash reaches the regex engine intact
# instead of being parsed as an (invalid) Python string escape.
currency_symbols = r'[\$,]'
for money_col in ("price", "cleaning_fee"):
    df[money_col] = df[money_col].replace(currency_symbols, '', regex=True).astype(float)

In [None]:
# Strip the trailing '%' and parse what is left as a number;
# errors='coerce' turns anything unparseable into NaN.
for rate_col in ('host_response_rate', 'host_acceptance_rate'):
    without_percent = df[rate_col].str.rstrip('%')
    df[rate_col] = pd.to_numeric(without_percent, errors='coerce')

In [None]:
# Collapse alternative spellings of the same place into one canonical label.
city_aliases = {'Washington, D.C.': 'Washington',
                'Washington ': 'Washington'}
state_aliases = {'Washington DC': 'DC'}
smart_location_aliases = {'Washington, D.C., DC': 'Washington, DC',
                          'Washington , DC': 'Washington, DC'}

df['city'] = df['city'].replace(city_aliases)
df['state'] = df['state'].replace(state_aliases)
df['smart_location'] = df['smart_location'].replace(smart_location_aliases)

In [None]:
profile = ProfileReport(df, title="Pandas Profiling Report")
profile

## Text processing

In [None]:
cat = df.description.str.cat(sep=' ')

In [None]:
# Extract the 20 top-ranked single-word keywords from the concatenated descriptions.
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(cat, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=20)
# Each result is a (keyword, score) pair; keep only the keyword and leave out
# 'bedrooms'. Filtering (instead of list.remove) does not raise ValueError
# when 'bedrooms' is not among the extracted keywords.
keys = [pair[0] for pair in keywords if pair[0] != 'bedrooms']

In [None]:
len(df.description)

In [None]:
keywords = kw_model.extract_keywords(df.description[110], highlight=True)
keywords

In [None]:
# Extract the top-3 single-word keywords of every listing description.
# `keywords` ends up holding one list per row of `df`, in row order.
keywords = []
for description in tqdm(df.description):
    if not isinstance(description, str):
        # Non-text entries (e.g. NaN for a missing description) have nothing
        # to extract; an empty list keeps `keywords` aligned with the rows.
        keywords.append([])
        continue
    scored = kw_model.extract_keywords(description, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=3)
    keywords.append([pair[0] for pair in scored])
# The accumulated list is deliberately not printed inside the loop: doing so
# re-prints every previous row on each iteration and floods the cell output.

In [None]:
keywords

In [None]:
def flatten(t):
    """Return one flat list holding the items of every sub-iterable of ``t``, in order."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat


In [None]:
key_list = flatten(keywords)

In [None]:
len(key_list)

In [None]:
unique_keys = set(key_list)

In [None]:
len(unique_keys)

In [None]:
# Keyword-presence matrix: one row per description, one 0/1 column per keyword.
# The keywords are sorted into a list so the column order is deterministic and
# so that `columns=` receives a list rather than a set.
key_columns = sorted(unique_keys)
key_rows = []
for text in desc:
    if isinstance(text, str):
        # Substring test: 1 when the keyword occurs anywhere in the description.
        key_rows.append([int(key in text) for key in key_columns])
    else:
        # Non-text entries (e.g. NaN) cannot be searched. The whole row is
        # marked as missing explicitly instead of relying on a swallowed
        # TypeError and pandas padding a short row.
        key_rows.append([np.nan] * len(key_columns))
key_categorical = pd.DataFrame(data=key_rows, columns=key_columns)



In [None]:
k = key_categorical.corrwith(df["price"], method='spearman').sort_values(ascending=False)


In [None]:
from pandas import DataFrame
from IPython.display import HTML
HTML(DataFrame(k).to_html())

In [None]:
# Keep only a hand-picked subset of the keyword indicator columns.
selected_keywords = [
    'sofa', 'housekeeping', 'appliances', 'fireplace',
    'modern', 'luxurious', 'van', 'roof', 'ceilings',
    'shared', 'private', 'bus', 'metro', 'roommates', 'baths',
    'historic', 'balconies', 'grill', 'rooms', 'basement', 'cozy',
    'pool', 'patio',
]
key_categorical = key_categorical.loc[:, selected_keywords]

In [None]:
key_categorical.head(5)

In [None]:
key_categorical.isnull().sum()

In [None]:
df = pd.concat([df, key_categorical], axis=1)

In [None]:
# Drop single-valued and free-text columns.
# Each column is listed exactly once ('host_since' used to appear twice).
cat_list = ['name', 'summary', 'description', 'host_location',
            'host_about', 'host_neighbourhood', 'host_verifications',
            'neighbourhood_cleansed', 'market', 'country_code', 'country',
            'host_since', 'neighborhood_overview', 'transit']
df = df.drop(columns=cat_list)

In [None]:
# Correlation heatmap of the numeric features.
# Non-numeric columns are filtered out explicitly: DataFrame.corr() in
# pandas >= 2.0 no longer drops them silently and raises instead.
# select_dtypes works the same way on older and newer pandas versions.
fig, ax = plt.subplots(figsize=(45, 20))
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), cmap='YlGnBu', annot=True, linewidth=.5, ax=ax)

In [None]:
df.hist(figsize=(20,20), xrot=-45)

In [None]:
sns.violinplot(data=df, x='price')


## Data processing

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
def _parse_amenities(raw):
    """Split a brace-wrapped, comma-separated string into a sorted list of names.

    The first and last characters (the braces) are dropped, the remainder is
    split on commas and double quotes are removed from every item.
    The literal '{}' yields an empty list.
    """
    if raw == '{}':
        return []
    return sorted(item.replace('"', '') for item in raw[1:-1].split(','))


# Replace the raw amenities string of every listing by a list of amenity names.
df['amenities'] = df['amenities'].apply(_parse_amenities)

# All distinct amenity names in alphabetical order. A set comprehension
# replaces the repeated `not in list` scan.
unique_amenities = sorted({amenity for listing in df['amenities'] for amenity in listing})

# One 0/1 indicator column per amenity.
amenities_columns = unique_amenities
amenities_rows = []
for listing in df['amenities']:
    present = set(listing)
    amenities_rows.append([int(amenity in present) for amenity in unique_amenities])

# index=df.index keeps the indicator rows aligned with `df` during concat even
# when `df` does not carry a default 0..n-1 index. The frame only contains
# 0/1 values, so no dropna() is needed.
amenities_categorical = pd.DataFrame(data=amenities_rows, columns=amenities_columns, index=df.index)
df = pd.concat([df, amenities_categorical], axis=1)
df = df.drop('amenities', axis=1)


In [None]:
amenities_categorical

In [None]:
# One-hot encode the remaining non-numeric columns.
# dtype=int pins the indicator columns to 0/1 integers: the default dummy
# dtype differs between pandas versions (bool from 2.0 on), and bool columns
# mixed with float columns turn `df.values` into an object array.
df = pd.get_dummies(df, dtype=int)

In [None]:
# Columns whose missing values are to be imputed (each listed once).
list_of_na = ['host_response_rate', 'host_acceptance_rate', 'review_scores_value',
              'reviews_per_month', 'cleaning_fee', 'bathrooms', 'bedrooms', 'beds',
              'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
              'review_scores_checkin', 'review_scores_communication', 'review_scores_location']

# 'bathrooms' is rounded to the nearest half below, so it must be left out of
# the whole-number rounding; rounding it to an integer first would make the
# half-step rounding a no-op.
whole_number_cols = [col for col in list_of_na if col != 'bathrooms']

# correlation[col][k - 1] is the Spearman correlation between price and `col`
# after imputing the frame with k nearest neighbours, for k = 1..9.
correlation = {col: [] for col in list_of_na}
for n_neighbors in range(1, 10):
    # The imputed frame depends only on k, not on the column being scored,
    # so it is computed once per k and reused for every column.
    imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform', metric='nan_euclidean')
    df_knn = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    df_knn[whole_number_cols] = df_knn[whole_number_cols].round().astype(int)
    df_knn['bathrooms'] = (df_knn['bathrooms'] * 2).round() / 2

    for col in list_of_na:
        correlation[col].append(spearmanr(df_knn['price'], df_knn[col]).correlation)


In [None]:
# One panel per imputed column: correlation with price as a function of k.
fig, axs = plt.subplots(4, 4, figsize=(15, 15))
axs = axs.ravel()
# Iterate over the dict itself instead of a hard-coded panel count, so the
# plot follows whatever columns `correlation` contains.
for ax, (col_name, corr_by_k) in zip(axs, correlation.items()):
    ax.plot(range(1, len(corr_by_k) + 1), corr_by_k, color='red', linestyle='dashed', marker='x',
            markerfacecolor='green', markersize=7)
    ax.set_title(col_name)
# Hide the grid cells that have no column to show.
for unused_ax in axs[len(correlation):]:
    unused_ax.set_visible(False)


In [None]:
# Fill the missing values with a 7-nearest-neighbour imputer.
#
# The columns are imputed together as one matrix. KNNImputer measures the
# distance between rows on the features that are present in both rows; when it
# is fitted on a single column, a row with a missing value has no other
# feature to compare on, no neighbour can be found, and the imputer falls back
# to the plain column mean. Passing all columns at once lets each column be
# imputed from rows that are close on the remaining ones.
impute = KNNImputer(n_neighbors=7)
knn_impute_cols = ['host_response_rate', 'host_acceptance_rate', 'review_scores_value',
                   'reviews_per_month', 'cleaning_fee', 'bathrooms', 'bedrooms', 'beds',
                   'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                   'review_scores_checkin', 'review_scores_communication', 'review_scores_location']
df[knn_impute_cols] = impute.fit_transform(df[knn_impute_cols])

In [None]:
df.isnull().sum()[0:50]

In [None]:
df = df.dropna()

In [None]:
df.info()

### Outlier detection

In [None]:
# Sorted log-prices plotted against their rank.
log_prices_sorted = np.log(np.sort(df['price'].values))
plt.scatter(range(df.shape[0]), log_prices_sorted)
plt.xlabel('index')
# The plotted quantity is the logarithm of the price, so label it as such.
plt.ylabel('log(price)')
plt.title("Price distribution")
sns.despine()

In [None]:
# Histogram with a kernel-density overlay of the log-price.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement named in the seaborn documentation for the same kind of figure.
sns.histplot(np.log(df['price']), kde=True, stat='density')
plt.title("Distribution")
sns.despine()


In [None]:
# Skewness and kurtosis of the log-price distribution shown in the plots above.
# The logarithm is applied to the prices, not to the resulting statistic:
# the log of a skewness/kurtosis value has no meaning and is NaN whenever the
# statistic is negative.
log_price = np.log(df['price'])
print("Skewness: %f" % log_price.skew())
print("Kurtosis: %f" % log_price.kurt())

In [None]:
# Flag a column as "likely categorical" when its number of distinct values is
# less than 5% of its non-null count (or some other threshold).
likely_cat = {
    column: df[column].nunique() / df[column].count() < 0.05
    for column in df.columns
}
likely_cat

In [None]:
# Every column of `df` except the three listed below.
excluded_columns = ['number_of_reviews', 'reviews_per_month', 'cleaning_fee']
categ = df.loc[:, ~df.columns.isin(excluded_columns)]


In [None]:
# Spearman correlation of every `categ` column with price, strongest first.
price_corr = categ.corrwith(df["price"], method='spearman').sort_values(ascending=False)
# Entries 2-10 from the top (the first entry is skipped; presumably it is
# 'price' correlating with itself).
print(price_corr[1:10])
# The nine entries just before the last one at the bottom of the ranking.
print(price_corr[-10:-1])


In [None]:
df[['number_of_reviews', 'reviews_per_month', 'cleaning_fee']].corrwith(df["price"], method='pearson').sort_values(ascending=False)

In [None]:
# Fit an isolation forest on the price column alone and plot its anomaly
# score over an evenly spaced price grid.
# random_state is fixed so the trees, and therefore the shaded outlier
# region, are the same on every run.
isolation_forest = IsolationForest(n_estimators=100, random_state=0)
isolation_forest.fit(df['price'].values.reshape(-1, 1))

# Evaluation grid: len(df) evenly spaced prices between the observed min and max.
xx = np.linspace(df['price'].min(), df['price'].max(), len(df)).reshape(-1, 1)
anomaly_score = isolation_forest.decision_function(xx)
outlier = isolation_forest.predict(xx)  # -1 marks grid points classified as outliers

plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.fill_between(xx.T[0], np.min(anomaly_score), np.max(anomaly_score),
                 where=outlier == -1, color='r',
                 alpha=.4, label='outlier region')
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('price')
plt.show()

In [None]:
# Scatter plot with a fitted regression line of each feature against price.
# One loop replaces six copy-pasted cells; every feature still gets its own figure.
for feature in ["bedrooms", "accommodates", "bathrooms", "beds", "cleaning_fee", "guests_included"]:
    fig, ax = plt.subplots()
    sns.regplot(x="price", y=feature, data=df, ax=ax)
    sns.despine(fig=fig)
    plt.show()

In [None]:
# Box plot of each column inspected for outliers.
# One loop replaces four copy-pasted cells; every column still gets its own figure.
for boxplot_column in ['price', 'cleaning_fee', 'beds', 'bathrooms']:
    fig, ax = plt.subplots()
    df.boxplot(column=[boxplot_column], rot=45, ax=ax)
    plt.show()

In [None]:
df = df.dropna()

In [None]:
X = df.values
db = DBSCAN(eps=250, min_samples=9, metric='euclidean')
y_db = db.fit_predict(X)

In [None]:
outlier_index = np.where(y_db == -1)
outlier_values = df.iloc[outlier_index]
outlier_values.shape

In [None]:
# 2-D t-SNE embedding of the data, coloured by DBSCAN cluster label.
# t-SNE is stochastic; a fixed random_state makes the picture reproducible.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(df)

plt.scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], c=y_db, cmap="plasma")
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

In [None]:
model = OneClassSVM(kernel = 'rbf', gamma = 0.00001, nu = 0.0001).fit(X)
y_pred = model.predict(X)

In [None]:
# Same 2-D t-SNE embedding as in the DBSCAN plot, now coloured by the
# One-Class SVM prediction.
# The existing `X_embedded` is reused: `df` has not changed since it was
# computed, and a fresh unseeded t-SNE run would place the points differently,
# making the two scatter plots impossible to compare (and doubling the cost).
plt.scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], c=y_pred, cmap="plasma")
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

In [None]:
outlier_index = np.where(y_pred == -1)
outlier_values = df.iloc[outlier_index]
outlier_values.shape

In [None]:
# Keep only the rows that lie strictly below every upper bound.
upper_bounds = {
    'price': 300,
    'bedrooms': 6,
    'bathrooms': 5,
    'beds': 8,
    'guests_included': 12,
    'cleaning_fee': 160,
}
within_bounds = np.logical_and.reduce(
    [(df[column] < bound).to_numpy() for column, bound in upper_bounds.items()]
)
df = df.loc[within_bounds]
df.shape

## Model training

In [None]:
# Hold out the first 300 rows (by position, no shuffling) as a validation set.
df1 = df.iloc[:300]
X_val = df1.drop('price', axis=1)  # validation features, kept as a DataFrame
y_val = df1['price']               # validation target
# Everything after the first 300 rows feeds the train/test split below.
# NOTE(review): `df` is rebound here, so re-running this cell removes another
# 300 rows each time — run it exactly once per kernel session.
df = df.iloc[300:]

In [None]:
# Feature matrix: every column except the target.
X = df.iloc[:, df.columns != 'price'].values
# Target as a 1-D vector of shape (n_samples,). Selecting the column through a
# boolean mask produced an (n_samples, 1) column vector, which scikit-learn
# regressors accept only with a DataConversionWarning.
y = df['price'].values

In [None]:
X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Hyper-parameter grid for CatBoost: 10 depths x 4 iteration counts x
# 6 learning rates x 5 L2 strengths = 1200 combinations.
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
          'l2_leaf_reg':[3,1,5,10,100]}


# Exhaustive search with 2-fold cross-validation (2400 fits in total),
# run in parallel on all available cores (n_jobs=-1).
# No `scoring` is passed, so GridSearchCV ranks candidates by the estimator's
# own default score.
model = CatBoostRegressor()
grid = GridSearchCV(estimator=model, param_grid = params, cv = 2, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
cat = CatBoostRegressor(iterations=100,
                          learning_rate=0.001,
                          depth=10,
                          l2_leaf_reg=100)
cat.fit(X_train, y_train)

In [None]:
preds = cat.predict(X_test)
print(np.sqrt(mean_squared_error(preds, y_test)))
print(mean_absolute_error(preds, y_test))

In [None]:
train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_test, y_test)

In [None]:
feature_importances = cat.get_feature_importance(train_pool)
feature_names = X_val.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))


In [None]:
explainer = shap.Explainer(cat.predict, X_test)
shap_values = explainer(X_test)

In [None]:
# The SHAP values were computed on X_test, so they are plotted against X_test.
# X_test is a bare NumPy array; the column names are taken from X_val, which
# has the same feature columns in the same order.
shap.summary_plot(shap_values, X_test, feature_names=list(X_val.columns), plot_type="bar")

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Shallow random forest trained on the log of the price.
regr = RandomForestRegressor(max_depth=2, random_state=0)
# ravel() hands the target over as a 1-D vector whether y_train is 1-D or an
# (n_samples, 1) column, avoiding scikit-learn's column-vector warning.
regr.fit(X_train, np.log(y_train).ravel())

In [None]:
preds = np.exp(regr.predict(X_test))
print(np.sqrt(mean_squared_error(preds, y_test)))
print(mean_absolute_error(preds, y_test))

In [None]:
# SHAP values of the random forest on the test set.
explainer = shap.TreeExplainer(regr)
shap_values = explainer.shap_values(X_test)
# Plot against the data the values were computed on (X_test); X_val only
# supplies the column names, since X_test is a bare NumPy array.
shap.summary_plot(shap_values, X_test, feature_names=list(X_val.columns), plot_type="bar")


In [None]:
# Hyper-parameter grid for XGBoost.
# `subsample` is the fraction of training rows sampled per tree and must lie
# in (0, 1]; the former value 100 was outside that range, so every candidate
# using it failed to fit. 1.0 (use all rows) takes its place.
params = {'max_depth': [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
          'n_estimators': [250, 100, 500, 1000],
          'eta': [0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
          'subsample': [0.1, 0.3, 0.5, 1.0]}


# Exhaustive search with 2-fold cross-validation on all available cores.
model = XGBRegressor()
grid = GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
xgb = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
xgb.fit(X_train, y_train)



In [None]:
preds = xgb.predict(X_test)
print(np.sqrt(mean_squared_error(preds, y_test)))
print(mean_absolute_error(preds, y_test))

In [None]:
# SHAP values of the XGBoost model on the test set.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)
# Plot against the data the values were computed on (X_test); X_val only
# supplies the column names, since X_test is a bare NumPy array.
shap.summary_plot(shap_values, X_test, feature_names=list(X_val.columns), plot_type="bar")

In [None]:
lgbm = LGBMRegressor(random_state=37)
lgbm.fit(X_train, y_train)

In [None]:
preds = lgbm.predict(X_test)
print(np.sqrt(mean_squared_error(preds, y_test)))
print(mean_absolute_error(preds, y_test))

In [None]:
# SHAP values of the LightGBM model on the test set.
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_test)
# Plot against the data the values were computed on (X_test); X_val only
# supplies the column names, since X_test is a bare NumPy array.
shap.summary_plot(shap_values, X_test, feature_names=list(X_val.columns), plot_type="bar")

In [None]:
from sklearn.ensemble import StackingRegressor

# Stack the four base regressors; an XGBoost model combines their predictions.
# NOTE(review): per the scikit-learn docs StackingRegressor fits clones of the
# estimators passed in, so the random forest inside the stack is trained on
# the raw price, not on the log-price used for the stand-alone `regr`.
ensemble_model = StackingRegressor([
    ("catboost", cat),
    ("random forest", regr),
    ('xgboost', xgb),
    ('lightgbm', lgbm)

], final_estimator=XGBRegressor())
# np.ravel passes the target as a 1-D vector whether y_train is 1-D or an
# (n_samples, 1) column, avoiding scikit-learn's column-vector warning.
ensemble_model.fit(X=X_train, y=np.ravel(y_train))
preds = ensemble_model.predict(X_test)
# Metric arguments in scikit-learn's documented (y_true, y_pred) order.
print(np.sqrt(mean_squared_error(y_test, preds)))
print(mean_absolute_error(y_test, preds))

In [None]:
preds = lgbm.predict(X_val)
print(np.sqrt(mean_squared_error(preds, y_val)))
print(mean_absolute_error(preds, y_val))

#### Results board

| Model         | MAE| RMSE| Description                              |
| ------------- | -- | --- | ---------------------------------------- |
| CatBoost      | 66 | 109 | with auto categorical features           |
| CatBoost      | 77 | 150 | with minimal preprocessing               |
| Random Forest | 57 | 130 | with minimal preprocessing               |
| CatBoost      | 44 | 57  | with deleted outliers and LabelBinarizer |
| Random Forest | 33 | 44  | with deleted outliers and LabelBinarizer |
| CatBoost      | 44 | 57  | with deleted outliers and get dummies    |
| Random Forest | 32 | 44  | with deleted outliers and get dummies    |
| Random Forest | 31 | 44  | with deleted outliers and get dummies and text columns|
| XGBoost | 25 | 36| with deleted outliers and get dummies and text columns|
