In [None]:
import matplotlib.pyplot as plt
import joblib
import sys
import plotly.express as px
import pandas as pd
from pandarallel import pandarallel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.tools import JsonHandler, concatenate_listings_datasets, return_cleaned_col_names, preprocess_text
from src.class_transformers import (
    GeographicTransformer,
    BathroomsTransformer,
    CreateVerificationsTransformer,
    AmenitiesTransformer,
    OfflineLocationFinder,
    PropertyTypeTransformer,
    HostLocationImputer,
    ScrapingDateTransformer,
    ColumnDropperTransformer
)
from src.function_transformers import (
    fun_tr_id_to_string,
    fun_tr_from_string_to_rate,
    fun_tr_transform_to_datetime,
    fun_tr_remove_dollar_sign,
)
from sklearn.utils import estimator_html_repr
from sklearn import set_config
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from textblob import TextBlob
# Start pandarallel; the description cells below rely on `Series.parallel_apply`.
pandarallel.initialize()
# Render floats without decimals in pandas output (display only, values are unchanged).
pd.options.display.float_format = "{:.0f}".format
# JsonHandler comes from src.tools; `handler` is not used in the visible part of this notebook.
handler = JsonHandler()
import pandas as pd
import numpy as np
import re
from feature_engine.datetime import DatetimeSubtraction
from feature_engine.creation import RelativeFeatures
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder, OrdinalEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, MaxAbsScaler, PolynomialFeatures, PowerTransformer, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, make_scorer, mean_absolute_percentage_error, median_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import QuantileRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
import sys
from sklearn.neural_network import MLPRegressor



In [None]:
# Load the dataset from CSV. No `parse_dates` is passed, so date columns
# (e.g. `last_review`, `host_since`) are read as plain strings.
df = pd.read_csv("data/city_data/step_by_step.csv")

## Feature separation

In [None]:
# First inventory of the numeric columns (the target `price` included).
# It only feeds the first `other_features` computation; a shorter list with the
# same name replaces it in the "Drop highly correlates" section.
numerical_features = [
    "host_response_rate",
    "host_acceptance_rate",
    "price",
    "host_location",
    "host_listings_count",
    "host_total_listings_count",
    "latitude",
    "longitude",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "minimum_nights",
    "maximum_nights",
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "reviews_per_month",
    "listing_city_pop",
]

In [None]:
# Host / verification / amenity flag columns. They are excluded from
# `other_features`, so they stay in the modelling frame.
binary_features = [
    "host_has_profile_pic",
    "host_identity_verified",
    "host_is_superhost",
    "email_verification",
    "phone_verification",
    "work_email_verification",
    "amenities_internet",
    "amenities_self-checkin",
    "amenities_host-greeting",
    "amenities_pool",
    "amenities_oven",
    "amenities_microwave",
    "amenities_garden",
    "amenities_streaming",
    "amenities_gym",
    "amenities_elevator",
    "amenities_heating",
    "amenities_air-conditioning",
    "amenities_workspace",
    "amenities_freezer",
    "amenities_first-aid-kit",
    "amenities_dishwasher",
    "amenities_long-term-stays",
    "amenities_pets-allowed",
    "amenities_bathtube",
    "amenities_bbq-grill",
    "amenities_lake-bay-view",
]

## Description processing

In [None]:
def preprocess_text(text):
    """Normalise a free-text description before word counting / sentiment scoring.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed (punctuation, digits, accented letters, symbols),
    and scikit-learn's English stop words are dropped.

    NOTE: this definition shadows the ``preprocess_text`` imported from
    ``src.tools`` at the top of the notebook.

    Parameters
    ----------
    text : object
        Raw description. Non-string values are converted with ``str``;
        missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty text.

    Returns
    -------
    str
        Remaining tokens joined by single spaces; ``""`` when nothing is left.
    """
    # A missing description must not become the literal token "nan": it would
    # otherwise be counted as one word and scored for sentiment downstream.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    text = str(text).lower()
    # Keep only lowercase ASCII letters and whitespace
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stop words
    return ' '.join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)

# Clean every description in parallel; the raw `description` column is overwritten.
df['description'] = df['description'].parallel_apply(preprocess_text)

In [None]:
# Number of whitespace-separated tokens left in each cleaned description.
df['description_word_count'] = df['description'].parallel_apply(
    lambda cleaned: len(cleaned.split())
)

In [None]:
# TextBlob sentiment of each cleaned description, stored as two separate columns.
df['description_sentiment_polarity'] = df['description'].parallel_apply(
    lambda cleaned: TextBlob(cleaned).sentiment.polarity
)
df['description_sentiment_subjectivity'] = df['description'].parallel_apply(
    lambda cleaned: TextBlob(cleaned).sentiment.subjectivity
)

In [None]:
# Columns derived from the listing description in the cells above.
description_feature = [
    "description_word_count",
    "description_sentiment_polarity",
    "description_sentiment_subjectivity",
]

# Every column that is not numeric, binary or description-derived.
# This value is recomputed after the numeric list is shortened further down.
other_features = list(
    set(df.columns).difference(numerical_features, binary_features, description_feature)
)


### Create some features

In [None]:
# Beds per bedroom; listings with zero bedrooms get 0 instead of the inf/NaN ratio.
df["beds_for_bedroom"] = df["beds"].div(df["bedrooms"]).where(df["bedrooms"].ne(0), 0)

### Drop highly correlated features

In [None]:
# Reduced numeric feature list: adds the engineered `beds_for_bedroom` and leaves
# out `host_total_listings_count`, `bedrooms`, `beds`, `review_scores_accuracy`,
# `review_scores_cleanliness` and `review_scores_communication`.
# Columns missing from every kept list end up in `other_features` and are dropped.
numerical_features = [
    "beds_for_bedroom",
    "host_response_rate",
    "host_acceptance_rate",
    "price",
    "host_location",
    "host_listings_count",
    "latitude",
    "longitude",
    "accommodates",
    "bathrooms",
    "minimum_nights",
    "maximum_nights",
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_checkin",
    "review_scores_location",
    "review_scores_value",
    "reviews_per_month",
    "listing_city_pop",
]

In [None]:
# Pairwise correlation of the retained numeric features, shown as an annotated heatmap.
df_corr = df.copy()
corr_full = df_corr.loc[:, numerical_features].corr()

fig = px.imshow(corr_full, text_auto=True, aspect="auto", width=900, height=900)
fig.show()

In [None]:
# Recompute the leftover columns against the shortened numeric list.
other_features = list(
    set(df.columns).difference(numerical_features, binary_features, description_feature)
)

## Managing Other Features

In [None]:
# Categorical columns that the pipeline imputes and one-hot encodes.
ohe_features = [
    "property_type", "df_city_location", "room_type",
    "host_response_time", "bathrooms_text",
]

In [None]:
# Use the most recent review date as a proxy for the scraping date.
# The builtin `max()` compares elements pairwise and raises TypeError as soon as
# a missing value (float NaN) meets a date string, so missing values are dropped
# first and the pandas reduction is used instead.
# NOTE(review): assumes `last_review` holds ISO-8601 (YYYY-MM-DD) strings, so the
# lexicographic maximum is also the latest date — confirm against the CSV.
df["scraping_date"] = df["last_review"].dropna().max()

In [None]:
# Date columns consumed by the DatetimeSubtraction steps and dropped afterwards.
date_features = ["host_since", "last_review", "first_review", "scraping_date"]

In [None]:
# Host identifier column, handled by the HID_* pipeline steps.
host_id_feature = ["host_id"]

In [None]:
# Keep the one-hot, date and host-id columns: whatever is still listed here gets dropped.
other_features = list(
    set(other_features).difference(ohe_features, date_features, host_id_feature)
)

## Prediction

In [None]:
# Modelling frame: the dataset without the columns no feature group claimed.
testing = df.drop(columns=other_features)

In [None]:
# Map the "t"/"f" string flags to 1/0 wherever those exact values occur in the frame.
testing = testing.replace({"f": 0, "t": 1})

In [None]:
# Renumber the rows 0..n-1. `drop=True` discards the old index instead of inserting
# it as an extra "index" column, which would otherwise reach the model as a feature.
# `errors="ignore"` keeps the cell runnable when "id" is already gone: "id" is in
# none of the kept feature lists, so it is removed together with `other_features`.
testing = testing.reset_index(drop=True).drop(columns=["id"], errors="ignore")

In [None]:
# Keep only rows without any missing value.
testing = testing.dropna()

In [None]:
# Discard listings priced at 1000 or more.
testing = testing[testing["price"].lt(1000)]

In [None]:
# Reproducible 80/20 hold-out split of the modelling frame.
train_set, test_set = train_test_split(
    testing,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Distribution of the target in the training split.
fig = px.histogram(train_set, x="price")
fig.show()

In [None]:
# Separate the predictors from the `price` target for both splits.
X_train, y_train = train_set.drop(columns=["price"]), train_set["price"]
X_test, y_test = test_set.drop(columns=["price"]), test_set["price"]

In [None]:
# Full preprocessing + model pipeline:
#   categorical imputation / one-hot encoding -> date-difference features ->
#   host-id count encoding -> drop raw date columns -> scaling -> LinearSVR.
# NOTE: rows with missing values are removed by `testing.dropna()` before the
# split, so the imputation steps only matter for data that skips that cleaning.
wizard_pipe = Pipeline(
    steps=[
        # Fill missing categories in `ohe_features` with the most frequent value.
        (
            "OHE_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=ohe_features,
                return_object=True,
                ignore_format=False,
            ),
        ),
        # One-hot encode `ohe_features`, dropping the last dummy of each variable.
        (
            "OHE_encoding",
            OneHotEncoder(
                top_categories=None,
                drop_last=True,
                drop_last_binary=True,
                ignore_format=False,
                variables=ohe_features,
            ),
        ),
        # Review Dates (RD): `days_active_reviews` from `last_review` and
        # `first_review`, expressed in days.
        (
            "RD_engineering",
            DatetimeSubtraction(
                variables="last_review",
                reference="first_review",
                output_unit="D",
                drop_original=False,
                new_variables_names=["days_active_reviews"],
                missing_values="ignore",
            ),
        ),
        (
            "RD_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=["days_active_reviews"]
            ),
        ),
        # Host since (HS): `host_since_days` from `scraping_date` and
        # `host_since`, expressed in days.
        (
            "HS_engineering",
            DatetimeSubtraction(
                variables=["scraping_date"],
                reference=["host_since"],
                output_unit="D",
                drop_original=False,
                new_variables_names=["host_since_days"],
                missing_values="ignore",
            ),
        ),
        (
            "HS_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=["host_since_days"]
            ),
        ),
        # Host ID (HID): fill missing ids with the "MISSING" label, then count-encode.
        (
            "HID_imputation",
            CategoricalImputer(
                imputation_method="missing",
                variables=host_id_feature,
                fill_value="MISSING",
            ),
        ),
        # NOTE(review): no `variables` argument is passed here, so the encoder
        # presumably targets every categorical column it finds rather than only
        # `host_id` — confirm that is intended.
        (
            "HID_encoding",
            CountFrequencyEncoder(
                encoding_method="count", missing_values="ignore", unseen="encode"
            ),
        ),
        # Remove the raw date columns once the day-difference features exist.
        (
            "Drop columns",
            ColumnDropperTransformer(
                columns=[
                    "last_review",
                    "first_review",
                    "scraping_date",
                    "host_since"
                ]
            )
        ),
        (
            "Standardize",
            StandardScaler(),
        ),
        # Hyper-parameters of the model are set through `param_grid` (`Model__*`).
        (
            "Model",
            LinearSVR()
        ),
    ],
    verbose=True
)


In [None]:
# Single-candidate grid: every LinearSVR hyper-parameter is pinned to one value.
param_grid = dict(
    Model__epsilon=[0],
    Model__tol=[1e-4],
    Model__C=[1],
    Model__loss=["epsilon_insensitive"],
    Model__fit_intercept=[True],
    Model__verbose=[2],
    Model__random_state=[874631],
    Model__max_iter=[1000],
)

# Metrics reported for each CV fold; the error metrics are wrapped with
# `greater_is_better=False`, r2 with `greater_is_better=True`.
scoring = dict(
    mean_absolute_error=make_scorer(mean_absolute_error, greater_is_better=False),
    mean_squared_error=make_scorer(mean_squared_error, greater_is_better=False),
    r2_score=make_scorer(r2_score, greater_is_better=True),
)

In [None]:
# Cross-validated search over `param_grid`; the best candidate by
# "mean_absolute_error" is refit on the whole training set.
grid_pipeline = GridSearchCV(
    estimator=wizard_pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="mean_absolute_error",
    cv=5,            # 5-fold cross-validation
    n_jobs=-1,
    pre_dispatch=4,  # avoid jobs explosion
    return_train_score=False,
    verbose=4,
)

grid_pipeline.fit(X_train, y_train)

In [None]:
# Show the selected hyper-parameters (the grid holds one value per parameter,
# so there is a single candidate).
grid_pipeline.best_params_

In [None]:
# Pipeline refit by the grid search with the selected parameters.
best_pipe = grid_pipeline.best_estimator_
print(best_pipe)

In [None]:
# NOTE(review): the search was created with `refit="mean_absolute_error"`, so
# `best_estimator_` should already be fitted on (X_train, y_train); this second
# fit looks redundant — confirm before removing.
best_pipe.fit(X_train, y_train)

In [None]:
# Evaluate the tuned pipeline on the held-out test split.
pred = best_pipe.predict(X_test)
print(
    f"\nExplained variance score is {explained_variance_score(y_true=y_test, y_pred=pred)}",
    f"\nMean Absolute Error is {mean_absolute_error(y_true=y_test, y_pred=pred)}",
    f"\nMean Absolute Percentage error is {round(100*mean_absolute_percentage_error(y_true=y_test, y_pred=pred), 2)}%",
    f"\nMedian Absolute Error is {median_absolute_error(y_true=y_test, y_pred=pred)}",
    f"\nMean Squared Error is {mean_squared_error(y_true=y_test, y_pred=pred)}",
    # R^2 is a goodness-of-fit score (higher is better), not an error.
    f"\nR^2 score is {r2_score(y_true=y_test, y_pred=pred)}",
)

In [None]:
from pathlib import Path

# Persist the fitted pipeline (joblib is imported at the top of the notebook).
# The target directory is created first so the dump does not fail on a fresh checkout.
MODEL_PATH = 'pickle/LinearSVR.pkl'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_pipe, MODEL_PATH, compress=True)