### 0.1 Import libraries

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

### 0.2 Load data

In [None]:
# Load the raw listings; the path is relative to the notebook's working directory.
# ('id' and the other unused columns are dropped in the cleaning section.)
data = pd.read_csv('dataset/train.csv')

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split -- and
# therefore every downstream plot, imputed value and metric -- identical across
# "Restart & Run All".
train, test = train_test_split(data, test_size=0.2, random_state=42)

## 1. Data preparation:
* Exploration (EDA) & Visualization
* Cleaning
* Wrangling & Feature Engineering

### 1.1 Exploration (EDA) & Visualization

#### General info about data

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Row counts of the training and test splits.
len(train), len(test)

In [None]:
# Column summary of the full dataset (note: `data`, not the `train` split).
data.info()

In [None]:
# Summary statistics of the training split.
train.describe()

#### Data operations

In [None]:
# Column groups used throughout the notebook. Each group is handled differently
# further down (boolean conversion, one-hot encoding, float cast, or removal).

# Two-valued flags; converted to booleans in the cleaning section.
binary_variables = ['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']

# Text-valued columns; 'amenities' is later expanded into one indicator column per amenity.
categorical_variables = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city',
                         'neighbourhood', 'amenities']

# Columns cast to float in the cleaning section; 'log_price' is the regression target.
numeric_variables = ['log_price', 'accommodates', 'bathrooms', 'host_response_rate', 'latitude', 'longitude',
                     'number_of_reviews', 'review_scores_rating', 'bedrooms', 'beds']  

# Columns removed from both splits before modelling.
column_to_drop = ['id', 'name', 'thumbnail_url', 'zipcode', 'description', 'first_review',
                  'host_since', 'last_review']

# Single registry of the groups above. The lists are shared by reference, so later
# in-place edits (e.g. appending amenity names) are visible through both names.
columns = {'binary_variables': binary_variables, 'categorical_variables': categorical_variables,
           'numeric_variables': numeric_variables, 'column_to_drop': column_to_drop, 'binned_variables': []}

#### Histograms

In [None]:
# One histogram per numeric variable, laid out on a 5x2 grid.
fig, axes = plt.subplots(5, 2, figsize=(24, 24))

# TODO bin host_response_rate
# axes.flat walks the grid row by row, which matches the order of numeric_variables.
for position, ax in enumerate(axes.flat):
    var = numeric_variables[position]
    train[var].hist(ax=ax)
    ax.set_title(var)


In [None]:
# Bar charts of the most frequent categories of each categorical variable.
n_cols = 2
max_bars = 8  # cap the bars per chart so high-cardinality columns stay readable
n_rows = -(-len(categorical_variables) // n_cols)  # ceiling division: enough rows for every variable

# squeeze=False keeps `axes` two-dimensional even if the grid collapses to a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 16), squeeze=False)
fig.subplots_adjust(hspace=0.8)

for ax, variable in zip(axes.flat, categorical_variables):
    # value_counts() is sorted by descending frequency, so head() keeps the top categories.
    top_counts = train[variable].value_counts().head(max_bars)
    top_counts.plot(kind='bar', ax=ax, title=variable)

# Hide grid cells that received no variable (an odd number of variables leaves one empty).
for ax in axes.flat[len(categorical_variables):]:
    ax.set_visible(False)

In [None]:
# Bar charts of the value frequencies of each binary variable.
n_cols = 2
max_bars = 8  # upper bound on bars per chart
n_rows = -(-len(binary_variables) // n_cols)  # ceiling division: enough rows for every variable

# squeeze=False keeps `axes` two-dimensional even if the grid collapses to a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 16), squeeze=False)
fig.subplots_adjust(hspace=0.5)

for ax, variable in zip(axes.flat, binary_variables):
    # value_counts() is sorted by descending frequency, so head() keeps the most common values.
    top_counts = train[variable].value_counts().head(max_bars)
    top_counts.plot(kind='bar', ax=ax, title=variable)

# Hide grid cells that received no variable.
for ax in axes.flat[len(binary_variables):]:
    ax.set_visible(False)

### Reviews

In [None]:
# Reviews may affect price (e.g. positive reviews may add 'prestige').
# Dates that are missing or do not match the format become NaT and are counted under year 0.
last_review_year = (
    pd.to_datetime(train['last_review'], format='%Y-%m-%d', errors='coerce')
    .dt.year
    .fillna(0)
    .astype(int)  # missing dates force a float column; cast back so ticks read "2017", not "2017.0"
)
years_of_last_review = pd.DataFrame({
    'year of last review': last_review_year,
    'log_price': train['log_price']
})

plt.figure(figsize=(12,4))
sns.countplot(x="year of last review", data=years_of_last_review)
plt.title('Row count')
plt.show()

# Optional follow-up (disabled): price distribution per year of last review.
# plt.figure(figsize=(12,8))
# sns.boxplot(data=years_of_last_review,orient='v', x = 'year of last review', y = 'log_price')
# plt.title('Years of last review and price')
# plt.show()

##### Correlations

In [None]:
# Spearman rank correlation between the numeric and boolean columns.
# At this point the frame still contains text columns; DataFrame.corr raises on those
# in pandas >= 2.0 (numeric_only defaults to False), so select the usable dtypes
# explicitly. select_dtypes works on old and new pandas alike.
numeric_train = train.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(8,8))
sns.heatmap(numeric_train.corr(method='spearman'), annot=True)

### NaN value counts

In [None]:
# Number of missing values in each column of the training split.
train.isnull().sum()

In [None]:
# Visual map of missing values: one heatmap row per training row, one column per
# feature; cells are coloured by whether the value is null.
plt.figure(figsize=(12,8))
sns.heatmap(train.isnull(),yticklabels=False,cbar=False)

#### Number of unique values

In [None]:
# Cardinality of every feature, group by group; NaN counts as a value of its own.
for var in binary_variables + categorical_variables + numeric_variables:
    print(var, train[var].nunique(dropna=False))

### Plots

In [None]:
def plotGrid(data, columnFilter, plotFunc, rows, columns):
    """Draw one plot per column of ``data`` on a ``rows`` x ``columns`` grid.

    Parameters
    ----------
    data : DataFrame to plot from. It is copied, so the caller's frame is not modified.
    columnFilter : iterable of column names; plotted row by row in this order.
    plotFunc : callable invoked as ``plotFunc(series, ax=axis)`` (e.g. ``sns.kdeplot``).
    rows, columns : grid dimensions; ``rows * columns`` must cover ``columnFilter``.
    """
    data = data.copy()
    # host_response_rate is stored as text with a trailing '%'; convert it for plotting.
    # Skip the conversion when the column is missing or already numeric, so the helper
    # also works on frames that have been through the cleaning step.
    if ('host_response_rate' in data.columns
            and not pd.api.types.is_numeric_dtype(data['host_response_rate'])):
        data['host_response_rate'] = data['host_response_rate'].str.rstrip('%').astype('float') / 100.0

    # squeeze=False keeps `axes` two-dimensional, so the [row, col] indexing below
    # also works for single-row or single-column grids.
    fig, axes = plt.subplots(rows, columns, figsize=(columns * 8, rows * 8), squeeze=False)
    fig.subplots_adjust(hspace=0.5)

    for i, var in enumerate(columnFilter):
        ax = axes[i // columns, i % columns]
        plotFunc(data[var], ax=ax)
        ax.set_title(var)

# A kernel density estimate (KDE) plot is a method for visualizing the distribution of observations in
# a dataset, analogous to a histogram. KDE represents the data using a continuous probability density
# curve in one or more dimensions.
# Draw one KDE per numeric variable on a 2x5 grid.
plotGrid(train, numeric_variables, sns.kdeplot, 2, 5)

### Price by classes

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(20, 24))
fig.subplots_adjust(hspace=0.5)

# 'amenities' and 'neighbourhood' are skipped: they are not usable as grouping keys
# until they have been cleaned.
catVals = [var for var in categorical_variables if var not in ('amenities', 'neighbourhood')]

# Back-transform the target to a price in a standalone Series instead of adding (and
# later deleting) a temporary column on `train`; a failed cell can no longer leave a
# stray 'price' column behind.
price = np.exp(train['log_price']).rename('price')

# axes.T.flat walks the grid column by column: the first three charts fill the left column.
for ax, var in zip(axes.T.flat, catVals):
    price.groupby(train[var]).mean().plot.bar(ax=ax, title='Mean price by ' + var)

# Hide grid cells that received no variable.
for ax in axes.T.flat[len(catVals):]:
    ax.set_visible(False)

In [None]:
fig, ax = plt.subplots()

# Split the listings at 50 reviews.
number_of_reviews_low = train[train['number_of_reviews'] <= 50]
number_of_reviews_high = train[train['number_of_reviews'] > 50]

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the same
# normalised histogram plus KDE curve. Density (rather than raw counts) keeps groups
# of different sizes comparable.
for subset, label in ((number_of_reviews_low, "#reviews<=50"),
                      (number_of_reviews_high, "#reviews>50")):
    sns.histplot(subset.log_price, bins=25, kde=True, stat='density', alpha=0.4, label=label, ax=ax)

plt.title('Class-wise Histogram of log price')
plt.xlabel('log_price')
plt.ylabel('Density')  # the bars are normalised, so the axis shows density, not counts
plt.legend(loc="upper right")

In [None]:
fig, ax = plt.subplots()

# Split the listings at a review score of 50; rows with a missing score fall in neither group.
reviewScoreLow = train[train['review_scores_rating'] <= 50]
reviewScoreHigh = train[train['review_scores_rating'] > 50]

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the same
# normalised histogram plus KDE curve. Density (rather than raw counts) keeps groups
# of different sizes comparable.
for subset, label in ((reviewScoreLow, "review score <=50"),
                      (reviewScoreHigh, "review score >50")):
    sns.histplot(subset.log_price, bins=25, kde=True, stat='density', alpha=0.4, label=label, ax=ax)

plt.title('Class-wise Histogram of log price')
plt.xlabel('log_price')
plt.ylabel('Density')  # the bars are normalised, so the axis shows density, not counts
plt.legend(loc="upper right")

### Other correlations

In [None]:
fig, ax = plt.subplots()

# At this point the flag is still encoded as the raw 't' / 'f' strings.
instant_bookable_yes = train[train['instant_bookable'] == 't']
instant_bookable_no = train[train['instant_bookable'] == 'f']

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the same
# normalised histogram plus KDE curve. Density (rather than raw counts) keeps groups
# of different sizes comparable.
for subset, label in ((instant_bookable_yes, "Instant bookable"),
                      (instant_bookable_no, "Not instant bookable")):
    sns.histplot(subset.log_price, bins=25, kde=True, stat='density', alpha=0.4, label=label, ax=ax)

plt.title('Class-wise Histogram of log price')
plt.xlabel('log_price')
plt.ylabel('Density')  # the bars are normalised, so the axis shows density, not counts
plt.legend(loc="upper right")

In [None]:
fig, ax = plt.subplots()

# At this point the flag is still encoded as the raw 't' / 'f' strings.
host_verified = train[train['host_identity_verified'] == 't']
host_not_verified = train[train['host_identity_verified'] == 'f']

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the same
# normalised histogram plus KDE curve. Density (rather than raw counts) keeps groups
# of different sizes comparable.
for subset, label in ((host_verified, "Host identity verified"),
                      (host_not_verified, "Host identity not verified")):
    sns.histplot(subset.log_price, bins=25, kde=True, stat='density', alpha=0.4, label=label, ax=ax)

plt.title('Class-wise Histogram of log price')
plt.xlabel('log_price')
plt.ylabel('Density')  # the bars are normalised, so the axis shows density, not counts
plt.legend(loc="upper right")

### QQ Plots

In [None]:
# If the data values fall along a roughly straight line at a 45-degree angle,
# then the data is normally distributed.
# The 45-degree reference line is only meaningful when the sample quantiles are on the
# same scale as the theoretical standard-normal quantiles. fit=True standardises
# log_price with its fitted location and scale before plotting; without it the points
# sit off the line simply because log_price is not centred at 0 with unit variance.
fig = sm.qqplot(train['log_price'], line='45', fit=True)
plt.show()

### 1.2 Cleaning

#### Drop Unnecessary Column


In [None]:
# Remove the unused columns from both splits in a single call each.
train = train.drop(columns=columns['column_to_drop'])
test = test.drop(columns=columns['column_to_drop'])

#### Type Conversion


Boolean variables handling



In [None]:
# Encodings that mean "true". Real booleans are accepted next to the 't' / 'TRUE'
# strings because read_csv turns True/False cells into bool values, and a string-only
# comparison would map every one of them to False.
# NOTE(review): which columns actually arrive as bool depends on the CSV -- confirm.
# Anything else, including missing values, becomes False.
truthy_values = [True, 'TRUE', 't']
for column in columns['binary_variables']:
    train[column] = train[column].isin(truthy_values)
    test[column] = test[column].isin(truthy_values)


Numeric Variables to Float


In [None]:
# host_response_rate is text with a trailing '%': strip the sign, parse the number
# and divide by 100 so the column holds a fraction.
for frame in (train, test):
    frame['host_response_rate'] = frame['host_response_rate'].str.rstrip('%').astype('float') / 100.0

In [None]:
# Cast every numeric variable to float in both splits.
numeric_as_float = {column: float for column in columns['numeric_variables']}
train = train.astype(numeric_as_float)
test = test.astype(numeric_as_float)

#### Impute Missing Values

In [None]:
# Feature columns that contain at least one missing value in either split.
# Each column is listed once, even when both splits have gaps in it.
null_columns = [
    column
    for column in (columns['binary_variables']
                   + columns['categorical_variables']
                   + columns['numeric_variables'])
    if train[column].isnull().any() or test[column].isnull().any()
]

In [None]:
# Fill gaps with the most frequent value of each column. The fill value is always
# taken from the training split and reused for the test split.
most_frequent = {column: train[column].value_counts().index[0] for column in null_columns}
train_with_most_frequent_values = train.fillna(value=most_frequent)
test_with_most_frequent_values = test.fillna(value=most_frequent)

train = train_with_most_frequent_values
test = test_with_most_frequent_values

#### Reset Indexes


In [None]:
# The split left both frames with shuffled row labels. Renumber them 0..n-1 so the
# column-wise concat in the amenities step lines up with the freshly built indicator
# frame. drop=True discards the old labels instead of keeping them as an 'index' column.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

### 1.3 Wrangling & Feature Engineering


#### amenities

In [None]:
def collect_amenities(data, columns_dict):
    """Collect the distinct amenity names found in ``data['amenities']``.

    Each entry is a comma-separated string; double quotes and curly braces are
    removed from every item. Empty items and items containing
    "translation missing" are ignored, as are entries that are not strings.

    Side effect: amenity names not yet present in
    ``columns_dict['binary_variables']`` are appended to that list, so calling
    the function again does not create duplicates.

    Returns
    -------
    list of str
        The amenity names in sorted order, so the result (and the column order
        derived from it) is the same on every run.
    """
    chars_to_remove = '"{}'
    amenities_set = set()
    for line in data['amenities']:
        if not isinstance(line, str):
            # Missing values (NaN / None) carry no amenities.
            continue
        for obj in line.split(','):
            for char in chars_to_remove:
                obj = obj.replace(char, "")
            if "translation missing" not in obj and obj != '':
                amenities_set.add(obj)

    # A set has no stable order across interpreter runs; sort for reproducibility.
    amenities = sorted(amenities_set)
    already_registered = set(columns_dict['binary_variables'])
    columns_dict['binary_variables'].extend(
        name for name in amenities if name not in already_registered)
    return amenities


def create_amenities_array(amenities_list, data):
    """Build a 0/1 indicator frame with one column per amenity.

    Parameters
    ----------
    amenities_list : list of amenity names; becomes the column order of the result.
    data : DataFrame with an 'amenities' column of comma-separated strings.

    Returns
    -------
    DataFrame of floats (1.0 where the row lists the amenity, else 0.0). It carries
    the same index as ``data``, so it can be concatenated column-wise with ``data``
    without misaligning rows.
    """
    # Name -> column position, for constant-time lookups. setdefault keeps the first
    # position if a name appears more than once in the list.
    position = {}
    for idx, name in enumerate(amenities_list):
        position.setdefault(name, idx)

    indicators = np.zeros((len(data), len(amenities_list)))
    for row_number, raw in enumerate(data['amenities']):
        if not isinstance(raw, str):
            # Missing values (NaN / None) carry no amenities; the row stays all zeros.
            continue
        for amen in raw.split(','):
            item = amen.replace('"', '').replace('}', '').replace('{', '')
            idx = position.get(item)
            if idx is not None:
                indicators[row_number, idx] = 1

    return pd.DataFrame(indicators, columns=amenities_list, index=data.index)


# Replace the raw 'amenities' column with one indicator column per amenity.
def create_amenities_cols(data, amenities_set):
    """Return ``data`` without 'amenities' plus one indicator column per amenity.

    ``amenities_set`` is the collection of amenity names handed to
    ``create_amenities_array``; the indicator columns are appended on the right.
    """
    indicator_columns = create_amenities_array(amenities_set, data)
    without_raw = data.drop(['amenities'], axis=1)
    return pd.concat([without_raw, indicator_columns], axis=1)

In [None]:
# Collect the amenity names from the training split only, then expand both splits
# with that same list so they get identical indicator columns.
amenities_list = collect_amenities(train, columns)
train = create_amenities_cols(train, amenities_list)
test = create_amenities_cols(test, amenities_list)
# The raw column has been dropped from both frames, so remove it from the list that
# drives one-hot encoding. (Not idempotent: re-running this cell raises ValueError.)
columns['categorical_variables'].remove('amenities')

#### neighbourhood

In [None]:
# Keep the 50 most frequent neighbourhoods of the training split and bucket every
# other value (in both splits) under 'other'. This keeps the number of one-hot columns
# bounded. The vectorised where/isin replaces a Python loop over every row.
top_neighbourhoods = train['neighbourhood'].value_counts().head(50).keys()
train['neighbourhood'] = train['neighbourhood'].where(
    train['neighbourhood'].isin(top_neighbourhoods), 'other')
test['neighbourhood'] = test['neighbourhood'].where(
    test['neighbourhood'].isin(top_neighbourhoods), 'other')

#### One-hot encoding for categorical variables

In [None]:
# we leave the target feature as is
oh_train = train['log_price']
oh_test = test['log_price']

# now adding the one hot encoded data
for variable in columns['binned_variables']+columns['categorical_variables']:
    onehot_train_col = pd.get_dummies(train[variable], prefix=variable)
    oh_train = pd.concat([oh_train, onehot_train_col], axis=1)

    onehot_test_col = pd.get_dummies(test[variable], prefix=variable)
    oh_test = pd.concat([oh_test, onehot_test_col], axis=1)

#### Concatenating binary variables

In [None]:
# Binary features: the boolean flags plus the 0/1 amenity indicators.
# dict.fromkeys removes repeated names while keeping the order.
binary_columns = list(dict.fromkeys(columns['binary_variables']))

# A plain integer cast maps True/False (and 1.0/0.0) to 1/0 with a well-defined dtype,
# instead of chaining replace(True, 1) / replace(False, 0), which depends on implicit
# downcasting that newer pandas versions deprecate.
train[binary_columns] = train[binary_columns].astype(int)
test[binary_columns] = test[binary_columns].astype(int)

# Append all binary columns in one concat per split rather than one concat per column.
oh_train = pd.concat([oh_train, train[binary_columns]], axis=1)
oh_test = pd.concat([oh_test, test[binary_columns]], axis=1)

#### Concatenating numeric variables

In [None]:
# Numeric features, leaving out the target, which is already in the frame.
numeric_features = [col for col in columns['numeric_variables'] if col != 'log_price']
oh_train = pd.concat([oh_train, train[numeric_features]], axis=1)
oh_test = pd.concat([oh_test, test[numeric_features]], axis=1)

#### Equalize columns

In [None]:
# One-hot encoding was done per split, so a category seen in only one split has no
# column in the other. Give both frames the same columns in the same order, filling
# the missing ones with 0.
train_columns = set(oh_train.columns)
# Columns that exist only in the test frame, in their test-frame order. Iterating the
# column index (instead of a set difference) makes the resulting layout deterministic.
add_to_train = [col for col in oh_test.columns if col not in train_columns]

oh_train = oh_train.reindex(columns=list(oh_train.columns) + add_to_train, fill_value=0)
# reindex adds the train-only columns to the test frame and matches the column order.
oh_test = oh_test.reindex(columns=oh_train.columns, fill_value=0)

## 2. Train Model: Linear regression

In [None]:
# Separate the target from the features.
train_class = oh_train['log_price']
oh_train_data = oh_train.drop(columns='log_price')

# Fit an ordinary least-squares model on the encoded training data.
linear_regression = linear_model.LinearRegression()
linear_regression.fit(oh_train_data, train_class)
print(linear_regression)

## 3. Evaluate results

In [None]:
def plot_evaluation(model, train_x, train_y, test_x, test_y, prediction_test):
    """Print regression metrics and draw diagnostic plots for test-set predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``coef_``; forwarded to ``model_analysis``.
    train_x : DataFrame of training features; forwarded to ``model_analysis``.
    train_y, test_x : not used by this function; kept so existing calls keep working.
    test_y : array-like of true target values.
    prediction_test : array-like of predictions, aligned with ``test_y``.

    Returns None. Output is printed and plotted.
    """
    # Work on plain arrays so positional indexing below cannot be confused by a pandas index.
    test_y = np.asarray(test_y, dtype=float)
    prediction_test = np.asarray(prediction_test, dtype=float)

    r2 = r2_score(test_y, prediction_test)
    print('r2 score:', r2)

    print("Mean Absolute Perc Error (Σ(|y-pred|/y)/n):",
          "{:,.3f}".format(mean_absolute_percentage_error(test_y, prediction_test)))
    print("Mean Absolute Error (Σ|y-pred|/n):", "{:,.3f}".format(mean_absolute_error(test_y, prediction_test)))
    print("Root Mean Squared Error (sqrt(Σ(y-pred)^2/n)):", "{:,.3f}".
          format(np.sqrt(mean_squared_error(test_y, prediction_test))))

    ## residuals
    residuals = test_y - prediction_test
    # Signed residual with the largest magnitude (a tie goes to the negative one).
    largest, smallest = residuals.max(), residuals.min()
    if abs(largest) > abs(smallest):
        max_error, max_idx = largest, int(residuals.argmax())
    else:
        max_error, max_idx = smallest, int(residuals.argmin())
    # Three decimals: with zero decimals the value is rounded to a whole number.
    print("Max Error:", "{:,.3f}".format(max_error),
          "(true: {:,.3f}, predicted: {:,.3f})".format(test_y[max_idx], prediction_test[max_idx]))

    # True vs. predicted, with the identity line as the "perfect prediction" reference.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=prediction_test, y=test_y, ax=ax)
    bounds = [prediction_test.min(), prediction_test.max()]
    ax.plot(bounds, bounds, color='black')
    ax.set_title('true values against the predicted values')
    plt.show()

    # Residuals relative to the true value. A true value of exactly 0 yields inf/nan here.
    relative_residuals = residuals / test_y

    fig, ax = plt.subplots(1, 2, figsize=(16, 5))
    sns.scatterplot(x=prediction_test, y=residuals, ax=ax[0])
    ax[0].axhline(0, color='black')
    ax[0].set_title("Residuals (Abs)")
    sns.scatterplot(x=prediction_test, y=relative_residuals, ax=ax[1])
    ax[1].axhline(0, color='black')
    ax[1].set_title("Residuals (%)")
    plt.show()

    rel_res = np.abs(relative_residuals)
    print("Share of predictions with relative error < 5%:", np.mean(rel_res < 0.05))
    print("Share of predictions with relative error > 20%:", np.mean(rel_res > 0.2))

    model_analysis(model, train_x)
    # todo - shap
    # shap_cal(model, train_x, test_x, test_y, rel_res, prediction_test, residuals)

    # https://www.kaggle.com/code/mohamedmokhtar7/airbnb-eda-and-regression#kln-346
    sns.regplot(x=test_y, y=prediction_test, fit_reg=False)
    plt.title('Prediction and real')
    plt.show()

    # sns.distplot is deprecated; histplot with stat='density' and kde=True draws the
    # same normalised histogram plus KDE curve.
    sns.histplot(residuals, bins=50, kde=True, stat='density')
    plt.title('Error variance')
    plt.show()

def model_analysis(model, train_x):
    """Print each feature name of ``train_x`` next to its coefficient in ``model.coef_``.

    Coefficients are matched to columns by position and rounded to four decimals.
    """
    print("Model coefficients:\n")
    for position, feature in enumerate(train_x.columns):
        coefficient = model.coef_[position].round(4)
        print(feature, "=", coefficient)

In [None]:
# Split both encoded frames into features (x) and target values (y).
train_x = oh_train.drop(columns='log_price')
train_y = oh_train['log_price'].values
test_x = oh_test.drop(columns='log_price')
test_y = oh_test['log_price'].values

# Predict on both splits; only the test predictions are passed to the evaluation.
prediction_train = linear_regression.predict(train_x)
prediction_test = linear_regression.predict(test_x)

plot_evaluation(linear_regression, train_x, train_y, test_x, test_y, prediction_test)