# Where were we?


- Airbnb data - predicting asset's price.

- Last results:  
    Mean Absolute Error (Σ|y-pred|/n): 0.315  
    Mean Squared Error (Σ(|y-pred|/y)/n): 0.182  
    Root Mean Squared Error (sqrt(Σ(y-pred)^2/n)): 0.427  

# What we worked on since:

1. Handled previously ignored dates columns
2.
3.
4.

In [1]:
import datetime
import pandas as pd
import numpy as np
import sklearn
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [2]:
data = pd.read_csv('dataset/train.csv')

## 1 Handled previously ignored dates columns:

Although those columns are dates, they are not defining a time series data sets.   
They should be handled in their own specific way. 
We chose to convert each date cell into a numeric type that represents years from the given date till today.

In [3]:
# host_since
data.host_since = pd.to_datetime(data.host_since, format='%Y-%m-%d')
data['host_since_in_years'] = datetime.datetime.now() - data['host_since']
data['host_since_in_years'] = data['host_since_in_years'].apply(lambda x: x.days)
data['host_since_in_years'] = data['host_since_in_years'].apply(lambda x: x/365)
data.host_since_in_years = pd.to_numeric(data.host_since_in_years)
data = data.drop('host_since', axis=1)

# first_review
data.first_review = pd.to_datetime(data.first_review, format='%Y-%m-%d')
data['first_review_in_years'] = datetime.datetime.now() - data['first_review']
data['first_review_in_years'] = data['first_review_in_years'].apply(lambda x: x.days)
data['first_review_in_years'] = data['first_review_in_years'].apply(lambda x: x/365)
data.first_review_in_years = pd.to_numeric(data.first_review_in_years)
data = data.drop('first_review', axis=1)

# last_review
data.last_review = pd.to_datetime(data.last_review, format='%Y-%m-%d')
data['last_review_in_years'] = datetime.datetime.now() - data['last_review']
data['last_review_in_years'] = data['last_review_in_years'].apply(lambda x: x.days)
data['last_review_in_years'] = data['last_review_in_years'].apply(lambda x: x/365)
data.last_review_in_years = pd.to_numeric(data.last_review_in_years)
data = data.drop('last_review', axis=1)

In [4]:
train, test = train_test_split(data, test_size=0.2)

In [5]:
train[['log_price','host_since_in_years']].corr()

Unnamed: 0,log_price,host_since_in_years
log_price,1.0,0.078779
host_since_in_years,0.078779,1.0


In [6]:
train[['log_price','first_review_in_years']].corr()

Unnamed: 0,log_price,first_review_in_years
log_price,1.0,0.083299
first_review_in_years,0.083299,1.0


In [7]:
train[['log_price','last_review_in_years']].corr()

Unnamed: 0,log_price,last_review_in_years
log_price,1.0,0.018642
last_review_in_years,0.018642,1.0


In [8]:
binary_variables = ['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']

categorical_variables = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city',
                         'neighbourhood', 'amenities']

numeric_variables = ['log_price', 'accommodates', 'bathrooms', 'host_response_rate', 'latitude', 'longitude',
                     'number_of_reviews', 'review_scores_rating', 'bedrooms', 'beds', 'host_since_in_years', 'first_review_in_years', 'last_review_in_years']  

column_to_drop = ['id', 'name', 'thumbnail_url', 'zipcode', 'description']

columns = {'binary_variables': binary_variables, 'categorical_variables': categorical_variables,
           'numeric_variables': numeric_variables, 'column_to_drop': column_to_drop, 'binned_variables': []}

old code for running test prediction:

In [9]:
for column in columns['column_to_drop']:
    train = train.drop(column, axis=1)
    test = test.drop(column, axis=1)

In [10]:
for column in columns['binary_variables']:
    train[column] = train[column].apply(lambda x: True if x == 'TRUE' or x == 't' else False)
    test[column] = test[column].apply(lambda x: True if x == 'TRUE' or x == 't' else False)

In [11]:
train['host_response_rate'] = train['host_response_rate'].str.rstrip('%').astype('float') / 100.0
test['host_response_rate'] = test['host_response_rate'].str.rstrip('%').astype('float') / 100.0

In [12]:
for column in columns['numeric_variables']:
    train[column] = train[column].astype(float)
    test[column] = test[column].astype(float)

In [13]:
null_columns = []
for column in columns['binary_variables'] + columns['categorical_variables'] + columns['numeric_variables']:
    if train[column].isnull().sum():
        null_columns.append(column)
    if test[column].isnull().sum():
        null_columns.append(column)

In [14]:
train_with_most_frequent_values = train.copy()
test_with_most_frequent_values = test.copy()
for column in null_columns:
    val = train_with_most_frequent_values[column].value_counts().index[0]
    train_with_most_frequent_values[column] = train_with_most_frequent_values[column].fillna(val)
    test_with_most_frequent_values[column] = test_with_most_frequent_values[column].fillna(val)

train = train_with_most_frequent_values
test = test_with_most_frequent_values

In [15]:
train = train.reset_index()
test = test.reset_index()

In [16]:
def collect_amenities(data, columns_dict):
    chars_to_remove = '"{}'
    amenities_set = set()
    for line in data['amenities']:
        amenities = line.split(',')
        for obj in amenities:
            for char in chars_to_remove:
                obj = obj.replace(char, "")
            if "translation missing" not in obj and obj != '':
                amenities_set.add(obj)
    for amenity in amenities_set:
        columns_dict['binary_variables'].append(amenity)
    return list(amenities_set)


def create_amenities_array(amenities_list, data):
    amenities_array = []
    for index, row in data.iterrows():
        array = np.zeros(shape=(len(amenities_list)))
        row_amen = data['amenities'][index].split(',')
        for amen in row_amen:
            item = amen.replace('"', '').replace('}', '').replace('{', '')
            if item in amenities_list:
                res = amenities_list.index(item)
                array[res] = 1
        amenities_array.append(array.tolist())

    amenities_df = pd.DataFrame(amenities_array, columns=amenities_list)
    return amenities_df


# converting amenities column to binary columns and updating columns_dict
def create_amenities_cols(data, amenities_set):
    amenities_array = create_amenities_array(amenities_set, data)

    data = data.drop(['amenities'], axis=1)
    data = pd.concat([data, amenities_array], axis=1)

    return data

In [17]:
amenities_list = collect_amenities(train, columns)
train = create_amenities_cols(train, amenities_list)
test = create_amenities_cols(test, amenities_list)
columns['categorical_variables'].remove('amenities')

In [18]:
top_neighbourhoods = train['neighbourhood'].value_counts().head(50).keys()
for index, row in train.iterrows():
    if row['neighbourhood'] not in top_neighbourhoods:
        train.at[index,'neighbourhood'] = 'other'
for index, row in test.iterrows():
    if row['neighbourhood'] not in top_neighbourhoods:
        test.at[index,'neighbourhood'] = 'other'

In [19]:
# we leave the target feature as is
oh_train = train['log_price']
oh_test = test['log_price']

# now adding the one hot encoded data
for variable in columns['binned_variables']+columns['categorical_variables']:
    onehot_train_col = pd.get_dummies(train[variable], prefix=variable)
    oh_train = pd.concat([oh_train, onehot_train_col], axis=1)

    onehot_test_col = pd.get_dummies(test[variable], prefix=variable)
    oh_test = pd.concat([oh_test, onehot_test_col], axis=1)

In [20]:
for col in columns['binary_variables']:
    train[col] = train[col].replace(True, 1)
    train[col] =train[col].replace(False, 0)
    oh_train = pd.concat([oh_train, train[col]], axis=1)

    test[col] = test[col].replace(True, 1)
    test[col] = test[col].replace(False, 0)
    oh_test = pd.concat([oh_test, test[col]], axis=1)

In [21]:
for col in columns['numeric_variables']:
    if col != 'log_price':
        oh_train = pd.concat([oh_train, train[col]], axis=1)

        oh_test = pd.concat([oh_test, test[col]], axis=1)

In [22]:
add_to_test = list(set(oh_train.columns) - set(oh_test.columns))
add_to_train = list(set(oh_test.columns) - set(oh_train.columns))
for col in add_to_train:
    oh_train[col] = 0
for col in add_to_test:
    oh_test[col] = 0

oh_test = oh_test[oh_train.columns]

  oh_train[col] = 0
  oh_test[col] = 0


In [23]:
# oh_train = oh_train.drop('host_since_in_years', axis=1)
# oh_test = oh_test.drop('host_since_in_years', axis=1)

# oh_train = oh_train.drop('first_review_in_years', axis=1)
# oh_test = oh_test.drop('first_review_in_years', axis=1)

# oh_train = oh_train.drop('last_review_in_years', axis=1)
# oh_test = oh_test.drop('last_review_in_years', axis=1)

In [24]:
linear_regression = linear_model.LinearRegression()
# separate labels from data=
train_class = oh_train['log_price']
oh_train_data = oh_train.drop('log_price', axis=1)
# train the model:
linear_regression.fit(oh_train_data, train_class)
print(linear_regression)

LinearRegression()


In [25]:
train_y = oh_train['log_price'].values
train_x = oh_train.drop('log_price', axis=1)
test_y = oh_test['log_price'].values
test_x = oh_test.drop('log_price', axis=1)

prediction_test = linear_regression.predict(test_x)

In [26]:
print("Mean Absolute Error (Σ|y-pred|/n):", "{:,.3f}".format(mean_absolute_error(test_y, prediction_test)))
print("Mean Squared Error (Σ(|y-pred|/y)/n):",
      "{:,.3f}".format(mean_squared_error(test_y, prediction_test)))
print("Root Mean Squared Error (sqrt(Σ(y-pred)^2/n)):", "{:,.3f}".
      format(np.sqrt(mean_squared_error(test_y, prediction_test))))

Mean Absolute Error (Σ|y-pred|/n): 0.316
Mean Squared Error (Σ(|y-pred|/y)/n): 0.183
Root Mean Squared Error (sqrt(Σ(y-pred)^2/n)): 0.428


### This addition did not casue a significant change.
- should we drop these columns?
- should we think of a better use of the originals columns?
- will a more sophisticated model be more influenced by these columns?