In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

### Data Collection

In [3]:
# Directory that holds one sub-folder per city; each city folder is expected
# to contain a listings.csv file
folder_path = 'data/'

# Maps the city folder name to the listings DataFrame loaded from it
dfs = {}

# sorted() makes the load order deterministic (os.listdir order is arbitrary)
for file_name in sorted(os.listdir(folder_path)):
    listings_path = os.path.join(folder_path, file_name, 'listings.csv')

    # Ignore stray entries (hidden files, checkpoint folders, ...) that do not
    # hold a listings.csv instead of letting read_csv crash on them
    if not os.path.isfile(listings_path):
        continue

    df = pd.read_csv(listings_path)
    print(file_name + ' done')
    dfs[file_name] = df

Montreal done
New Brunswick done
Ottawa done
Quebec City done
Toronto done
Vancouver done
Victoria done
Winnipeg done


### Functions

In [4]:
def heatmap(df):
    """Draw an annotated heatmap of the pairwise correlations between df's columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    corr = df.corr()

    # Cell annotations with two decimals on a diverging colour map
    heatmap_style = dict(annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, **heatmap_style)
    plt.title('Correlation Heatmap')
    plt.show()

In [5]:
def correlation_activity(df):
    """Rank features by the strength of their correlation with number_of_reviews.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``number_of_reviews`` column plus the feature
        columns to rank.

    Returns
    -------
    pandas.Series
        Absolute correlation of every other column with ``number_of_reviews``,
        indexed by column name and sorted from strongest to weakest. The
        target itself is never part of the result.

    Raises
    ------
    KeyError
        If ``number_of_reviews`` is missing from the correlation matrix.
    """
    target = 'number_of_reviews'
    correlation_matrix = df.corr()

    # Absolute value so strong inverse correlations rank as high as direct ones
    strength = correlation_matrix[target].abs()

    # Remove the target by label rather than by position: after sorting, the
    # target's self-correlation is not guaranteed to sit in the first row
    # (another feature can tie at 1.0, or the self-correlation can be NaN and
    # sort last), so slicing off row 0 could discard a real feature instead.
    return strength.drop(labels=target).sort_values(ascending=False)

In [6]:
def model_importance(df):
    """Fit a random forest on df and return its feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``number_of_reviews`` column (the regression target);
        every other column is used as a predictor.

    Returns
    -------
    pandas.Series
        The fitted model's ``feature_importances_`` indexed by predictor
        column name, sorted from most to least important.
    """
    target = df['number_of_reviews']
    predictors = df.drop(columns=['number_of_reviews'])

    # Fixed random_state keeps the fitted forest reproducible between runs
    forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(predictors, target)

    ranked = pd.Series(forest.feature_importances_, index=predictors.columns)
    return ranked.sort_values(ascending=False)

In [7]:
#Debug Function
def check_null(df):
    """Print how many missing values each incomplete column of df contains.

    Columns without any missing value are not reported. Columns are listed in
    the order in which they appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    missing_counts = df.isna().sum()

    for column, n_missing in missing_counts[missing_counts > 0].items():
        print(f"Column '{column}' has {n_missing} NaN value(s).")

In [14]:
def rq1_preprocessing(df):
    """Build the RQ1 modelling frame from a raw listings frame.

    Keeps a fixed set of host columns plus ``number_of_reviews``, converts the
    percentage strings to fractions, one-hot encodes ``host_verifications``,
    maps the 't'/'f' flag columns to 1/0 and drops every row that still has a
    missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings frame containing the host columns listed below and
        ``number_of_reviews``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformed host features and the target.
    """
    # Host attributes used for RQ1 (an explicit list, not a name-based filter)
    host_features = ['host_response_rate', 'host_acceptance_rate',
                     'host_listings_count', 'host_total_listings_count', 'host_verifications',
                     'host_has_profile_pic', 'host_identity_verified', 'host_is_superhost']
    rate_columns = ['host_response_rate', 'host_acceptance_rate']
    flag_columns = ['host_has_profile_pic', 'host_identity_verified', 'host_is_superhost']

    # Explicit copy: the column assignments below then act on an independent
    # frame instead of on a selection of the caller's frame, which is what
    # triggers pandas' SettingWithCopyWarning.
    df = df[host_features + ['number_of_reviews']].copy()

    # '95%' -> 0.95
    for col in rate_columns:
        df[col] = df[col].str.rstrip('%').astype(float) / 100.0

    # One indicator column per distinct host_verifications value
    df = pd.get_dummies(df, columns=['host_verifications'])

    # replace() may leave the flag columns as object dtype; infer_objects()
    # converts them to a numeric dtype when every value is numeric
    df[flag_columns] = df[flag_columns].replace({'t': 1, 'f': 0, '': 0}).infer_objects()

    return df.dropna()

In [9]:
def rq2_preprocessing(df):
    """Build the RQ2 modelling frame (location features) from a raw listings frame.

    Keeps ``longitude``, ``latitude``, ``neighbourhood_cleansed`` and
    ``number_of_reviews`` and replaces the neighbourhood values with the
    integer codes produced by ``LabelEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings frame containing the four columns above. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the three location features and the target, in that
        column order.
    """
    # Location-related columns used for RQ2
    features = ['longitude', 'latitude', 'neighbourhood_cleansed']
    selected = df[features + ['number_of_reviews']]

    # Fit the encoder on this frame's neighbourhoods and get one integer code per row
    neighbourhood_codes = LabelEncoder().fit_transform(selected['neighbourhood_cleansed'])

    # assign() returns a new frame, so nothing is written into a selection of
    # the caller's frame (the cause of pandas' SettingWithCopyWarning)
    return selected.assign(neighbourhood_cleansed=neighbourhood_codes)

In [10]:
def rq3_preprocessing(df):
    """Build the RQ3 modelling frame (all numeric features) from a raw listings frame.

    Drops a fixed list of identifier / free-text / location columns, converts
    ``price`` and the two host rate columns from text to floats, maps the
    't'/'f' flag columns to 1/0, keeps only numeric columns and finally drops
    every row that has a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings frame. It must contain ``price``, the flag and rate
        columns named below and every column in ``dropped_columns``. It is
        not modified, so the function can be called repeatedly on the same
        frame.

    Returns
    -------
    pandas.DataFrame
        New frame holding only numeric columns and only complete rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    dropped_columns = ["id", "last_scraped", "name", "host_id", "host_name", "host_about", "host_neighbourhood",
                       "scrape_id", "description", "latitude", "longitude", "has_availability",
                       "neighbourhood_group_cleansed", "bathrooms", "bedrooms", "calendar_updated", "license"]
    flag_columns = ['host_has_profile_pic', 'host_identity_verified', "instant_bookable"]
    rate_columns = ['host_response_rate', 'host_acceptance_rate']

    # drop() returns a new frame, so every assignment below works on our own
    # copy; the caller's frame keeps its columns and its original text values.
    df = df.drop(columns=dropped_columns)

    # regex=False: '$' has to be removed as a literal character, never read as
    # a regular-expression end-of-string anchor ("$1,250.00" -> 1250.0)
    price_text = df['price'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
    df['price'] = price_text.astype(float)

    # replace() may leave the flag columns as object dtype, which
    # select_dtypes(include='number') would discard; infer_objects() converts
    # them to a numeric dtype when every value is numeric
    df[flag_columns] = df[flag_columns].replace({'t': 1, 'f': 0, '': 0}).infer_objects()

    # '95%' -> 0.95
    for col in rate_columns:
        df[col] = df[col].str.rstrip('%').astype(float) / 100.0

    # Keep numeric columns and complete rows only. A separate filter for
    # "mostly null" rows is unnecessary: dropna() already removes every row
    # that has at least one missing value.
    return df.select_dtypes(include='number').dropna()

In [11]:
def avg_correlation(preprocess_function):
    """Print the features with the highest average correlation across all cities.

    For every city in the module-level ``dfs`` mapping, a copy of the city's
    frame is passed through ``preprocess_function`` and then through
    ``correlation_activity``. The per-city results are averaged per feature
    and the five largest averages are printed.

    Parameters
    ----------
    preprocess_function : callable
        Takes a raw city frame and returns the frame handed to
        ``correlation_activity``.

    Returns
    -------
    None
    """
    per_city = [
        correlation_activity(preprocess_function(dfs[city].copy()))
        for city in dfs
    ]

    # Stack the per-city Series and average the entries that share a feature name
    average_correlation = pd.concat(per_city).groupby(level=0).mean()

    print('Feature Correlation from Correlation Matrix: ')
    print(average_correlation.sort_values(ascending=False).head())

In [12]:
def avg_importance(preprocess_function):
    """Print the features with the highest average random-forest importance across cities.

    For every city in the module-level ``dfs`` mapping, a copy of the city's
    frame is passed through ``preprocess_function``. Cities whose
    preprocessed frame is empty are reported and skipped; for the others
    ``model_importance`` is computed. The per-city importances are averaged
    per feature and the five largest averages are printed.

    Parameters
    ----------
    preprocess_function : callable
        Takes a raw city frame and returns the frame handed to
        ``model_importance``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no city yields a non-empty frame, so there is nothing to average.
    """
    importances = []  # one feature-importance Series per city that has data

    for city in dfs:
        df = preprocess_function(dfs[city].copy())
        if df.empty:
            print(f"Skipping city '{city}' due to empty dataframe.")
            continue
        importances.append(model_importance(df))

    # pd.concat([]) would fail here with an unrelated-looking message
    # ("No objects to concatenate"); raise the same exception type with the
    # actual cause instead.
    if not importances:
        raise ValueError(
            "avg_importance: no city produced a non-empty dataframe after "
            "preprocessing; nothing to average."
        )

    # Stack the per-city Series and average the entries that share a feature name
    average_importances = pd.concat(importances).groupby(level=0).mean()

    print('\nFeature Importance from RandomForestRegressor: ')
    print(average_importances.sort_values(ascending=False).head())

## Research Questions

### RQ1

In [15]:
# RQ1: rank the host-related features, first by their absolute correlation
# with number_of_reviews and then by random-forest importance, each averaged
# over all loaded cities.
avg_correlation(rq1_preprocessing)
avg_importance(rq1_preprocessing)

Feature Correlation from Correlation Matrix: 
host_is_superhost            0.249755
host_acceptance_rate         0.149762
host_listings_count          0.098076
host_total_listings_count    0.086380
host_response_rate           0.081630
Name: number_of_reviews, dtype: float64

Feature Importance from RandomForestRegressor: 
host_acceptance_rate         0.248919
host_total_listings_count    0.233011
host_listings_count          0.211301
host_is_superhost            0.126505
host_response_rate           0.070906
dtype: float64


### RQ2

In [None]:
# RQ2: rank the location features (longitude, latitude, encoded
# neighbourhood), first by their absolute correlation with number_of_reviews
# and then by random-forest importance, each averaged over all loaded cities.
avg_correlation(rq2_preprocessing)
avg_importance(rq2_preprocessing)

Feature Correlation from Correlation Matrix: 
latitude                  0.067578
neighbourhood_cleansed    0.065007
longitude                 0.040826
Name: number_of_reviews, dtype: float64

Feature Importance from RandomForestRegressor: 
longitude                 0.488106
latitude                  0.478040
neighbourhood_cleansed    0.033854
dtype: float64


### RQ3

In [None]:
# RQ3: rank every numeric column kept by rq3_preprocessing, first by its
# absolute correlation with number_of_reviews and then by random-forest
# importance, each averaged over all loaded cities.
avg_correlation(rq3_preprocessing)
avg_importance(rq3_preprocessing)

Feature Correlation from Correlation Matrix: 
number_of_reviews_ltm     0.637376
reviews_per_month         0.517382
number_of_reviews_l30d    0.327071
host_acceptance_rate      0.135894
availability_365          0.133873
Name: number_of_reviews, dtype: float64



Feature Importance from RandomForestRegressor: 
number_of_reviews_ltm          0.442519
reviews_per_month              0.122443
maximum_nights                 0.046030
review_scores_communication    0.041919
review_scores_checkin          0.032210
dtype: float64
