In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_squared_error as mse
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
df = pd.read_csv('AirBnb_Data.csv')

In [3]:
df.drop(['Unnamed: 0', 'id', 'thumbnail_url', 'latitude', 'longitude', 'name', 'description', 'first_review', 'last_review'], axis = 1, inplace = True)

In [4]:
df['host_response_rate'] = df['host_response_rate'].str.rstrip('%')
df['host_response_rate']

0        NaN
1        100
2        100
3        NaN
4        100
        ... 
74106    NaN
74107    100
74108    100
74109    100
74110    100
Name: host_response_rate, Length: 74111, dtype: object

In [5]:
df.dropna(subset = ['zipcode', 'host_since'], inplace = True)

df['review_scores_rating'].fillna(df['review_scores_rating'].mean(), inplace = True)
df['review_scores_rating'] = pd.to_numeric(df['review_scores_rating'], errors='coerce')

df['host_response_rate'] = pd.to_numeric(df['host_response_rate'], errors='coerce')
df['host_response_rate'].fillna(df['host_response_rate'].median(), inplace = True)

df['bathrooms'].fillna(df['bathrooms'].median(), inplace=True)
df['bedrooms'].fillna(df['bedrooms'].median(), inplace=True)
df['beds'].fillna(df['beds'].median(), inplace=True)

df['host_has_profile_pic'].fillna('f', inplace = True)
df['host_identity_verified'].fillna('f', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['review_scores_rating'].fillna(df['review_scores_rating'].mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['host_response_rate'].fillna(df['host_response_rate'].median(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will

In [6]:
df.isnull().sum()

log_price                    0
property_type                0
room_type                    0
amenities                    0
accommodates                 0
bathrooms                    0
bed_type                     0
cancellation_policy          0
cleaning_fee                 0
city                         0
host_has_profile_pic         0
host_identity_verified       0
host_response_rate           0
host_since                   0
instant_bookable             0
neighbourhood             6480
number_of_reviews            0
review_scores_rating         0
zipcode                      0
bedrooms                     0
beds                         0
dtype: int64

In [7]:
zipcode_dict = df.groupby('zipcode')['neighbourhood'].apply(list).to_dict()

list_of_dicts = []

for zipcode, neighbourhoods in zipcode_dict.items():
    neighbourhood_count = {}
    for neighbourhood in set(neighbourhoods):  # Remove duplicates
        neighbourhood_count[neighbourhood] = neighbourhoods.count(neighbourhood)  # Count occurrences
    zipcode_dict[zipcode] = neighbourhood_count



def most_popular_neighbourhood(missing_zipcode):
    neighbourhood_dict = zipcode_dict[missing_zipcode]
    max_key = max(neighbourhood_dict, key=neighbourhood_dict.get)  
    return max_key


# now for the missing values we fill in the most popular neighbourhood in that zipcode
def fill_neighbourhood(row):
    if pd.isna(row['neighbourhood']):
        return most_popular_neighbourhood(row['zipcode'])  # Your function
    return row['neighbourhood']

df['neighbourhood'] = df.apply(fill_neighbourhood, axis=1)

In [8]:
df['zipcode'] = df['zipcode'].astype(str).str[:5]
df.drop(index=df[df['zipcode'] == '1m'].index, inplace=True)

df['zipcode'] = pd.to_numeric(df['zipcode'], errors='coerce')

In [9]:
df.dropna(inplace = True)

In [10]:
df.isnull().sum()

log_price                 0
property_type             0
room_type                 0
amenities                 0
accommodates              0
bathrooms                 0
bed_type                  0
cancellation_policy       0
cleaning_fee              0
city                      0
host_has_profile_pic      0
host_identity_verified    0
host_response_rate        0
host_since                0
instant_bookable          0
neighbourhood             0
number_of_reviews         0
review_scores_rating      0
zipcode                   0
bedrooms                  0
beds                      0
dtype: int64

In [11]:
# creating a host tenure column after which we drop the host_since column 
import datetime
today = datetime.datetime.today()
df['host_since'] = pd.to_datetime(df['host_since'])
df['host_experience'] = np.ceil((today - df['host_since']).dt.days/365.25)
df['host_experience']

  df['host_since'] = pd.to_datetime(df['host_since'])


0        13.0
1         8.0
2         9.0
3        10.0
4        11.0
         ... 
74106    12.0
74107     9.0
74108    14.0
74109     8.0
74110    13.0
Name: host_experience, Length: 71586, dtype: float64

In [12]:
df['price'] = np.exp(df['log_price'])

In [13]:
df['cleaning_fee'] = df['cleaning_fee'].astype('str').map({'True': 't', 'False': 'f'})
for col in ['host_identity_verified', 'instant_bookable', 'cleaning_fee', 'host_has_profile_pic']:
    df[col] = df[col].map({'t':1, 'f':0})
df['cancellation_policy'] = df['cancellation_policy'].map({'flexible': 0, 'moderate': 1, 'strict': 2, 'super_strict': 3})

In [14]:
t_f_mapping = {'t': 1, 'f': 0}
cancellation_mapping = {'flexible': 0, 'moderate': 1, 'strict': 2, 'super_strict': 3}

In [15]:
number_of_amenities = []
df['amenities'] = df['amenities'].astype('str')

def amenities_count(amenities):
    amenities_list = amenities.replace('"', '').replace("{", '').replace("}",'').split(',')
    return len(amenities_list)

df['amenities'] = df['amenities'].apply(amenities_count)

In [16]:
df.drop(['log_price', 'host_since'], axis = 1, inplace = True)

In [17]:
df['review_scores_rating'] = df['review_scores_rating'].astype('int64')

In [18]:
# label encoding  room_type, bed_type, city
encoders = {}
for col in ['room_type', 'bed_type', 'city']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))

In [19]:
# one-hot encoding for  property_type, neighbourhood

one_hot_encoders = {}

for col in ['neighbourhood', 'property_type']:
    one_hot = OneHotEncoder(sparse_output = False)
    transformed = one_hot.fit_transform(df[[col]])
    
    transformed_df = pd.DataFrame(transformed, 
                                  columns = [f"{col}_{cat}" for cat in one_hot.categories_[0]],
                                 index = df.index)
    df = pd.concat([df, transformed_df], axis = 1)
    df.drop(columns = [col], inplace = True)
    
    one_hot_encoders[col] = one_hot
    
print(one_hot_encoders)

{'neighbourhood': OneHotEncoder(sparse_output=False), 'property_type': OneHotEncoder(sparse_output=False)}


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71586 entries, 0 to 74110
Columns: 670 entries, room_type to property_type_Yurt
dtypes: float64(659), int64(11)
memory usage: 366.5 MB


In [21]:
X = df.drop('price', axis=1).values
y = df['price'].values
imputer = SimpleImputer(strategy = 'median')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [22]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(1))  # Output layer for regression

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
model.compile(optimizer=Adam(learning_rate=0.0001), 
              loss='mse', 
              metrics=['mae'])

In [25]:
history = model.fit(X_train, y_train, 
                    epochs=60, 
                    batch_size=32, 
                    validation_split=0.2, 
                    verbose=1)

Epoch 1/60
[1m1432/1432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 818us/step - loss: 41618.0938 - mae: 130.0989 - val_loss: 19992.1973 - val_mae: 73.8987
Epoch 2/60
[1m1432/1432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 761us/step - loss: 18571.7734 - mae: 74.8828 - val_loss: 17518.9434 - val_mae: 67.3985
Epoch 3/60
[1m1432/1432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 757us/step - loss: 16184.6602 - mae: 68.3333 - val_loss: 16340.2324 - val_mae: 64.0854
Epoch 4/60
[1m1432/1432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 763us/step - loss: 16849.4277 - mae: 67.4212 - val_loss: 15626.7012 - val_mae: 62.9942
Epoch 5/60
[1m1432/1432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 763us/step - loss: 15251.7979 - mae: 64.7958 - val_loss: 15209.1162 - val_mae: 61.2911
Epoch 6/60
[1m1432/1432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 803us/step - loss: 15418.6875 - mae: 64.2523 - val_loss: 14981.9658 - val_mae: 59.2418
Epo

In [26]:
y_pred = model.predict(X_test)

[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420us/step


In [27]:
print(r2_score(y_test, y_pred))

0.5613401157649305
