In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Apply seaborn's default theme to all figures drawn below.
sns.set_theme()


In [None]:
# Read the listings CSV from the notebook's working directory.
data = pd.read_csv("Airbnb_Open_Data.csv")

# Preview the first four rows.
data.head(n=4)

## Dataset Overview

In [None]:
# (rows, columns) of the freshly loaded frame.
print("Shape of data: {}".format(data.shape))

In [None]:
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

In [None]:
# Summary statistics of the numeric columns.
numeric_summary = data.describe()
print("\nDescribe (numeric):", numeric_summary, sep="\n")

#### Variable Description (high level)
#
##### **- Dependent / target variable**: price
##### **- Example independent variables**:
#####     host_identity_verified
#####     neighbourhood group
#####     neighbourhood
#####     lat, long
#####     room type
#####     Construction year
#####     service fee
#####     minimum nights, number of reviews, reviews per month
#####     review rate number
#####     calculated host listings count
#####     availability 365
#####     instant_bookable
#####     cancellation_policy
#####     and other descriptive columns

## Data Cleaning and Preprocessing

##### Dropping Unnecessary Columns

In [None]:
# Columns that are excluded from the rest of the analysis.
cols_to_drop = ['id', 'NAME', 'host id', 'host name', 'country',
                'country code', 'last review', 'license']

# Rebind instead of mutating in place and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
data = data.drop(columns=cols_to_drop, errors="ignore")

print("Columns remaining after drop:", data.columns.tolist())

#### Price and service fee to numeric

In [None]:
def money_to_float(series):
    """Convert currency-formatted text such as "$1,200" to floats.

    Parameters
    ----------
    series : pandas.Series
        Column holding amounts as text (or already-numeric values).

    Returns
    -------
    pandas.Series
        float64 values; empty, missing or unparseable entries become NaN
        instead of aborting the conversion.
    """
    # Drop "$" and thousands separators, then trim stray whitespace.
    cleaned = (
        series.astype(str)
        .str.replace("[\\$,]", "", regex=True)
        .str.strip()
    )
    return pd.to_numeric(cleaned, errors="coerce").astype(float)


# Both monetary columns share the same text format, so clean them the same way.
for money_col in ("price", "service fee"):
    if money_col in data.columns:
        data[money_col] = money_to_float(data[money_col])

##### Checking missing values

In [None]:
# Number of NaN entries in each column.
missing_counts = data.isna().sum()
print("\nMissing values per column:", missing_counts, sep="\n")

#### Imputing missing values

##### Categorical Columns

In [None]:
# First mode of instant_bookable, or None when the column is absent.
if "instant_bookable" in data.columns:
    bookable_default = data["instant_bookable"].mode()[0]
else:
    bookable_default = None

# Replacement value per categorical column.
fill_cats = {
    "host_identity_verified": "unconfirmed",
    "neighbourhood group": "Manhattan",
    "neighbourhood": "Bedford-Stuyvesant",
    "instant_bookable": bookable_default,
    "cancellation_policy": "moderate",
}

# Only touch columns that exist and have a usable replacement value.
applicable = [
    name
    for name, default in fill_cats.items()
    if default is not None and name in data.columns
]
for col in applicable:
    data[col] = data[col].fillna(fill_cats[col])

##### Numerical Columns

In [None]:
# Numeric columns whose missing entries are replaced by the column median.
# NOTE(review): 'price' is the modelling target; imputing it creates synthetic
# labels. Kept as in the original, but consider dropping those rows instead.
numeric_fill_cols = ['lat', 'long', 'Construction year', 'service fee', 'minimum nights',
                     'number of reviews', 'reviews per month', 'review rate number',
                     'calculated host listings count', 'availability 365', 'price']

for col in numeric_fill_cols:
    # Assign the result back: `data[col].fillna(..., inplace=True)` operates on
    # a temporary selection and does not update `data` under pandas Copy-on-Write.
    data[col] = data[col].fillna(data[col].median())


In [None]:
# Re-count NaN entries to confirm the imputation above.
remaining_missing = data.isna().sum()
print("\nMissing values after filling:", remaining_missing, sep="\n")

#### **EDA**

##### Distribution of Price

In [None]:
# Histogram of price with a KDE overlay, on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(x=data["price"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Price")
ax.set_xlabel("Price")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


#### Boxplot of price to check outliers

In [None]:
# Horizontal box plot of price; points beyond the whiskers are outlier candidates.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=data["price"], ax=ax)
ax.set_title("Boxplot of Price")
ax.set_xlabel("Price")
fig.tight_layout()
plt.show()

#### Numeric pairplot (sampling if data is large)

In [None]:
# Numeric columns we would like to see in the pair plot.
pair_candidates = (
    "lat", "long", "Construction year", "price", "service fee",
    "minimum nights", "number of reviews", "reviews per month",
    "review rate number", "calculated host listings count", "availability 365",
)

# Keep only the candidates that are present in the frame.
num_cols_for_pair = [name for name in pair_candidates if name in data.columns]

# Plot at most 2000 rows; the fixed seed makes the sample reproducible.
sample_size = min(2000, len(data))
sample_df = data[num_cols_for_pair].sample(n=sample_size, random_state=42)

sns.pairplot(sample_df, diag_kind="kde")
plt.suptitle("Pairplot of Selected Numerical Features", y=1.02)
plt.show()

#### Categorical distributions

In [None]:
# (column, figure title) for every categorical distribution to draw.
# One loop replaces three copy-pasted cells that differed only in these values.
categorical_count_plots = [
    ("neighbourhood group", "Neighbourhood Group Counts"),
    ("room type", "Room Type Counts"),
    ("cancellation_policy", "Cancellation Policy Counts"),
]

for cat_col, plot_title in categorical_count_plots:
    # Skip columns that are absent instead of failing.
    if cat_col not in data.columns:
        continue
    plt.figure(figsize=(10, 4))
    sns.countplot(x=cat_col, data=data)
    plt.title(plot_title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

#### Outlier Handling

In [None]:
def multi_graphs(data):
    """Draw one box plot per column of ``data``, each in its own figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted one after another.

    Returns
    -------
    None
        The figures are shown with ``plt.show()``.
    """
    for col in data.columns:
        plt.figure(figsize=(6, 4))
        # Pass the values explicitly as `x`: a positional Series is interpreted
        # differently across seaborn versions, and `x=` puts the values on the
        # axis that plt.xlabel(col) labels below.
        sns.boxplot(x=data[col])
        plt.title(f'Plot of {col}')
        plt.xlabel(col)
        plt.show()


# Remove leading/trailing whitespace from the column labels.
data.columns = data.columns.str.strip()
multi_graphs(data[['lat','long','Construction year','price','service fee','minimum nights','number of reviews','reviews per month','review rate number',
                'calculated host listings count','availability 365']])

In [None]:
def cap_outliers_iqr(series, k=1.5):
    """Clip a numeric Series to the fences ``[Q1 - k*IQR, Q3 + k*IQR]``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap.
    k : float, default 1.5
        Multiplier of the inter-quartile range that sets the fence distance.

    Returns
    -------
    pandas.Series
        Copy of ``series`` with values outside the fences replaced by the
        nearest fence; NaN entries are left as NaN.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)


# Same capping rule for every feature; replaces six copy-pasted cells that
# differed only in the column name.
for col in ['lat', 'long', 'minimum nights', 'number of reviews',
            'reviews per month', 'calculated host listings count']:
    data[col] = cap_outliers_iqr(data[col])

In [None]:
# Quartiles of price and the 1.5 * IQR fences derived from them.
q1, q3 = data["price"].quantile([0.25, 0.75])
iqr = q3 - q1
cap_low, cap_high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Values outside the fences are pulled back onto the nearest fence.
data["price_capped"] = data["price"].clip(lower=cap_low, upper=cap_high)

# Visual check of the capped column.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=data["price_capped"], ax=ax)
ax.set_title("Boxplot of Price After Capping")
ax.set_xlabel("Capped Price")
fig.tight_layout()
plt.show()

##### Replace original price with capped version for modeling

In [None]:
# pop() removes the helper column and returns it, so the capped values
# overwrite `price` and `price_capped` disappears in one step.
data["price"] = data.pop("price_capped")

In [None]:
# Numeric columns inspected in the IQR summary of the next cell.
iqr_summary_cols = [
    'lat', 'long', 'Construction year', 'price', 'service fee',
    'minimum nights', 'number of reviews', 'reviews per month',
    'review rate number', 'calculated host listings count',
    'availability 365',
]
s = data[iqr_summary_cols]

In [None]:
# Per-column quartiles, inter-quartile range and 1.5 * IQR fences.
Q1, Q3 = (s.quantile(level) for level in (0.25, 0.75))
IQR = Q3 - Q1
U_BOUND = Q3 + 1.5 * IQR
L_BOUND = Q1 - 1.5 * IQR

# Print in the order: IQR, upper bounds, lower bounds.
for table in (IQR, U_BOUND, L_BOUND):
    print(table)

In [None]:
# Re-draw the box plots after capping. `multi_graphs` is defined in the first
# "Outlier Handling" cell, which also stripped the column labels, so neither
# step is repeated here.
multi_graphs(data[['lat','long','Construction year','price','service fee','minimum nights','number of reviews','reviews per month','review rate number',
                'calculated host listings count','availability 365']])

#### Encoding Categorical Variables

In [None]:
# Binary columns and the lower-cased spellings that map to 1; anything else maps to 0.
binary_truthy = {
    "instant_bookable": ["t", "true", "1"],
    "host_identity_verified": ["verified", "t", "true"],
}
for flag_col, truthy in binary_truthy.items():
    if flag_col in data.columns:
        as_text = data[flag_col].astype(str).str.lower()
        data[flag_col] = as_text.isin(truthy).astype(int)

# One-hot encode the multi-level categoricals that are present; drop_first
# removes one level per column.
cat_cols = [
    c
    for c in ["neighbourhood group", "cancellation_policy", "room type"]
    if c in data.columns
]
data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

print("\nData types after encoding:")
print(data.dtypes)

### Dependent (target variable)

In [None]:
# Target kept as a one-column DataFrame (double brackets), i.e. shape (n, 1).
y = data[['price']]
print("Shape of y:", y.shape)

# The preview must be the last expression of the cell to be rendered; in the
# middle of the cell its result was silently discarded.
y.head()

### Independent Variable

In [None]:
# Everything except the target. `columns=` replaces `axis=True`, which only
# worked because True compares equal to 1.
feature_frame = data.drop(columns='price')

# The linear models need numeric input. Text columns that were never encoded
# (for example 'neighbourhood') would make fit() fail on string values, so only
# numeric and boolean columns are kept.
x = feature_frame.select_dtypes(include=['number', 'bool'])

skipped_cols = feature_frame.columns.difference(x.columns).tolist()
if skipped_cols:
    print("Non-numeric columns left out of X:", skipped_cols)

print("\nShape of X:", x.shape)

# Last expression of the cell so the preview is actually rendered.
x.head()

### Splitting the training and testing data

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=100,
)

### Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,root_mean_squared_error

In [None]:
# Ordinary least squares on the training split.
linear_model = LinearRegression()
linear_model.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the linear model on both splits.
y_train_pred = linear_model.predict(X=x_train)
y_test_pred = linear_model.predict(X=x_test)

In [None]:
# R^2 on both splits.
r2_lr_train = r2_score(y_train, y_train_pred)
r2_lr_test = r2_score(y_test, y_test_pred)

# root_mean_squared_error already returns the RMSE and takes no `squared`
# argument (that flag belonged to mean_squared_error); passing it raises TypeError.
rmse_lr_test = root_mean_squared_error(y_test, y_test_pred)


In [None]:
# Report the linear model's scores, one metric per line.
print("\nLinear Regression:")
linear_report = (
    ("  Train R2:", r2_lr_train),
    ("  Test  R2:", r2_lr_test),
    ("  Test RMSE:", rmse_lr_test),
)
for metric_label, metric_value in linear_report:
    print(metric_label, metric_value)

### Ridge Regularization

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# L2-regularised linear model with penalty strength alpha = 1.0.
ridge = Ridge(alpha=1.0)
ridge.fit(X=x_train, y=y_train)

In [None]:
# Ridge predictions on both splits.
y_train_pred_ridge = ridge.predict(x_train)
y_test_pred_ridge = ridge.predict(x_test)

# root_mean_squared_error has no `squared` parameter; passing squared=False
# raises TypeError. The function already returns the root of the MSE.
rmse_ridge_test = root_mean_squared_error(y_test, y_test_pred_ridge)

In [None]:
# R^2 of the ridge model on both splits.
r2_ridge_train = r2_score(y_train, y_train_pred_ridge)
r2_ridge_test = r2_score(y_test, y_test_pred_ridge)

# These values are coefficients of determination, not classification accuracy,
# so they are labelled R2 like the other regression reports in this notebook.
print("\nRidge Regression:")
print("  Train R2:", r2_ridge_train)
print("  Test  R2:", r2_ridge_test)
print("  Test RMSE:", rmse_ridge_test)

### Lasso Regression

In [None]:
# L1-regularised linear model with penalty strength alpha = 0.1 and a fixed seed.
lasso = Lasso(random_state=42, alpha=0.1)
lasso.fit(X=x_train, y=y_train)

In [None]:
# Lasso predictions and scores on the held-out split.
y_pred_lasso_test = lasso.predict(x_test)
r2_lasso_test = r2_score(y_test, y_pred_lasso_test)

# root_mean_squared_error takes no `squared` argument (TypeError otherwise);
# it already returns the RMSE.
rmse_lasso_test = root_mean_squared_error(y_test, y_pred_lasso_test)

In [None]:
# Report the lasso model's test scores.
print("\nLasso Regression:")
for metric_label, metric_value in (
    ("  Test R2:", r2_lasso_test),
    ("  Test RMSE:", rmse_lasso_test),
):
    print(metric_label, metric_value)

#### Model Performance Comparison

In [None]:
# One (name, test R2, test RMSE) row per model, then split into parallel lists.
comparison_rows = [
    ("Linear", r2_lr_test, rmse_lr_test),
    ("Lasso", r2_lasso_test, rmse_lasso_test),
    ("Ridge", r2_ridge_test, rmse_ridge_test),
]
models, r2_scores, rmses = (list(column) for column in zip(*comparison_rows))

In [None]:
# Bar chart of the test R2 of each model.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=models, y=r2_scores, ax=ax)
ax.set_title("R2 Score Comparison")
ax.set_ylabel("R2")
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the test RMSE of each model.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=models, y=rmses, ax=ax)
ax.set_title("RMSE Comparison")
ax.set_ylabel("RMSE")
fig.tight_layout()
plt.show()

#### KDE comparison of predictions vs actual

In [None]:
plt.figure(figsize=(10, 6))

# The inputs arrive in mixed shapes (a one-column DataFrame, (n, 1) arrays and
# a 1-D array). seaborn treats 2-D input as wide-form data, so flatten them all
# to 1-D vectors to draw every curve the same way, each with its own label.
kde_series = [
    (y_test, "Actual", {"linestyle": "--"}),
    (y_test_pred, "Linear", {}),
    (y_pred_lasso_test, "Lasso", {}),
    (y_test_pred_ridge, "Ridge", {}),
]
for values, curve_label, line_style in kde_series:
    sns.kdeplot(x=np.ravel(values), label=curve_label, **line_style)

plt.title("Predicted vs Actual Price Distribution")
plt.xlabel("Price")
plt.legend()
plt.tight_layout()
plt.show()