In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Location of the raw listings CSV (Kaggle input directory layout).
AIRBNB_CSV_PATH = '/kaggle/input/new-york-city-airbnb-open-data/AB_NYC_2019.csv'
airbnb_data = pd.read_csv(AIRBNB_CSV_PATH)

# Sanity check of the load: the first rows, then pandas' summary statistics.
for preview in (airbnb_data.head(), airbnb_data.describe()):
    print(preview)

In [None]:
# Remove every row that contains at least one missing value. The frame is
# mutated in place, so all following cells operate on the filtered data.
airbnb_data.dropna(axis=0, how='any', inplace=True)

# Histogram of listing prices with a kernel-density overlay.
price_fig, price_ax = plt.subplots()
sns.histplot(data=airbnb_data, x='price', kde=True, ax=price_ax)
price_ax.set(xlabel='Price', ylabel='Count', title='Distribution of Listing Prices')
plt.show()

# Number of listings for each room type.
room_fig, room_ax = plt.subplots()
sns.countplot(data=airbnb_data, x='room_type', ax=room_ax)
room_ax.set(xlabel='Room Type', ylabel='Count', title='Number of Listings by Room Type')
plt.show()

In [None]:
# Rename the borough column (in place) to `boroname`, the key later cells use
# when merging against the borough geometries.
airbnb_data.rename(columns={'neighbourhood_group': 'boroname'}, inplace=True)

# Per-borough count of non-null values in every column; the `id` column of the
# result is what gets plotted as the number of listings.
borough_count = airbnb_data.groupby('boroname').count().reset_index()

fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
sns.barplot(data=borough_count, x='boroname', y='id', ax=ax1)

label_size = 12
ax1.set_title('Number of Listings by Neighbourhood', fontsize=15)
ax1.set_xlabel('Neighbourhood', fontsize=label_size)
ax1.set_ylabel('Count', fontsize=label_size)
ax1.tick_params(axis='both', labelsize=10)

In [None]:
import geopandas as gpd

# Load the bundled NYC borough boundaries and align the name column with the
# `boroname` key used by `borough_count`.
# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases —
# confirm the installed version still provides it.
nyc = gpd.read_file(gpd.datasets.get_path('nybb'))
nyc.rename(columns={'BoroName': 'boroname'}, inplace=True)
bc_geo = nyc.merge(borough_count, on='boroname')

# Choropleth coloured by the per-borough `id` count.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
bc_geo.plot(column='id', cmap='viridis_r', alpha=0.5, ax=ax, legend=True)

# Write each borough's name at the centroid of its polygon.
for borough_label, borough_shape in zip(bc_geo['boroname'], bc_geo.geometry):
    ax.annotate(
        text=borough_label,
        xy=borough_shape.centroid.coords[0],
        color='black',
        ha='center',
        fontsize=8,
    )

plt.title('Number of Airbnb Listings by NYC Borough')
plt.axis('off')
plt.show()

In [None]:
# List the dtype pandas assigned to each column of the working frame.
print(airbnb_data.dtypes)

In [None]:
# Display the first rows of the working frame as this cell's output.
airbnb_data.head()

In [None]:
# Keep the numeric columns only, convert +/-inf to NaN, and drop any row that
# is left with a missing value so the correlations use finite data.
numeric_data = (
    airbnb_data
    .select_dtypes(include='number')
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Pairwise correlation between the numeric columns.
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the correlation matrix.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Mean listing price per borough, joined onto the borough geometries in `nyc`.
avg_price_by_borough = airbnb_data.groupby('boroname', as_index=False)['price'].mean()
bc_geo_avg_price = nyc.merge(avg_price_by_borough, on='boroname')

# Choropleth coloured by the mean price.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
bc_geo_avg_price.plot(column='price', cmap='YlOrRd', alpha=0.6, ax=ax, legend=True)

# Label every borough with its mean price, placed at the polygon centroid.
for borough in bc_geo_avg_price.itertuples(index=False):
    centre = borough.geometry.centroid
    ax.annotate(
        text=f"${borough.price:.2f}",
        xy=(centre.x, centre.y),
        color='black',
        ha='center',
        fontsize=8,
    )

ax.set_title("Average Airbnb Price by NYC Borough")
ax.axis('off')
plt.show()

In [None]:
# Listing counts per (borough, room type), pivoted so that boroughs form the
# index and each room type becomes a column.
room_type_distribution = (
    airbnb_data
    .groupby(['boroname', 'room_type'])
    .size()
    .unstack()
)

# One stacked bar per borough, split by room type.
dist_ax = room_type_distribution.plot.bar(stacked=True, colormap='Set3')
dist_ax.set_xlabel('Neighbourhood')
dist_ax.set_ylabel('Count')
dist_ax.set_title('Room Type Distribution by NYC Borough')
dist_ax.legend(title='Room Type')
plt.show()

In [None]:
# Columns used to model the listing price.
# `host_id` is deliberately NOT a predictor: it is an identifier, so treating it
# as a continuous quantity gives the model a meaningless slope to fit, and its
# large magnitude next to the 0/1 dummy columns badly conditions the
# least-squares problem.
feature_columns = ['boroname', 'neighbourhood', 'room_type', 'minimum_nights',
                   'number_of_reviews', 'reviews_per_month', 'availability_365']
target_column = 'price'
selected_columns = feature_columns + [target_column]
airbnb_data_selected = airbnb_data[selected_columns]

# One-hot encode the text-valued predictors; the numeric ones pass through.
categorical_columns = ['boroname', 'neighbourhood', 'room_type']
airbnb_data_encoded = pd.get_dummies(airbnb_data_selected, columns=categorical_columns)

# Split into design matrix and target.
X = airbnb_data_encoded.drop(columns=target_column)
y = airbnb_data_encoded[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least-squares fit on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))