In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [21]:
# Read the calendar table from its CSV file and show it as the cell output.
calendar_df = pd.read_csv(filepath_or_buffer='calendar.csv')
calendar_df

Unnamed: 0,calender_id,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,1,40334325,2022-08-03 00:00:00.000000,0,56.0,56.0,3,5
1,2,22742449,2022-11-13 00:00:00.000000,1,95.0,95.0,2,99
2,3,34621717,2022-04-17 00:00:00.000000,0,75.0,75.0,2,1125
3,4,38281744,2022-01-31 00:00:00.000000,1,150.0,150.0,1,1000
4,5,18835003,2022-05-21 00:00:00.000000,0,100.0,100.0,2,1125
...,...,...,...,...,...,...,...,...
319187,319188,52729945,2022-07-06 00:00:00.000000,1,64.0,64.0,1,10
319188,319189,36953202,2022-07-08 00:00:00.000000,0,140.0,140.0,3,1125
319189,319190,39580214,2022-06-13 00:00:00.000000,1,49.0,49.0,1,13
319190,319191,49016014,2022-10-26 00:00:00.000000,0,60.0,60.0,2,30


In [22]:
# Read the listings table from its CSV file and show it as the cell output.
listings_df = pd.read_csv(filepath_or_buffer='listings.csv')
listings_df

Unnamed: 0,listing_id,listing_url,name,description,latitude,longitude,property_type,room_type,accomodates,bathrooms_text,bedrooms,beds,amenities,host_id
0,50904,https://www.airbnb.com/rooms/50904,aplace/antwerp: cosy suite - fashion district,Decorated in a vintage style combined with a f...,51.218575,4.398631,Room in boutique hotel,Hotel room,2,1 private bath,1.0,1.0,"[""Kitchen"", ""Smoke alarm"", ""Hair dryer"", ""Keyp...",234077
1,116134,https://www.airbnb.com/rooms/116134,Spacious apartment nearby Mas,Enjoy your stay at our 4 person apartment in t...,51.230510,4.405930,Entire rental unit,Entire home/apt,4,2.5 baths,2.0,2.0,"[""Refrigerator"", ""Elevator"", ""Paid street park...",586942
2,218916,https://www.airbnb.com/rooms/218916,Apartment with terrace in trendy Zurenborg,Do you enjoy authentic places with a lot of ch...,51.206330,4.429420,Entire condominium (condo),Entire home/apt,5,1 bath,1.0,3.0,"[""Paid street parking off premises"", ""Kitchen""...",915664
3,224333,https://www.airbnb.com/rooms/224333,Large stylish room in 1930s house + garden,"Large bedroom in classic 1930s house. Kitchen,...",51.197720,4.458530,Private room in residential home,Private room,2,2 shared baths,1.0,1.0,"[""Fire extinguisher"", ""Backyard"", ""Long term s...",1167377
4,224682,https://www.airbnb.com/rooms/224682,APARTMENT ROSCAM - OLD CENTRE ANTWERP,"<b>The space</b><br />Apartment ""Roscam"" is a ...",51.217220,4.397900,Entire rental unit,Entire home/apt,3,1 bath,1.0,2.0,"[""Refrigerator"", ""Kitchen"", ""Smoke alarm"", ""Sh...",1263933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744,53916524,https://www.airbnb.com/rooms/53916524,Industrial spacious loft in Antwerp!,Industrial loft (110m²) with a relaxing urban ...,51.231330,4.403520,Entire loft,Entire home/apt,2,1.5 baths,1.0,1.0,"[""Refrigerator"", ""Elevator"", ""Yamaha RX-A550 s...",46300712
1745,53928545,https://www.airbnb.com/rooms/53928545,Studio in het midden van Antwerpen,Recent gerenoveerde rustige studio van circa 2...,51.213210,4.397080,Entire rental unit,Entire home/apt,2,1 bath,,1.0,"[""Hot water"", ""Long term stays allowed"", ""Kitc...",74268936
1746,53929354,https://www.airbnb.com/rooms/53929354,Mooi appartement met open haard,Vanuit deze ideaal gelegen accommodatie kun je...,51.200340,4.421060,Entire rental unit,Entire home/apt,4,1 bath,1.0,2.0,"[""Kitchen"", ""Smoke alarm"", ""Outdoor furniture""...",334036639
1747,53949105,https://www.airbnb.com/rooms/53949105,Kasteel Boterlaerhof vlakbij Antwerpen,"Kasteel Boterlaerhof ligt ideaal gelegen, op 1...",51.212320,4.482230,Castle,Entire home/apt,16,7.5 baths,7.0,33.0,"[""Refrigerator"", ""Private garden or backyard"",...",377595714


In [None]:
# Read the hosts table from its CSV file and show it as the cell output.
hosts_df = pd.read_csv(filepath_or_buffer='hosts.csv')
hosts_df

In [None]:
# Read the reviews table from its CSV file and show it as the cell output.
reviews_df = pd.read_csv(filepath_or_buffer='reviews.csv')
reviews_df

In [None]:
# Number of rows (records) in the calendar table.
calendar_rows = len(calendar_df)
calendar_rows

In [None]:
# Number of distinct listing ids that appear in the calendar table.
unique_listings = calendar_df.loc[:, 'listing_id'].nunique()
unique_listings

In [None]:
# Transform the price column of the Calendar table (handle missing values, normalize).
# Normalization is assumed to be needed so that prices on very different scales become comparable.

In [None]:
# Z-score the price: subtract the column mean, then divide by the column
# standard deviation. The result is stored as a new column on calendar_df.
calendar_df['price_normalized'] = calendar_df['price'].pipe(
    lambda prices: (prices - prices.mean()) / prices.std()
)
calendar_df['price_normalized']

In [None]:
# TODO: identify transformations for the predictor variables that affect price.
# TODO: explore the Listings, Hosts and Reviews tables for further predictors.
# TODO: describe every variable of each table in relation to price.
# For now, pull the candidate feature columns (plus the join key) out of the
# Listings table.
listings_features = listings_df.loc[
    :,
    [
        'listing_id',
        'property_type',
        'room_type',
        'accomodates',
        'bathrooms_text',
        'bedrooms',
        'beds',
        'amenities',
    ],
]
listings_features

In [None]:
# Attach the listing features to every calendar row. The inner join on
# 'listing_id' keeps only calendar rows whose listing is present in
# listings_features.
aggregated_df = calendar_df.merge(
    listings_features,
    how='inner',
    on='listing_id',
)
aggregated_df

In [None]:
# Print a summary of the merged frame (columns, dtypes, non-null counts).
aggregated_df.info()

In [None]:
# Data Quality Report Function
def data_quality_report(df, plot=True, max_categories=20):
    """Build a per-column data quality summary and optionally plot each column.

    Numeric columns are summarised with the number of unique values, the
    percentage of missing values, min, max, mean and the 25th / 75th / 90th /
    95th percentiles. Object and category columns are summarised with the
    number of unique values and the percentage of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse.
    plot : bool, default True
        When True, show a histogram + box plot for every numeric column and a
        frequency bar plot for every object/category column. Pass False to
        only compute the statistics.
    max_categories : int or None, default 20
        Upper bound on the number of (most frequent) categories drawn in each
        frequency plot. None draws every category. This only affects the
        plots, never the returned statistics.

    Returns
    -------
    dict
        Maps column name -> dict of statistics. Numeric columns come first,
        followed by object/category columns.
    """
    report = {}

    # Continuous variables. 'number' selects every numeric width (int32,
    # float32, ...), not only int64/float64.
    for col in df.select_dtypes(include=['number']).columns:
        series = df[col]
        q25, q75, q90, q95 = series.quantile([0.25, 0.75, 0.90, 0.95])
        report[col] = {
            '#unique values': series.nunique(),
            'percentage_missing_values': series.isnull().mean() * 100,
            'min': series.min(),
            'max': series.max(),
            'average': series.mean(),
            '25th percentile': q25,
            '75th percentile': q75,
            '90th percentile': q90,
            '95th percentile': q95
        }
        if plot:
            _plot_continuous(df, col)

    # Categorical variables.
    for col in df.select_dtypes(include=['object', 'category']).columns:
        series = df[col]
        report[col] = {
            '#unique values': series.nunique(),
            'percentage_missing_values': series.isnull().mean() * 100
        }
        if plot:
            _plot_categorical(df, col, max_categories)

    return report


def _plot_continuous(df, col):
    """Show a histogram (with KDE) and a box plot of one numeric column."""
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 6))
    sns.histplot(df[col], bins=20, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')
    sns.boxplot(y=df[col], ax=box_ax)
    box_ax.set_title(f'Box Plot of {col}')
    plt.show()
    # Release the figure so that looping over many columns does not pile up
    # open figures.
    plt.close(fig)


def _plot_categorical(df, col, max_categories):
    """Show a frequency bar plot of the most frequent values of one column."""
    counts = df[col].value_counts()
    if counts.empty:
        # Column holds only missing values: there is nothing to draw.
        return

    truncated = max_categories is not None and len(counts) > max_categories
    shown = (counts.index[:max_categories] if truncated else counts.index).tolist()

    title = f'Frequency of {col}'
    if truncated:
        # Be explicit that the plot is a subset of the categories.
        title += f' (top {max_categories} of {len(counts)})'

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df[df[col].isin(shown)], x=col, order=shown, ax=ax)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)
    plt.show()
    plt.close(fig)

# Build the data quality report for the merged calendar/listings frame
data_report = data_quality_report(aggregated_df)

# Write out each column's statistics, one column after the other
for column_name in data_report:
    print(f"Column: {column_name}")
    print(data_report[column_name])
    print("\n")
        

In [None]:
# Define target variable and predictor variables
target_variable = 'price'
predictor_variables = ['property_type', 'room_type', 'accomodates', 'bathrooms_text', 'bedrooms', 'beds', 'amenities']

# Correlation is only computed for numeric columns. Calling .corr() on a frame
# that still contains text columns raises on pandas >= 2.0, so the text
# predictors are filtered out before the matrix is built.
numeric_columns = (
    aggregated_df[[target_variable] + predictor_variables]
    .select_dtypes(include=['number'])
    .columns
    .tolist()
)
correlation_matrix = aggregated_df[numeric_columns].corr()

# Plot heatmap of correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()
plt.close(fig)

# Plot bivariate relationships:
#  - numeric predictors -> scatter plot against the target
#  - text predictors    -> box plot of the target per category, limited to the
#    most frequent categories so the x-axis stays readable
max_categories_shown = 20
for predictor in predictor_variables:
    fig, ax = plt.subplots(figsize=(8, 6))
    if predictor in numeric_columns:
        sns.scatterplot(data=aggregated_df, x=predictor, y=target_variable, ax=ax)
    else:
        top_categories = (
            aggregated_df[predictor].value_counts().index[:max_categories_shown].tolist()
        )
        if top_categories:  # nothing to draw when the column is entirely missing
            sns.boxplot(
                data=aggregated_df[aggregated_df[predictor].isin(top_categories)],
                x=predictor,
                y=target_variable,
                order=top_categories,
                ax=ax,
            )
            ax.tick_params(axis='x', rotation=45)
    ax.set_title(f'Bivariate Relationship: {predictor} vs {target_variable}')
    ax.set_xlabel(predictor)
    ax.set_ylabel(target_variable)
    plt.show()
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Keep only the numeric columns and fill missing values with 0.
# NOTE: the fill is a simple placeholder strategy and applies to every numeric
# column, the target included; choose a different strategy if that is not wanted.
aggregated_df = aggregated_df.select_dtypes(include=['number']).fillna(0)

# Define target variable and predictor variables
target_variable = 'price'

# Numeric columns that must not be used as predictors:
#  - 'price_normalized' is computed directly from 'price' earlier in the
#    notebook, so it reveals the target (target leakage);
#  - 'adjusted_price' is a second price field (it matches 'price' in the
#    displayed sample rows), so it leaks the target as well;
#  - 'Unnamed: 0', 'calender_id' and 'listing_id' are row/listing identifiers,
#    not listing characteristics.
# Names that are absent from the frame are simply ignored.
excluded_columns = {'price_normalized', 'adjusted_price', 'Unnamed: 0', 'calender_id', 'listing_id'}
predictor_variables = [
    column for column in aggregated_df.columns
    if column != target_variable and column not in excluded_columns
]
if not predictor_variables:
    raise ValueError('No predictor columns left after excluding leakage and identifier columns.')

# Creating the train/test split (80% train, 20% test; no separate validation set)
X = aggregated_df[predictor_variables]
y = aggregated_df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
linear_regression_model = LinearRegression()
regression_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)
gbm_model = GradientBoostingRegressor(random_state=42)

models = {'Linear Regression': linear_regression_model,
          'Regression Tree': regression_tree_model,
          'Random Forest': random_forest_model,
          'Gradient Boosting Machine': gbm_model}

# Fit every model on the training split, then score it on the held-out test split
metrics = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    metrics[name] = {'Mean Squared Error': mse,
                     'Mean Absolute Error': mae,
                     'R-squared': r2}

# Create comparison matrix (one row per model, one column per metric)
comparison_matrix = pd.DataFrame(metrics).T

# Print comparison matrix
print("\nComparison Matrix for Regression Models:")
print(comparison_matrix)