The comprehensive analysis provides insights into factors affecting Airbnb pricing,
ratings, and other key metrics across different cities, 
room types, and other listing characteristics.

In [1]:
import pandas as pd

In [2]:
df1=pd.read_csv("C:/Users/davek/OneDrive/Desktop/SSDI/DATASETS/AIRBNB_1.csv")
df1.head()

Unnamed: 0,City,Price,Day,Room_Type,Shared_Room,Private_Room,Person_Capacity,Superhost,Multiple_Rooms,Business,Cleanliness_Rating,Guest_Satisfaction,Bedrooms,City_Center,Metro_Distance,Attraction_Index,Normalised_Attraction,Restraunt_Index,Normalised_Restraunt_Index
0,Amsterdam,194.033698,Weekday,Private room,False,True,2,False,1,0,10,93,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473
1,Amsterdam,344.245776,Weekday,Private room,False,True,4,False,0,0,8,85,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928
2,Amsterdam,264.101422,Weekday,Private room,False,True,2,False,0,1,9,87,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467
3,Amsterdam,433.529398,Weekday,Private room,False,True,4,False,0,1,9,90,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565
4,Amsterdam,485.552926,Weekday,Private room,False,True,2,True,0,0,10,98,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677


In [3]:
df1.dtypes
df1['Private_Room'] = df1['Private_Room'].astype(int)


In [4]:
df1.columns = df1.columns.str.strip()  # Remove leading and trailing spaces
float_columns = ['City_Center', 'Metro_Distance', 'Attraction_Index', 'Restraunt_Index','Normalised_Attraction','Normalised_Restraunt_Index']
df1[float_columns] = df1[float_columns].astype(int)


In [5]:
# List of float columns to convert
float_columns = ['City_Center', 'Metro_Distance', 'Attraction_Index', 'Restraunt_Index','Normalised_Attraction','Normalised_Restraunt_Index']

# Convert to integers (use round() if you prefer rounding instead of truncating)
df1[float_columns] = df1[float_columns].astype(int)

# Optional: check types after conversion
df1.columns = df1.columns.str.strip()

print(df1[float_columns].dtypes)

City_Center                   int32
Metro_Distance                int32
Attraction_Index              int32
Restraunt_Index               int32
Normalised_Attraction         int32
Normalised_Restraunt_Index    int32
dtype: object


In [6]:
df1.columns = df1.columns.str.strip()
df1.columns

Index(['City', 'Price', 'Day', 'Room_Type', 'Shared_Room', 'Private_Room',
       'Person_Capacity', 'Superhost', 'Multiple_Rooms', 'Business',
       'Cleanliness_Rating', 'Guest_Satisfaction', 'Bedrooms', 'City_Center',
       'Metro_Distance', 'Attraction_Index', 'Normalised_Attraction',
       'Restraunt_Index', 'Normalised_Restraunt_Index'],
      dtype='object')

In [7]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# List of dependent variables to analyze separately
dependent_vars = ['Price', 'Business', 'Cleanliness_Rating', 'Guest_Satisfaction','City_Center', 'Metro_Distance', 'Attraction_Index', 'Restraunt_Index','Normalised_Attraction','Normalised_Restraunt_Index','Private_Room']
independent_vars = 'City + Day + Room_Type + Superhost+ Shared_Room+Person_Capacity'

# Run ANOVA for each dependent variableShared_Room+Private_Room+Person_Capacity
for dv in dependent_vars:
    formula = f'{dv} ~ {independent_vars}'
    model = ols(formula, data=df1).fit()
    anova_table = sm.stats.anova_lm(model, type=2)  # Type II ANOVA
    print(f"ANOVA results for {dv}:\n", anova_table, "\n")


ANOVA results for Price:
                       df        sum_sq       mean_sq            F    PR(>F)
City                 8.0  4.459781e+08  5.574726e+07   914.037914  0.000000
Day                  1.0  2.962764e+05  2.962764e+05     4.857778  0.027527
Room_Type            2.0  1.586287e+08  7.931433e+07  1300.445886  0.000000
Superhost            1.0  4.754575e+04  4.754575e+04     0.779565  0.377279
Shared_Room          1.0  2.402586e+04  2.402586e+04     0.393930  0.530244
Person_Capacity      1.0  1.082586e+08  1.082586e+08  1775.018420  0.000000
Residual         41700.0  2.543287e+09  6.099010e+04          NaN       NaN 

ANOVA results for Business:
                       df       sum_sq    mean_sq           F        PR(>F)
City                 8.0   604.496374  75.562047  366.512747  0.000000e+00
Day                  1.0     0.608392   0.608392    2.951000  8.583188e-02
Room_Type            2.0    34.554863  17.277432   83.803962  4.757185e-37
Superhost            1.0    87.8919

In [8]:
# from sklearn.preprocessing import LabelEncoder

# # Create the encoder
# le = LabelEncoder()

# # Fit and transform the 'Day' column
# df1["Day_encoded"] = le.fit_transform(df1["Day"])
#   # Use the same encoder for consistency
# df1

In [9]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Price"], df1['Day'])
print(tukey)


 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj  lower   upper  reject
-----------------------------------------------------
Weekday Weekend   6.1894 0.0237 0.8269 11.5519   True
-----------------------------------------------------


In [10]:
df2=df1[df1['Day']=='Weekday']
df3=df1[df1['Day']=='Weekend']

In [11]:
from scipy import stats
stats.ttest_ind(df2["Price"],df3["Price"],equal_var=False,alternative="greater")
#reject null. setosa > versicolor

TtestResult(statistic=-2.2629217372192243, pvalue=0.9881770745798281, df=40006.55168714004)

In [12]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order

tukey = pairwise_tukeyhsd(df1["Business"], df1['Room_Type'])
print(tukey)


        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
     group1        group2    meandiff p-adj   lower   upper  reject
-------------------------------------------------------------------
Entire home/apt Private room  -0.0195 0.0003 -0.0312 -0.0078   True
Entire home/apt  Shared room   0.3069    0.0  0.2442  0.3696   True
   Private room  Shared room   0.3264    0.0  0.2633  0.3896   True
-------------------------------------------------------------------


In [13]:
df4=df1[df1['Room_Type']=='Private room']
df5=df1[df1['Room_Type']=='Shared room']


In [14]:
from scipy import stats
stats.ttest_ind(df4["Business"],df5["Business"],equal_var=False,alternative="greater")
#reject null. setosa > versicolor

TtestResult(statistic=-12.022404318130976, pvalue=1.0, df=329.7847191219171)

In [15]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Cleanliness_Rating"], df1['Shared_Room'])
print(tukey)


Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
 False   True  -0.5062   0.0 -0.6045 -0.4079   True
---------------------------------------------------


In [16]:
df6 = df1[df1['Shared_Room'] == False]
df7 = df1[df1['Shared_Room'] == True]
df6.head()

Unnamed: 0,City,Price,Day,Room_Type,Shared_Room,Private_Room,Person_Capacity,Superhost,Multiple_Rooms,Business,Cleanliness_Rating,Guest_Satisfaction,Bedrooms,City_Center,Metro_Distance,Attraction_Index,Normalised_Attraction,Restraunt_Index,Normalised_Restraunt_Index
0,Amsterdam,194.033698,Weekday,Private room,False,1,2,False,1,0,10,93,1,5,2,78,4,98,6
1,Amsterdam,344.245776,Weekday,Private room,False,1,4,False,0,0,8,85,1,0,0,631,33,837,58
2,Amsterdam,264.101422,Weekday,Private room,False,1,2,False,0,1,9,87,1,5,3,75,3,95,6
3,Amsterdam,433.529398,Weekday,Private room,False,1,4,False,0,1,9,90,2,0,0,493,26,875,60
4,Amsterdam,485.552926,Weekday,Private room,False,1,2,True,0,0,10,98,1,0,0,552,29,815,56


In [17]:
from scipy import stats
stats.ttest_ind(df6["Cleanliness_Rating"],df7["Cleanliness_Rating"],equal_var=False,alternative="greater")
#reject null. setosa > versicolor

TtestResult(statistic=7.276866987492516, pvalue=1.348493698938569e-12, df=317.47699733796645)

In [18]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Guest_Satisfaction"], df1['City'])
print(tukey)


   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
  group1    group2  meandiff p-adj   lower   upper  reject
----------------------------------------------------------
Amsterdam    Athens   0.4892 0.3094 -0.1552  1.1336  False
Amsterdam Barcelona  -3.4054    0.0 -4.1241 -2.6866   True
Amsterdam    Berlin  -0.1908 0.9969 -0.9306  0.5491  False
Amsterdam  Budapest   0.0709    1.0 -0.6014  0.7431  False
Amsterdam    Lisbon  -3.4205    0.0 -4.0573 -2.7838   True
Amsterdam     Paris  -2.4769    0.0 -3.1018  -1.852   True
Amsterdam      Rome  -1.3921    0.0 -1.9976 -0.7867   True
Amsterdam    Vienna  -0.7833 0.0123 -1.4711 -0.0955   True
   Athens Barcelona  -3.8945    0.0 -4.4742 -3.3148   True
   Athens    Berlin  -0.6799 0.0147 -1.2856 -0.0743   True
   Athens  Budapest  -0.4183 0.2363 -0.9393  0.1027  False
   Athens    Lisbon  -3.9097    0.0 -4.3839 -3.4355   True
   Athens     Paris  -2.9661    0.0 -3.4243 -2.5078   True
   Athens      Rome  -1.8813    0.0 -2.3126   -1.45   Tr

In [19]:
df8 = df1[df1['City'] == 'Barcelona']
df9 = df1[df1['City'] == 'Budapest']
df8.head()

Unnamed: 0,City,Price,Day,Room_Type,Shared_Room,Private_Room,Person_Capacity,Superhost,Multiple_Rooms,Business,Cleanliness_Rating,Guest_Satisfaction,Bedrooms,City_Center,Metro_Distance,Attraction_Index,Normalised_Attraction,Restraunt_Index,Normalised_Restraunt_Index
7360,Barcelona,474.317499,Weekday,Entire home/apt,False,0,4,False,0,1,10,91,1,1,0,526,17,915,20
7361,Barcelona,169.897829,Weekday,Private room,False,1,2,True,1,0,10,88,1,1,0,320,10,794,17
7362,Barcelona,161.984779,Weekday,Private room,False,1,4,False,0,1,9,88,1,1,0,344,11,840,18
7363,Barcelona,367.956804,Weekday,Entire home/apt,False,0,3,False,0,1,10,91,1,1,0,400,13,946,20
7364,Barcelona,196.895292,Weekday,Private room,False,1,3,False,1,0,9,91,1,1,0,346,11,792,17


In [20]:
from scipy import stats
stats.ttest_ind(df8["Guest_Satisfaction"],df9["Guest_Satisfaction"],equal_var=False,alternative="greater")

TtestResult(statistic=-18.136293298491488, pvalue=1.0, df=5010.991536558746)

In [21]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["City_Center"], df1['Room_Type'])
print(tukey)


       Multiple Comparison of Means - Tukey HSD, FWER=0.05        
     group1        group2    meandiff p-adj   lower  upper  reject
------------------------------------------------------------------
Entire home/apt Private room   0.5863    0.0  0.5368 0.6357   True
Entire home/apt  Shared room   0.6547    0.0    0.39 0.9195   True
   Private room  Shared room   0.0685 0.8188 -0.1979 0.3349  False
------------------------------------------------------------------


In [22]:
df10=df1[df1['Room_Type']=='Entire home/apt']
df11=df1[df1['Room_Type']=='Shared room']
df10.head()


Unnamed: 0,City,Price,Day,Room_Type,Shared_Room,Private_Room,Person_Capacity,Superhost,Multiple_Rooms,Business,Cleanliness_Rating,Guest_Satisfaction,Bedrooms,City_Center,Metro_Distance,Attraction_Index,Normalised_Attraction,Restraunt_Index,Normalised_Restraunt_Index
7,Amsterdam,2771.307384,Weekday,Entire home/apt,False,0,4,True,0,0,10,100,3,1,1,208,11,272,18
8,Amsterdam,1001.80442,Weekday,Entire home/apt,False,0,4,False,0,0,9,96,2,3,1,106,5,133,9
10,Amsterdam,909.474375,Weekday,Entire home/apt,False,0,2,False,0,0,10,96,1,1,0,409,21,555,38
12,Amsterdam,675.60284,Weekday,Entire home/apt,False,0,4,False,0,0,8,87,1,2,0,214,11,269,18
13,Amsterdam,552.808567,Weekday,Entire home/apt,False,0,2,True,0,0,10,100,1,1,1,325,17,390,27


In [23]:
from scipy import stats
stats.ttest_ind(df10["City_Center"],df11["City_Center"],equal_var=False,alternative="greater")

TtestResult(statistic=-5.257400061731352, pvalue=0.9999998661415979, df=319.88775709604306)

In [24]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Metro_Distance"], df1['Room_Type'])
print(tukey)


       Multiple Comparison of Means - Tukey HSD, FWER=0.05        
     group1        group2    meandiff p-adj   lower  upper  reject
------------------------------------------------------------------
Entire home/apt Private room   0.0417    0.0  0.0254  0.058   True
Entire home/apt  Shared room  -0.0394 0.5396 -0.1265 0.0478  False
   Private room  Shared room  -0.0811  0.077 -0.1688 0.0066  False
------------------------------------------------------------------


In [25]:
df12=df1[df1['Room_Type']=='Entire home/apt']
df13=df1[df1['Room_Type']=='Private room']

In [26]:
from scipy import stats
stats.ttest_ind(df12["Metro_Distance"],df13["Metro_Distance"],equal_var=False,alternative="greater")

TtestResult(statistic=-5.710054780417246, pvalue=0.9999999942825913, df=22723.00931770228)

In [27]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Attraction_Index"], df1['Person_Capacity'])
print(tukey)


 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower    upper  reject
-----------------------------------------------------
     2      3 -11.3117 0.0181 -21.3567 -1.2668   True
     2      4  -2.4388 0.9033  -9.9743  5.0968  False
     2      5   3.0612  0.973 -10.5546 16.6771  False
     2      6  16.6004 0.0011   4.8589 28.3419   True
     3      4    8.873 0.1493  -1.7139 19.4599  False
     3      5   14.373 0.0847  -1.1411 29.8871  False
     3      6  27.9122    0.0   14.014 41.8104   True
     4      5      5.5 0.8219  -8.5204 19.5204  False
     4      6  19.0392 0.0002   6.8308 31.2476   True
     5      6  13.5392 0.1735  -3.1236  30.202  False
-----------------------------------------------------


In [28]:
df14=df1[df1['Person_Capacity']==3]
df15=df1[df1['Person_Capacity']==6]
df12.head()


Unnamed: 0,City,Price,Day,Room_Type,Shared_Room,Private_Room,Person_Capacity,Superhost,Multiple_Rooms,Business,Cleanliness_Rating,Guest_Satisfaction,Bedrooms,City_Center,Metro_Distance,Attraction_Index,Normalised_Attraction,Restraunt_Index,Normalised_Restraunt_Index
7,Amsterdam,2771.307384,Weekday,Entire home/apt,False,0,4,True,0,0,10,100,3,1,1,208,11,272,18
8,Amsterdam,1001.80442,Weekday,Entire home/apt,False,0,4,False,0,0,9,96,2,3,1,106,5,133,9
10,Amsterdam,909.474375,Weekday,Entire home/apt,False,0,2,False,0,0,10,96,1,1,0,409,21,555,38
12,Amsterdam,675.60284,Weekday,Entire home/apt,False,0,4,False,0,0,8,87,1,2,0,214,11,269,18
13,Amsterdam,552.808567,Weekday,Entire home/apt,False,0,2,True,0,0,10,100,1,1,1,325,17,390,27


In [29]:
from scipy import stats
stats.ttest_ind(df14["Attraction_Index"],df15["Attraction_Index"],equal_var=False,alternative="greater")

TtestResult(statistic=-5.121263913759569, pvalue=0.9999998441821394, df=6967.394357153462)

In [30]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Restraunt_Index"], df1['Room_Type'])
print(tukey)


          Multiple Comparison of Means - Tukey HSD, FWER=0.05           
     group1        group2     meandiff p-adj    lower     upper   reject
------------------------------------------------------------------------
Entire home/apt Private room    5.2191 0.6087   -7.6605   18.0987  False
Entire home/apt  Shared room -197.2904    0.0 -266.2826 -128.2983   True
   Private room  Shared room -202.5095    0.0 -271.9397 -133.0794   True
------------------------------------------------------------------------


In [31]:
df16=df1[df1['Room_Type']=='Entire home/apt']
df17=df1[df1['Room_Type']=='Shared room']

In [32]:
from scipy import stats
stats.ttest_ind(df16["Restraunt_Index"],df17["Restraunt_Index"],equal_var=False,alternative="greater")

TtestResult(statistic=10.476650624926995, pvalue=1.072062134502558e-22, df=333.9852115800485)

In [33]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Normalised_Attraction"], df1['Person_Capacity'])
print(tukey)


Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     2      3  -1.4559    0.0 -1.8122 -1.0995   True
     2      4  -0.9567    0.0  -1.224 -0.6893   True
     2      5   -2.015    0.0 -2.4981  -1.532   True
     2      6  -0.7855    0.0 -1.2021  -0.369   True
     3      4   0.4992 0.0027  0.1236  0.8748   True
     3      5  -0.5592 0.0443 -1.1096 -0.0088   True
     3      6   0.6703 0.0019  0.1773  1.1634   True
     4      5  -1.0584    0.0 -1.5558  -0.561   True
     4      6   0.1711  0.818  -0.262  0.6043  False
     5      6   1.2295    0.0  0.6384  1.8207   True
----------------------------------------------------


In [34]:
df18=df1[df1['Person_Capacity']==3]
df19=df1[df1['Person_Capacity']==6]

In [35]:
from scipy import stats
stats.ttest_ind(df18["Normalised_Attraction"],df19["Normalised_Attraction"],equal_var=False,alternative="greater")

TtestResult(statistic=-3.6685919664814697, pvalue=0.9998771647178014, df=7120.28086504692)

In [36]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Correct parameter order
tukey = pairwise_tukeyhsd(df1["Normalised_Restraunt_Index"], df1['Room_Type'])
print(tukey)


        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
     group1        group2    meandiff p-adj   lower   upper  reject
-------------------------------------------------------------------
Entire home/apt Private room  -2.9454    0.0 -3.4018 -2.4889   True
Entire home/apt  Shared room  -0.1602 0.9871 -2.6051  2.2847  False
   Private room  Shared room   2.7852 0.0218  0.3247  5.2456   True
-------------------------------------------------------------------


In [37]:
df20=df1[df1['Room_Type']=='Private room']
df21=df1[df1['Room_Type']=='Shared room']

In [38]:
from scipy import stats
stats.ttest_ind(df18["Normalised_Restraunt_Index"],df19["Normalised_Restraunt_Index"],equal_var=False,alternative="greater")

TtestResult(statistic=-4.446159118541693, pvalue=0.9999955646182036, df=7327.99525751078)