In [56]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore", category=FutureWarning)
import json
from scipy.stats import skew, kurtosis, normaltest
import plotly.express as px

# **1. załadowałem dane bezpośrednio ze źródła internetowego**

In [57]:
# Inside Airbnb listings export for Copenhagen (the URL points at the 2024-09-28 snapshot).
LISTINGS_URL = "https://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2024-09-28/data/listings.csv.gz"
# The gzip archive is read straight from the remote location into a DataFrame.
df = pd.read_csv(LISTINGS_URL)

# **2. sprawdzam rozmiar danych, ile rekordów, ile kolumn**
**mamy 21 528 wierszy oraz 75 kolumn opisujących dane;
kompletność danych jest zróżnicowana: od kolumn zupełnie pustych (np. "neighbourhood_group_cleansed") po kolumny uzupełnione w 100% (np. "room_type")**

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21528 entries, 0 to 21527
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            21528 non-null  int64  
 1   listing_url                                   21528 non-null  object 
 2   scrape_id                                     21528 non-null  int64  
 3   last_scraped                                  21528 non-null  object 
 4   source                                        21528 non-null  object 
 5   name                                          21528 non-null  object 
 6   description                                   20828 non-null  object 
 7   neighborhood_overview                         8731 non-null   object 
 8   picture_url                                   21528 non-null  object 
 9   host_id                                       21528 non-null 

# **3. wyświetlam próbkę danych (10 pierwszych wierszy) obejmującą wszystkie kolumny, w tym celu zdejmuję ograniczenie dla liczby wyświetlanych kolumn w notatniku (zdjąłem też ograniczenie dotyczące wierszy, przyda się w dalszej części analizy)**

In [59]:
# Lift pandas' display truncation so every column (and, in later cells, every row) is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Preview the first ten listings across all columns.
df.head(10)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,338928,https://www.airbnb.com/rooms/338928,20240928011742,2024-09-28,previous scrape,Art&Design apt in hip Nørrebro - 2BR,"Bright, cozy, and well-furnished 3rd floor des...",The apartment is located in trendy Nørrebro wi...,https://a0.muscache.com/pictures/35e8972e-36a9...,113348,https://www.airbnb.com/users/show/113348,Samy,2010-04-25,"Copenhagen, Denmark","Danish guy living and working in Copenhagen, D...",within an hour,100%,100%,f,https://a0.muscache.com/im/pictures/user/82e23...,https://a0.muscache.com/im/pictures/user/82e23...,Nørrebro,1.0,1.0,"['email', 'phone']",t,t,"Copenhagen, Capital Region of Denmark, Denmark",Nrrebro,,55.69388,12.54725,Entire condo,Entire home/apt,2,,1 bath,1.0,,"[""Drying rack for clothing"", ""Coffee maker: fr...",,3,1125,3,3,1125,1125,3.0,1125.0,,t,0,0,0,0,2024-09-28,98,3,0,2012-07-01,2023-12-10,4.9,4.85,4.92,4.91,4.95,4.9,4.79,,f,1,1,0,0,0.66
1,31094,https://www.airbnb.com/rooms/31094,20240928011742,2024-09-28,previous scrape,"Beautiful, spacious, central, renovated Penthouse","Welcome to our home, we hope you will enjoy Wo...","What else is nearby?<br />To be honest, We thi...",https://a0.muscache.com/pictures/miso/Hosting-...,129976,https://www.airbnb.com/users/show/129976,Ebbe,2010-05-22,"Copenhagen, Denmark","Hi and welcome. My name is Ebbe, I am a medica...",,,,f,https://a0.muscache.com/im/users/129976/profil...,https://a0.muscache.com/im/users/129976/profil...,Vesterbro,1.0,1.0,"['email', 'phone']",t,t,"Copenhagen, Capital Region of Denmark, Denmark",Vesterbro-Kongens Enghave,,55.666602,12.555283,Entire condo,Entire home/apt,6,,1.5 baths,4.0,,"[""Drying rack for clothing"", ""Indoor fireplace...",,3,10,3,3,10,10,3.0,10.0,,t,0,0,0,0,2024-09-28,19,0,0,2010-08-16,2022-08-22,4.88,4.82,4.88,4.87,4.82,4.8,4.53,,f,1,1,0,0,0.11
2,32379,https://www.airbnb.com/rooms/32379,20240928011742,2024-09-28,previous scrape,"155 m2 artist flat on Vesterbro, with 2 bathrooms",You enter a narrow entrance and feel the good ...,"Værnedamsvej area is super hip area, we call i...",https://a0.muscache.com/pictures/miso/Hosting-...,140105,https://www.airbnb.com/users/show/140105,Lise,2010-06-07,"Copenhagen, Denmark",As profession - Set and Costumedesigner for Av...,within a few hours,100%,100%,t,https://a0.muscache.com/im/users/140105/profil...,https://a0.muscache.com/im/users/140105/profil...,Vesterbro,3.0,4.0,"['email', 'phone']",t,t,"Copenhagen, V, Denmark",Vesterbro-Kongens Enghave,,55.672638,12.552493,Entire rental unit,Entire home/apt,4,,2 baths,2.0,,"[""Drying rack for clothing"", ""Host greets you""...",,3,5,3,3,5,5,3.0,5.0,,t,0,0,0,0,2024-09-28,83,2,0,2010-08-23,2024-08-03,4.91,4.96,4.93,4.89,4.91,4.89,4.71,,f,2,1,1,0,0.48
3,32841,https://www.airbnb.com/rooms/32841,20240928011742,2024-09-28,previous scrape,Cozy flat for Adults/Quiet for kids,Our flat is placed in a Central AND Quiet neig...,,https://a0.muscache.com/pictures/28047599/8efd...,142143,https://www.airbnb.com/users/show/142143,Anders & Maria,2010-06-10,"Copenhagen, Denmark","Anders:\r\nHitchhiked 100.000 km's, Been publi...",,,,f,https://a0.muscache.com/im/pictures/user/88f79...,https://a0.muscache.com/im/pictures/user/88f79...,Østerbro,1.0,1.0,"['email', 'phone']",t,t,,sterbro,,55.71176,12.57091,Entire rental unit,Entire home/apt,4,,1 bath,2.0,,"[""Essentials"", ""Heating"", ""Smoke alarm"", ""Fire...",,100,1125,100,100,1125,1125,100.0,1125.0,,t,0,0,0,0,2024-09-28,7,0,0,2010-07-25,2016-09-15,4.57,4.75,4.5,5.0,5.0,4.5,4.5,,f,1,1,0,0,0.04
4,38499,https://www.airbnb.com/rooms/38499,20240928011742,2024-09-28,city scrape,0 min. from everything in Cph.,It doesn't get more central than this. Histori...,Area: <br />This is the eye of Cph. right betw...,https://a0.muscache.com/pictures/hosting/Hosti...,122489,https://www.airbnb.com/users/show/122489,Christina,2010-05-11,"Copenhagen, Denmark","We are...\nCarsten and Christina, and our 3 aw...",within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/5db78...,https://a0.muscache.com/im/pictures/user/5db78...,Indre By,1.0,4.0,"['email', 'phone']",t,t,"Copenhagen, Denmark",Indre By,,55.684288,12.573019,Entire condo,Entire home/apt,6,1.0,1 bath,2.0,2.0,"[""Drying rack for clothing"", ""Host greets you""...","$3,000.00",5,1125,7,14,1125,1125,7.4,1125.0,,t,0,0,0,84,2024-09-28,34,10,2,2010-08-21,2024-09-21,4.94,4.82,4.89,4.82,5.0,4.93,4.89,,f,1,1,0,0,0.2
5,39055,https://www.airbnb.com/rooms/39055,20240928011742,2024-09-28,city scrape,Stylish apartment in central Copenhagen,"Big, bright, airy and attractive apartment dec...",The flat is excellently located in Islands Bry...,https://a0.muscache.com/pictures/hosting/Hosti...,167511,https://www.airbnb.com/users/show/167511,Rikke,2010-07-15,"Copenhagen, Denmark",Experienced airbnb host with a lovely apartmen...,within an hour,100%,64%,f,https://a0.muscache.com/im/pictures/user/a700a...,https://a0.muscache.com/im/pictures/user/a700a...,Islands Brygge,1.0,1.0,"['email', 'phone']",t,t,"Copenhagen, Hovedstaden, Denmark",Amager Vest,,55.66507,12.58315,Entire condo,Entire home/apt,4,2.0,2 baths,2.0,2.0,"[""Drying rack for clothing"", ""Smoke alarm"", ""C...","$2,143.00",2,30,1,2,30,30,2.0,30.0,,t,14,20,45,312,2024-09-28,105,13,2,2010-08-14,2024-09-09,4.83,4.84,4.98,4.9,4.94,4.73,4.66,,f,1,1,0,0,0.61
6,338992,https://www.airbnb.com/rooms/338992,20240928011742,2024-09-28,city scrape,Room with a view over the lakes,Please notice:<br />- Our two rooms are in our...,"We live in Vesterbro, which is one of the most...",https://a0.muscache.com/pictures/15de73bc-a869...,799887,https://www.airbnb.com/users/show/799887,Bente,2011-07-10,"Copenhagen, Denmark","I am a young woman, in the middle of the fifti...",within an hour,100%,100%,t,https://a0.muscache.com/im/users/799887/profil...,https://a0.muscache.com/im/users/799887/profil...,Vesterbro,3.0,3.0,"['email', 'phone']",t,t,"Copenhagen, Denmark",Vesterbro-Kongens Enghave,,55.67365,12.55581,Private room in rental unit,Private room,1,1.0,1 shared bath,2.0,1.0,"[""Essentials"", ""Hangers"", ""Heating"", ""Smoke al...","$2,000.00",1,5,1,1,5,5,1.0,5.0,,t,0,0,20,20,2024-09-28,377,1,0,2012-03-17,2024-06-13,4.58,4.65,4.5,4.79,4.69,4.85,4.58,,t,3,0,3,0,2.47
7,339126,https://www.airbnb.com/rooms/339126,20240928011742,2024-09-28,city scrape,Bright vesterbro design apartment with big bal...,(Brand New kitchen installed on 1st of May 202...,Right next to the apartment you will find Værn...,https://a0.muscache.com/pictures/miso/Hosting-...,1589153,https://www.airbnb.com/users/show/1589153,Michael & Bille Max,2012-01-10,"Copenhagen, Denmark",Michael is a Creative Director. Bille Max is a...,within an hour,100%,97%,t,https://a0.muscache.com/im/pictures/user/dd23b...,https://a0.muscache.com/im/pictures/user/dd23b...,Vesterbro,2.0,2.0,"['email', 'phone']",t,t,"Copenhagen, Capital Region of Denmark, Denmark",Vesterbro-Kongens Enghave,,55.67284,12.54717,Entire condo,Entire home/apt,5,1.5,1.5 baths,2.0,3.0,"[""Drying rack for clothing"", ""Sun loungers"", ""...","$2,214.00",3,1125,3,3,1125,1125,3.0,1125.0,,t,9,23,26,275,2024-09-28,97,13,1,2012-06-04,2024-09-17,4.89,4.92,4.77,4.88,4.95,4.92,4.69,,f,1,1,0,0,0.65
8,341822,https://www.airbnb.com/rooms/341822,20240928011742,2024-09-28,city scrape,Charming & historic building in the heart of Cph!,Come and spend some time in one of Copenhagen'...,The apartment is only 150 m. from the famous p...,https://a0.muscache.com/pictures/4049612/5ceb2...,1734238,https://www.airbnb.com/users/show/1734238,Ma,2012-02-11,"Copenhagen, Denmark","Hi \nI am a happy, easy going Dane, who loves ...",within a day,70%,39%,f,https://a0.muscache.com/im/pictures/user/f8de1...,https://a0.muscache.com/im/pictures/user/f8de1...,Indre By,1.0,1.0,"['email', 'phone']",t,t,"Copenhagen, Capital Region of Denmark, Denmark",Indre By,,55.68207,12.57336,Entire rental unit,Entire home/apt,2,1.0,1 bath,1.0,1.0,"[""Drying rack for clothing"", ""Clothing storage...","$1,380.00",2,4,2,2,4,4,2.0,4.0,,t,24,54,84,357,2024-09-28,220,12,1,2012-03-01,2024-09-06,4.73,4.82,4.91,4.72,4.83,4.97,4.7,,f,1,1,0,0,1.44
9,343137,https://www.airbnb.com/rooms/343137,20240928011742,2024-09-28,city scrape,Childfriendly penthouse with private terrace,Large penthouse apartment (150 m2) centrally l...,Quit & cosy but yet close to everything,https://a0.muscache.com/pictures/hosting/Hosti...,1741097,https://www.airbnb.com/users/show/1741097,Josephine,2012-02-12,"Copenhagen, Denmark","Hi, \r\nI’m Josephine - living in Copenhagen ...",a few days or more,0%,83%,f,https://a0.muscache.com/im/pictures/user/02ee7...,https://a0.muscache.com/im/pictures/user/02ee7...,Frederiksberg,1.0,1.0,"['email', 'phone']",t,f,"Frederiksberg, Capital Region of Denmark, Denmark",Frederiksberg,,55.67761,12.54673,Entire rental unit,Entire home/apt,4,1.0,1 bath,3.0,4.0,"[""Drying rack for clothing"", ""Host greets you""...","$1,995.00",3,14,3,3,14,14,3.0,14.0,,t,0,0,0,180,2024-09-28,31,2,0,2012-05-04,2024-07-31,4.97,4.87,4.81,4.94,4.9,4.97,4.71,,f,1,1,0,0,0.21


# **4. weryfikuję typy danych**

*   całkowite (int64) np: 'id'
*   zmiennoprzecinkowe (float64) np: 'host_listings_count'
*   kategoryczne np: 'host_response_time'
*   logiczne: najlepiej nadają się kolumny zawierające dane typu tak/nie; tutaj są to 'host_is_superhost', 'host_identity_verified', 'instant_bookable' oraz 'has_availability' - zrobię mapowanie na TRUE/FALSE i typecasting na bool
*   daty: najlepiej odwzorowują je kolumny 'first_review' oraz 'last_review' - tutaj dokonam typecastingu na datetime

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21528 entries, 0 to 21527
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            21528 non-null  int64  
 1   listing_url                                   21528 non-null  object 
 2   scrape_id                                     21528 non-null  int64  
 3   last_scraped                                  21528 non-null  object 
 4   source                                        21528 non-null  object 
 5   name                                          21528 non-null  object 
 6   description                                   20828 non-null  object 
 7   neighborhood_overview                         8731 non-null   object 
 8   picture_url                                   21528 non-null  object 
 9   host_id                                       21528 non-null 

**weryfikuję czy dane są importowane w różnych okresach**

In [61]:
df['scrape_id'].unique(), df['last_scraped'].unique(), df['source'].unique()

(array([20240928011742]),
 array(['2024-09-28', '2024-09-30'], dtype=object),
 array(['previous scrape', 'city scrape'], dtype=object))

**widzimy, że daty importu to 28/09 oraz 30/09 2024 - można uznać że są z tego samego okresu**

**ustawiam indeks na kolumnie 'id'**

In [62]:
# Use the listing id as the row index (rebinding df rather than mutating in place).
df = df.set_index('id')

**sprawdzam jakie wartości są w kolumnie 'host_response_time'**

In [63]:
df['host_response_time'].unique()

array(['within an hour', nan, 'within a few hours', 'within a day',
       'a few days or more'], dtype=object)

**zmieniam dane w kolumnie 'host_response_time' na dane kategoryczne**

In [64]:
# Store the response-time labels as a pandas categorical; all other columns keep their dtype.
df = df.astype({'host_response_time': 'category'})

**zmieniam dane w kolumnach 'host_response_rate' i 'host_acceptance_rate' na dane zmiennoprzecinkowe (float64), muszę wykasować znak % na końcu danych**

In [65]:
# Convert percentage strings such as "97%" into fractions (0.97).
# rstrip('%') removes only a trailing percent sign, so a value that happens to
# lack one keeps all of its digits; blind slicing with [:-1] would silently
# drop the last digit in that case.
for rate_col in ['host_response_rate', 'host_acceptance_rate']:
    df[rate_col] = df[rate_col].str.rstrip('%').astype('float64') / 100.0

**weryfikuję jakie wartości są w kolumnie 'host_is_superhost'**

In [66]:
df['host_is_superhost'].unique()

array(['f', 't', nan], dtype=object)

**weryfikuję jakie wartości są w kolumnie 'host_identity_verified'**

In [67]:
df['host_identity_verified'].unique()

array(['t', 'f', nan], dtype=object)

**weryfikuję jakie wartości są w kolumnie 'has_availability'**

In [68]:
df['has_availability'].unique()

array(['t', nan, 'f'], dtype=object)

**weryfikuję jakie wartości są w kolumnie 'instant_bookable'**

In [69]:
df['instant_bookable'].unique()

array(['f', 't'], dtype=object)

we wszystkich 4 kolumnach występują wyłącznie wartości t, f lub nan (czyli brak danych) - kolumny nadają się do zmapowania na wartości logiczne TRUE/FALSE

**dokonuję mapowania danych w tych kolumnach: t na TRUE, f na FALSE (braki danych traktuję jako FALSE)**

In [70]:
# Turn the 't'/'f' flag columns into real boolean columns.
# Comparing against 't' yields a bool-dtype Series directly: 't' -> True, while
# 'f' and missing values -> False. That is the same outcome as mapping
# {'t': True, 'f': False} and filling NaN with False, but it does not depend on
# fillna's deprecated silent object -> bool downcast to end up with a bool dtype.
for flag_col in ['host_is_superhost', 'host_identity_verified',
                 'has_availability', 'instant_bookable']:
    df[flag_col] = df[flag_col].eq('t')

**zmieniam typ danych na daty dla kolumn 'first_review' oraz 'last_review'**

In [71]:
# Parse both review-date columns into datetimes, one column at a time.
review_date_cols = ['first_review', 'last_review']
df[review_date_cols] = df[review_date_cols].apply(pd.to_datetime)

**sprawdzam wartości w kolumnie 'price'**

In [72]:
df['price'].unique()

array([nan, '$3,000.00', '$2,143.00', ..., '$5,014.00', '$2,530.00',
       '$602.00'], dtype=object)

**zamieniam dane w kolumnie 'price' na float64, wycinam znak $ oraz przecinek (separator tysięcy)**


In [73]:
# Remove the currency symbol and the thousands separators, then parse the price as a float.
price_digits = df['price'].str.replace('[$,]', '', regex=True)
df['price'] = price_digits.astype('float')

**weryfikuję wartości w kolumnie 'host_verifications'**

In [74]:
df['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone', 'work_email']", "['phone']", "['email']",
       "['email', 'phone', 'photographer']", '[]',
       "['email', 'work_email']", nan], dtype=object)

**dokonuję rozdzielenia wartości zapisanych jako ciąg tzn. dokonuję rozbicia na listę w jednym wpisie**


In [75]:
# Strip the list punctuation (brackets, commas, single quotes) from the stringified
# list, then split on whitespace so each row holds a Python list of tokens.
punctuation_free = df['host_verifications'].str.replace(r"[\[\],']", '', regex=True)
df['host_verifications'] = punctuation_free.str.split()

**sprawdzam jakie udogodnienia są oferowane, kolumna 'amenities'**

In [76]:
df['amenities'].unique()

array(['["Drying rack for clothing", "Coffee maker: french press, pour-over coffee", "Smoke alarm", "Conditioner", "Laundromat nearby", "Cooking basics", "Luggage dropoff allowed", "Iron", "Outdoor furniture", "Kitchen", "Coffee", "Shared backyard \\u2013 Fully fenced", "Private entrance", "Essentials", "Room-darkening shades", "BBQ grill: charcoal", "Refrigerator", "Dining table", "Outdoor dining area", "Hot water kettle", "Wifi", "Bed linens", "Oven", "Sonos  sound system", "Courtyard view", "Blender", "Microwave", "Long term stays allowed", "Gas stove", "Paid dryer \\u2013 In building", "Paid washer \\u2013 In building", "Self check-in", "Wine glasses", "Lockbox", "Central heating", "Paid parking on premises", "EV charger", "Hair dryer", "48 inch HDTV with Chromecast", "Shampoo", "Cleaning products", "Freezer", "Dishes and silverware", "Paid parking off premises", "Shower gel", "Toaster", "Baking sheet", "Hot water"]',
       '["Drying rack for clothing", "Indoor fireplace: wood-bur

**dokonuję rozbicia wpisów w kolumnie 'amenities' z jednego stringa na listę**

In [77]:
# Each amenities entry is a JSON array encoded as a string; decode it element-wise into a list.
df['amenities'] = df['amenities'].map(json.loads)

**sprawdzam, ile jest łazienek - dane w kolumnie 'bathrooms_text'**

In [78]:
df['bathrooms_text'].unique()

array(['1 bath', '1.5 baths', '2 baths', '1 shared bath',
       '2 shared baths', '3 baths', '0 baths', '2.5 baths', 'Half-bath',
       nan, '0 shared baths', '1 private bath', 'Shared half-bath',
       'Private half-bath', '3.5 baths', '1.5 shared baths', '5 baths',
       '4 baths', '2.5 shared baths', '3 shared baths', '8 baths'],
      dtype=object)

**tworzę kolumnę 'bathrooms_num', która będzie zawierać info o liczbie łazienek w formie liczby**

In [79]:
# Derive a numeric bathroom count from the free-text description.
bath_text = df['bathrooms_text']
# Any "half-bath" variant (plain, shared or private) counts as 0.5.
is_half_bath = bath_text.str.contains('half', case=False, na=False)
# Otherwise the leading token of the text ("1", "1.5", ...) is the count; missing text stays NaN.
leading_token = bath_text.str.split().str[0]
df['bathrooms_num'] = leading_token.mask(is_half_bath, 0.5).astype('float')

# **5. tworzę podsumowanie zmiennych**

**dla zmiennych numerycznych**

In [80]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
scrape_id,21528.0,20240928011742.0,20240928011742.0,20240928011742.0,20240928011742.0,20240928011742.0,20240928011742.0,8.918176
host_id,21528.0,149171334.234857,513.0,18027326.75,67437682.5,222707655.75,653826605.0,172895246.133016
host_response_rate,14058.0,0.900934,0.0,1.0,1.0,1.0,1.0,0.236245
host_acceptance_rate,17163.0,0.67288,0.0,0.44,0.75,1.0,1.0,0.326713
host_listings_count,21527.0,6.084499,1.0,1.0,1.0,1.0,837.0,35.899543
host_total_listings_count,21527.0,12.662749,1.0,1.0,1.0,2.0,1671.0,85.628249
neighbourhood_group_cleansed,0.0,,,,,,,
latitude,21528.0,55.680596,55.61566,55.666318,55.68195,55.696133,55.73247,0.019181
longitude,21528.0,12.558559,12.454,12.54075,12.55514,12.58049,12.63972,0.031197
accommodates,21528.0,3.319863,1.0,2.0,3.0,4.0,16.0,1.64343


**oraz podsumowanie dla zmiennych kategorycznych**

In [81]:
df.describe(exclude=np.number).T.loc[:, 'count':'freq']

Unnamed: 0,count,unique,top,freq
listing_url,21528,21528.0,https://www.airbnb.com/rooms/338928,1.0
last_scraped,21528,2.0,2024-09-28,21527.0
source,21528,2.0,city scrape,13098.0
name,21528,19434.0,Lejlighed i København,64.0
description,20828,20222.0,Enjoy a stylish experience in this centrally l...,44.0
neighborhood_overview,8731,8391.0,Nørrebro is a true melting pot of all things d...,12.0
picture_url,21528,21364.0,https://a0.muscache.com/pictures/950aadd4-4c99...,11.0
host_url,21528,19558.0,https://www.airbnb.com/users/show/187610263,235.0
host_name,21527,5599.0,ApartmentinCopenhagen,235.0
host_since,21527,4534.0,2018-05-03,237.0


# **6. sprawdzam ile jest braków w kolumnach w ramce danych**

In [82]:
df.isnull().sum()

Unnamed: 0,0
listing_url,0
scrape_id,0
last_scraped,0
source,0
name,0
description,700
neighborhood_overview,12797
picture_url,0
host_id,0
host_url,0


# **7. dokonuję wizualizacji rozkładu wybranych kolumn**

**liczba łazienek, kolumna 'bathrooms_num'**

In [83]:
fig = px.histogram(df, x="bathrooms_num")
fig.show()

In [84]:
normaltest(df['bathrooms_num'].dropna())

NormaltestResult(statistic=16151.216274462771, pvalue=0.0)

**wyliczam skośność**

In [85]:
skew(df['bathrooms_num'].dropna())

3.2911072586736068

**mamy tu do czynienia z dodatnią skośnością, więc rozkład jest asymetryczny w prawo, większość danych znajduje się po lewej stronie średniej**

**wyliczam kurtozę**

In [86]:
kurtosis(df['bathrooms_num'].dropna())

23.352967983110734

**pokazuje, że mamy do czynienia z leptokurtycznym rozkładem, rozkład ma spiczasty wierzchołek i dłuższy ogon niż rozkład normalny**

**liczba sypialni, kolumna 'bedrooms'**

In [87]:
fig = px.histogram(df, x="bedrooms")
fig.show()

In [88]:
normaltest(df['bedrooms'].dropna())

NormaltestResult(statistic=4913.4181409153725, pvalue=0.0)

**wyliczam skośność**

In [89]:
skew(df['bedrooms'].dropna())

1.4025781687108185

**mamy tu do czynienia z dodatnią skośnością, więc rozkład jest asymetryczny w prawo, większość danych znajduje się po lewej stronie średniej**

**wyliczam kurtozę**

In [90]:
kurtosis(df['bedrooms'].dropna())

1.9392323865537024

**pokazuje, że mamy do czynienia z leptokurtycznym rozkładem, rozkład ma spiczasty wierzchołek i dłuższy ogon niż rozkład normalny**

**liczba ocen miesięcznie, kolumna 'reviews_per_month'**

In [91]:
fig = px.histogram(df, x='reviews_per_month', nbins=100)
fig.show()

In [92]:
normaltest(df['reviews_per_month'].dropna())

NormaltestResult(statistic=26851.808322626362, pvalue=0.0)

**wyliczam skośność**

In [93]:
skew(df['reviews_per_month'].dropna())

7.81447236137666

**mamy tu do czynienia z dodatnią skośnością, więc rozkład jest asymetryczny w prawo, większość danych znajduje się po lewej stronie średniej**

**wyliczam kurtozę**

In [94]:
kurtosis(df['reviews_per_month'].dropna())

180.04165981968617

**pokazuje, że mamy do czynienia z leptokurtycznym rozkładem, rozkład ma spiczasty wierzchołek i dłuższy ogon niż rozkład normalny**

**czas odpowiedzi od gospodarza, kolumna 'host_response_time'**

In [95]:
fig = px.histogram(df, x='host_response_time')
fig.show()

# **8. czyszczę dane**

**wykasowuję zbędne kolumny, które mają zbyt wiele braków, lub są niepotrzebne przy analizie**

In [96]:
# Discard columns that are too sparse or not needed for the analysis.
unused_columns = [
    'calendar_updated', 'neighbourhood_group_cleansed', 'host_neighbourhood',
    'host_about', 'host_has_profile_pic', 'neighbourhood',
    'neighborhood_overview', 'license', 'bathrooms',
]
df = df.drop(columns=unused_columns)

In [97]:
df.isnull().sum()

Unnamed: 0,0
listing_url,0
scrape_id,0
last_scraped,0
source,0
name,0
description,700
picture_url,0
host_id,0
host_url,0
host_name,1


**uzupełniam braki w kolumnach wartościami mediany dla poszczególnych kolumn numerycznych**

In [98]:
# Fill missing response rates with the column median.
response_rate_median = df['host_response_rate'].median()
df['host_response_rate'] = df['host_response_rate'].fillna(response_rate_median)

**weryfikuję czy już brak wartości nan**

In [99]:
df['host_response_rate'].isna().sum()

0

In [100]:
# Median-impute the remaining numeric columns in a single pass instead of one
# copy-pasted cell per column; each column is filled with its own median.
# NOTE(review): 'price' is imputed here as well, and it is the variable that is
# trimmed and plotted afterwards - confirm that filling it is intended.
median_filled_cols = [
    'host_acceptance_rate', 'beds', 'price',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
]
for col in median_filled_cols:
    df[col] = df[col].fillna(df[col].median())

**uzupełniam braki w kolumnach z danymi kategorycznymi wartością najczęściej występującą**

In [111]:
# Replace missing response times with the most frequent category
# (value_counts() sorts by frequency, so its first label is the most common one).
most_common_response_time = df['host_response_time'].value_counts().index[0]
df['host_response_time'] = df['host_response_time'].fillna(most_common_response_time)

**weryfikuję czy już brak wartości nan**

In [112]:
df['host_response_time'].isna().sum()

0

**weryfikuję ile teraz jest braków w ramce danych**

In [113]:
df.isnull().sum()

Unnamed: 0,0
listing_url,0
scrape_id,0
last_scraped,0
source,0
name,0
description,700
picture_url,0
host_id,0
host_url,0
host_name,1


**sprawdzam jak kształtują się liczności poszczególnych cen**

In [114]:
fig = px.histogram(df, x="price", nbins=100)
fig.show()

**widać, że mamy do czynienia z outlier'ami, dlatego przytnę dane**

In [115]:
# Trim price outliers: keep listings priced above 10 and at or below the 95th percentile.
price_p95 = np.percentile(df['price'], 95)
# NOTE(review): the message labels the amount with '$', mirroring the raw price
# strings; confirm the actual currency of this dataset.
print("95% ma cenę wynajmu niższą niż {0: .2f} $".format(price_p95))
price_in_range = (df['price'] <= price_p95) & (df['price'] > 10)
df = df[price_in_range]

95% ma cenę wynajmu niższą niż  2320.00 $


In [116]:
fig = px.histogram(df, x="price", nbins=100)
fig.show()

# **9. zbadam zależności między zmiennymi**

**tworzę wykres pokazujący zależności między ogólną oceną a oceną za komunikację**

In [117]:
fig = px.scatter(df, x='review_scores_rating', y='review_scores_communication')
fig.show()

**widać, że im wyższa ocena za komunikację, tym zazwyczaj wyższa jest ocena ogólna**

**tworzę macierz korelacji ocen za różne rzeczy**

In [118]:
# Pairwise correlations between the review sub-scores and the two host rates.
score_and_rate_cols = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_location',
    'review_scores_value',
    'review_scores_communication',
    'host_response_rate',
    'host_acceptance_rate',
]
corr = df[score_and_rate_cols].corr()
corr

Unnamed: 0,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_location,review_scores_value,review_scores_communication,host_response_rate,host_acceptance_rate
review_scores_rating,1.0,0.740108,0.694813,0.512167,0.423997,0.727419,0.612308,0.045771,-0.023156
review_scores_accuracy,0.740108,1.0,0.603752,0.460687,0.391094,0.665722,0.552213,0.048852,-0.01209
review_scores_cleanliness,0.694813,0.603752,1.0,0.387624,0.304073,0.608374,0.439324,0.067256,0.022521
review_scores_checkin,0.512167,0.460687,0.387624,1.0,0.294714,0.454154,0.575273,0.040365,-0.01656
review_scores_location,0.423997,0.391094,0.304073,0.294714,1.0,0.428142,0.332335,-0.004102,-0.029989
review_scores_value,0.727419,0.665722,0.608374,0.454154,0.428142,1.0,0.524336,0.04652,-0.0106
review_scores_communication,0.612308,0.552213,0.439324,0.575273,0.332335,0.524336,1.0,0.064614,-0.013655
host_response_rate,0.045771,0.048852,0.067256,0.040365,-0.004102,0.04652,0.064614,1.0,0.443415
host_acceptance_rate,-0.023156,-0.01209,0.022521,-0.01656,-0.029989,-0.0106,-0.013655,0.443415,1.0


**tworzę mapę cieplną pokazującą korelację między ocenami za poszczególne aspekty**

In [119]:
fig = px.imshow(corr, text_auto=True, aspect="auto")
fig.show()

**z zaprezentowanych danych widać np: dużą korelację między 'review_scores_rating' a 'review_scores_accuracy', a niską korelację między np: 'review_scores_checkin' a 'review_scores_location'**