### Determinants of Airbnb Prices in European Cities
#### 1. Import Data & Libraries 
#### 2. Dropping unnecessary columns
#### 3. Renaming columns
#### 4. Data Quality check
#### 5. Addressing Missing Values
#### 6. Addressing Duplicate values
#### 7. Exporting Dataframe

## 1. Import data & libraries ##

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
## Project root folder ##
# A hard-coded absolute path only resolves on the machine it was written on.
# When the AIRBNB_PROJECT_PATH environment variable is set (and non-empty) it
# takes precedence; otherwise the original location is used, so existing
# setups keep working unchanged.
path = os.environ.get('AIRBNB_PROJECT_PATH') or r'C:\Users\hazem\Master Folder- Airbnb Prices in European Cities'

In [3]:
## Load the raw Airbnb listings ##
# Build the file location first so the read call stays short.
raw_csv_path = os.path.join(path, '02 Data', 'Original Data', 'AirBnb_Europe.csv')
# index_col=False tells pandas not to use the first CSV column as the row index.
df_Airbnb = pd.read_csv(raw_csv_path, index_col = False)

## 2. Dropping unnecessary columns ##

In [4]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
df_Airbnb.shape

(51707, 21)

In [5]:
# List the raw column names to decide which ones to drop and rename.
df_Airbnb.columns

Index(['city', 'days', 'realSum', 'room_type', 'room_shared', 'room_private',
       'person_capacity', 'host_is_superhost', 'multi', 'biz',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist',
       'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index',
       'rest_index_norm', 'lng', 'lat'],
      dtype='object')

In [6]:
## Columns excluded from the analysis ##
columns_to_drop = [
    'room_shared', 'room_private', 'host_is_superhost', 'multi', 'biz',
    'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm',
]
# df_Airbnb is rebound to the reduced frame, so running this cell a second
# time raises KeyError because the listed columns are already gone.
df_Airbnb = df_Airbnb.drop(columns_to_drop, axis = 1)

## 3. Renaming columns ##

In [7]:
## Replace terse source headers with descriptive names ##
# NOTE(review): 'realSum' is renamed to 'accommodation'; presumably it holds
# the listing price — confirm the new name reflects that.
column_renames = {
    'realSum': 'accommodation',
    'guest_satisfaction_overall': 'guest_rating',
    'dist': 'city_distance',
    'metro_dist': 'metro_distance',
    'lng': 'longitude',
    'lat': 'latitude',
}
# Rebind instead of using inplace=True: the resulting frame is the same, but
# the cell reads as an explicit transformation and stays safe to re-run
# (keys that are no longer present are simply ignored by rename).
df_Airbnb = df_Airbnb.rename(columns = column_renames)

In [8]:
# Confirm the drop and rename steps produced the expected column set.
df_Airbnb.columns

Index(['city', 'days', 'accommodation', 'room_type', 'person_capacity',
       'cleanliness_rating', 'guest_rating', 'bedrooms', 'city_distance',
       'metro_distance', 'longitude', 'latitude'],
      dtype='object')

## 4. Data Quality check ##

In [9]:
# Per-column dtypes and non-null counts; a quick check for mixed types or gaps.
df_Airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51707 entries, 0 to 51706
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                51707 non-null  object 
 1   days                51707 non-null  object 
 2   accommodation       51707 non-null  int64  
 3   room_type           51707 non-null  object 
 4   person_capacity     51707 non-null  int64  
 5   cleanliness_rating  51707 non-null  int64  
 6   guest_rating        51707 non-null  int64  
 7   bedrooms            51707 non-null  int64  
 8   city_distance       51707 non-null  float64
 9   metro_distance      51707 non-null  float64
 10  longitude           51707 non-null  float64
 11  latitude            51707 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 4.7+ MB


In [10]:
# Summary statistics for the numeric columns, used to spot implausible
# minimum/maximum values.
df_Airbnb.describe()

Unnamed: 0,accommodation,person_capacity,cleanliness_rating,guest_rating,bedrooms,city_distance,metro_distance,longitude,latitude
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,279.886398,3.161661,9.390624,92.628232,1.15876,3.191628,0.681649,7.426068,45.671128
std,327.947159,1.298545,0.954868,8.945531,0.62741,2.394007,0.858607,9.799725,5.249263
min,35.0,2.0,2.0,20.0,0.0,0.0,0.0,-9.22634,37.953
25%,149.0,2.0,9.0,90.0,1.0,1.5,0.2,-0.0725,41.39951
50%,211.0,3.0,10.0,95.0,1.0,2.6,0.4,4.873,47.50669
75%,320.0,4.0,10.0,99.0,1.0,4.3,0.7,13.518825,51.471885
max,18545.0,6.0,10.0,100.0,10.0,25.3,14.3,23.78602,52.64141


## 5. Addressing Missing Values ##

In [11]:
# Count missing entries per column (isna is the same check as isnull).
df_Airbnb.isna().sum()

city                  0
days                  0
accommodation         0
room_type             0
person_capacity       0
cleanliness_rating    0
guest_rating          0
bedrooms              0
city_distance         0
metro_distance        0
longitude             0
latitude              0
dtype: int64

No missing values found.

## 6. Addressing Duplicate values ##

In [12]:
# Boolean mask flagging each row that repeats an earlier row in every column.
duplicate_mask = df_Airbnb.duplicated()
# Keep only the flagged rows so they can be inspected before removal.
df_Airbnb_dups = df_Airbnb[duplicate_mask]

In [13]:
# Show the flagged duplicate rows for a visual check before they are dropped.
df_Airbnb_dups

Unnamed: 0,city,days,accommodation,room_type,person_capacity,cleanliness_rating,guest_rating,bedrooms,city_distance,metro_distance,longitude,latitude
993,Amsterdam,Weekdays,332,Private room,2,10,99,1,3.7,0.5,4.872,52.343
2303,Athens,Weekdays,82,Private room,2,10,99,1,1.5,0.5,23.730,37.988
2579,Athens,Weekdays,151,Entire home/apt,6,10,100,2,3.2,0.6,23.761,37.996
3104,Athens,Weekdays,110,Entire home/apt,3,10,97,0,1.0,0.3,23.732,37.984
3555,Athens,Weekdays,185,Entire home/apt,6,10,100,1,1.8,0.2,23.719,37.986
...,...,...,...,...,...,...,...,...,...,...,...,...
50740,Vienna,Weekends,290,Entire home/apt,2,10,100,0,2.8,0.2,16.408,48.218
50743,Vienna,Weekends,336,Entire home/apt,4,10,100,1,2.8,0.2,16.408,48.218
50744,Vienna,Weekends,336,Entire home/apt,4,10,100,1,2.8,0.2,16.408,48.218
51452,Vienna,Weekends,319,Entire home/apt,3,10,100,1,2.3,0.2,16.356,48.191


In [14]:
## Remove fully duplicated rows ##
# The defaults are spelled out: compare on all columns and keep the first
# occurrence of each repeated row. The result is bound to a new name, so
# df_Airbnb itself is left untouched and the cell can be re-run safely.
df_Airbnb_clean = df_Airbnb.drop_duplicates(subset = None, keep = 'first')

In [15]:
# (rows, columns) after de-duplication; compare with the earlier row count to
# see how many rows were removed.
df_Airbnb_clean.shape

(51611, 12)

## 7. Exporting Dataframe ##

In [16]:
# Destination for the cleaned data set.
# NOTE(review): this writes into the 'Original Data' folder next to the raw
# CSV — confirm that is intended rather than a separate folder for prepared data.
clean_csv_path = os.path.join(path, '02 Data', 'Original Data', 'Airbnb_Europe_Clean.csv')
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm downstream readers expect it.
df_Airbnb_clean.to_csv(clean_csv_path)