# Data Cleaning File

In [4]:
#Read in libraries
import dask.dataframe as dd
import swifter

import pandas as pd

import re

import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#supress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
#Set path to get aggregated Calendar data
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\SF Airbnb Raw Data - Aggregated\SF_Calendar_Nov2018_Oct2019.csv'

#Parse dates
parse_dates = ['date']

#Read in calendar and reviews data
calendar = pd.read_csv(path, sep = '\t', parse_dates=parse_dates,index_col=0, low_memory=False)

## Calendar Data

In [6]:
#Original calendar shape
print('Original calendar shape:', calendar.shape)

calendar.dtypes

Original calendar shape: (15901917, 7)


adjusted_price            object
available                 object
date              datetime64[ns]
listing_id                 int64
maximum_nights           float64
minimum_nights           float64
price                     object
dtype: object

In [7]:
calendar.head(2)

Unnamed: 0,adjusted_price,available,date,listing_id,maximum_nights,minimum_nights,price
0,$80.00,f,2019-04-03,187730,120.0,3.0,$80.00
1,$80.00,f,2019-04-04,187730,120.0,3.0,$80.00


### Remove Duplicate Rows

In [8]:
# #Remove any duplicate rows
# calendar.drop_duplicates(inplace=True)

# #Updated calendar shape
# print('Updated calendar shape:', calendar.shape)

#### Calendar Outlier Removal(Price)

In [9]:
#Plot data
calendar.price.plot(kind = 'hist', figsize = (10,6.5),
                    legend = True, title = 'Price Distribution Calendar Data')

#Capture plot object
ax = plt.gca()

#Format x-axis
ax.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "${:,}".format(int(x))))

#Format grid lines
ax.grid(b = False, which ='major', axis = 'x')

TypeError: no numeric data to plot

**Identify and remove outliers using IQR

In [None]:
#Original Calendar Shape
print(calendar.shape)

In [None]:
#Calculate IQR of price
q25 = calendar['price'].quantile(0.25)
q75= calendar['price'].quantile(0.75)
iqr = q75 - q25

#Print percentiles
print('Percentiles: 25th={:.3f}, 75th={:.3f} \nIQR= {:.3f}'.format(q25, q75, iqr))

#Calculate outlier cutoffs
cut_off =1.5 * iqr
lower, upper = q25 - cut_off, q75 + iqr

#Identify outliers
outliers = [x for x in calendar.price if x < lower or x > upper]
print("Number of outliers identified: {}".format(len(outliers)))

#Remove outliers
outliers_removed = [x for x in calendar.price if x >= lower and x <= upper]
print('Non-outlier observations: {}'.format(len(outliers_removed)))

#Update df
calendar = calendar[calendar.price.isin(outliers_removed)]
calendar.shape

In [None]:
#Plot updated prices from calendar data
calendar.price.plot(kind = 'hist',  title = 'Calendar price distribution Updated', label = 'PRICE',
                   legend = True)

#Get plot object
ax = plt.gca()

#Capture mean and median
mean = np.mean(calendar.price)
median = np.median(calendar.price)

#Plot mean and median to histogram
ax.axvline(mean, color='k', linestyle='dashed', linewidth=2.5, label = "AVG ${:}".format(str(round(mean,2))))
ax.axvline(median, color='g', linestyle='solid', linewidth=2.5, label = "MED ${:}".format(str(round(median,2))))

#Format x-axis
ax.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "${:,}".format(int(x))))

#Format y ticks
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

#Mute vertical grid lines
ax.grid(b = False, which ='major', axis = 'x')

#Show legend
plt.legend(frameon = True, loc='top right')

In [None]:
#Set path to write listings
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\02_Intermediate\calendar_cleaned.csv'

#Write listings to path
calendar.to_csv(path, sep='\t')