# Data cleaning and data wrangling on Listing Reviews database
## 1. Loading data

In [1]:
# import pandas
import pandas as pd

In [2]:
# Read the gzip-compressed reviews CSV into a dataframe.
# `on_bad_lines='skip'` silently drops malformed rows instead of raising; it
# replaces `error_bad_lines=False`, which was deprecated in pandas 1.3 and
# removed in pandas 2.0 (requires pandas >= 1.3).
# `low_memory=False` parses the file in a single pass so that column dtypes
# are inferred consistently rather than chunk by chunk.
df = pd.read_csv(
    'reviews.csv.gz',
    compression='gzip',
    header=0,
    sep=',',
    quotechar='"',
    on_bad_lines='skip',
    low_memory=False,
)

In [3]:
# summarize the loaded reviews: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1153623 entries, 0 to 1153622
Data columns (total 6 columns):
listing_id       1153623 non-null int64
id               1153623 non-null int64
date             1153623 non-null object
reviewer_id      1153623 non-null int64
reviewer_name    1153623 non-null object
comments         1153295 non-null object
dtypes: int64(3), object(3)
memory usage: 52.8+ MB


## 2. Dealing with missing values
Only the 'comments' column has missing values. They account for about 0.03% of the rows, so it is safe to drop them.

In [4]:
# Drop only the rows whose review text is missing. Restricting the check to
# 'comments' keeps the code aligned with the stated intent: a null appearing
# in any other column will no longer cause rows to be discarded silently.
df = df.dropna(subset=['comments'])

## 3. Data Transformation
I will convert the 'date' column to datetime format and add a boolean column indicating whether the comment is an automated message stating that the host canceled the reservation.

In [5]:
# Convert the 'date' column from strings to datetime64.
# The column is rebound via `assign` instead of `df.loc[:, 'date'] = ...`:
# since pandas 2.0 a full-column `.loc` assignment writes the values in place
# and keeps the existing object dtype, so the column would never actually
# become datetime64. `assign` returns a new frame, which also avoids
# SettingWithCopyWarning on a frame derived from an earlier filter.
df = df.assign(date=pd.to_datetime(df['date']))

In [6]:
# flag reviews whose text begins with the automated
# host-cancellation message
df['canceled_res'] = df['comments'].str.startswith('The host canceled')

Lastly, I will drop the reviews of listings that were removed from the general listings database, to keep the two datasets consistent.

In [7]:
# read the headerless file of retained listing ids and discard its first
# column (label 0); column 1 is used as the merge key further down
ids = pd.read_csv('listing_ids.csv', header=None).drop(columns=0)

In [8]:
# keep only the reviews whose listing_id appears among the retained ids
# (merge defaults to an inner join), then discard the redundant key column 1
reviews_cleaned = (
    ids
    .merge(df, left_on=1, right_on='listing_id')
    .drop(columns=1)
)

In [9]:
# persist the filtered reviews as a gzip-compressed CSV
# (the dataframe index is written as the first column, pandas' default)
reviews_cleaned.to_csv(
    path_or_buf='reviews_cleaned.csv.gz',
    compression='gzip',
)