# Project 3: Predicting new Airbnb users' first destinations
# Part 2: Data Cleaning and Exporting to PostgreSQL

In [127]:
import pandas as pd
import matplotlib.pyplot as plt
from patsy import dmatrix
import datetime
import numpy as np

%matplotlib inline

## Load Data

In [63]:
age_gender_bkts = pd.read_csv('./airbnb firstdestinations/age_gender_bkts.csv')

In [6]:
countries = pd.read_csv('./airbnb firstdestinations/countries.csv')

In [8]:
sessions = pd.read_csv('./airbnb firstdestinations/sessions.csv')

In [167]:
users = pd.read_csv('./airbnb firstdestinations/train_users.csv')

In [12]:
age_gender_bkts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 5 columns):
age_bucket                 420 non-null object
country_destination        420 non-null object
gender                     420 non-null object
population_in_thousands    420 non-null float64
year                       420 non-null float64
dtypes: float64(2), object(3)
memory usage: 16.5+ KB


In [81]:
countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
country_destination              10 non-null object
lat_destination                  10 non-null float64
lng_destination                  10 non-null float64
distance_km                      10 non-null float64
destination_km2                  10 non-null float64
destination_language             10 non-null object
language_levenshtein_distance    10 non-null float64
dtypes: float64(5), object(2)
memory usage: 640.0+ bytes


In [87]:
sessions.info(null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10567737 entries, 0 to 10567736
Data columns (total 6 columns):
user_id          10533241 non-null object
action           10488111 non-null object
action_type      9441533 non-null object
action_detail    9441533 non-null object
device_type      10567737 non-null object
secs_elapsed     10431706 non-null float64
dtypes: float64(1), object(5)
memory usage: 483.8+ MB


- missing user_id s (delete?)
- missing actions
- missing acction_types & action_detail
- missing secs_elapsed (delete?)

In [85]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB


- Missing age data
- missing gender data
- missing date_first_active data (going to delete this because if we're predicting the first booking, we won't know the date)
- missing first_affiliate_tracked

In [177]:
users = users.drop('date_first_booking', axis = 1)

In [74]:
countries

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610.0,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
9,US,36.966427,-95.84403,0.0,9826675.0,eng,0.0


In [67]:
sessions

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
5,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703.0
6,d1mm9tcy42,lookup,,,Windows Desktop,115.0
7,d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831.0
8,d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842.0
9,d1mm9tcy42,lookup,,,Windows Desktop,683.0


Possible features:
- total secs_elapsed per user
- total secs_elapsed per user per device
- number of sessions per user
- (number of actions per user)

In [168]:
users.head(20)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


### Cleaning dates, gender, browser and affiliates

Clean:
- dates to datetimes
- unknowns to NaNs in gender, first_browser and untracked to NaNs in first_affiliate_tracked

In [169]:
users['date_account_created'] = pd.to_datetime(users['date_account_created'], infer_datetime_format = True)
users['timestamp_first_active'] = pd.to_datetime(users['timestamp_first_active'], format = '%Y%m%d%H%M%S')

In [170]:
users.gender = users.gender.replace('-unknown-', np.nan)
users.first_browser = users.first_browser.replace('-unknown-', np.nan)
users.first_affiliate_tracked = users.first_affiliate_tracked.replace('untracked',np.nan)

In [171]:
for col in users:
    print(col)
    print(users[col].describe())
    print('-----------------')

id
count         213451
unique        213451
top       9skfdwov2q
freq               1
Name: id, dtype: object
-----------------
date_account_created
count                  213451
unique                   1634
top       2014-05-13 00:00:00
freq                      674
first     2010-01-01 00:00:00
last      2014-06-30 00:00:00
Name: date_account_created, dtype: object
-----------------
timestamp_first_active
count                  213451
unique                 213451
top       2013-07-01 05:26:34
freq                        1
first     2009-03-19 04:32:55
last      2014-06-30 23:58:24
Name: timestamp_first_active, dtype: object
-----------------
date_first_booking
count          88908
unique          1976
top       2014-05-22
freq             248
Name: date_first_booking, dtype: object
-----------------
gender
count     117763
unique         3
top       FEMALE
freq       63041
Name: gender, dtype: object
-----------------
age
count    125461.000000
mean         49.668335
std         1

In [172]:
users.gender.value_counts()

FEMALE    63041
MALE      54440
OTHER       282
Name: gender, dtype: int64

### Cleaning age data

Airbnb age regulations:

'You must be 18 or older to create an account. In order to use the Airbnb site and services, you must be 18 years or older. It's against our Terms of Service to create an account to travel or host unless you're at least 18 years old.'

In [175]:
sorted(users.age.unique())

[nan,
 0.0,
 1.0,
 6.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 115.0]

- If age is less than 18, replace with NaN.
- If age is greater than 150, it's in year of birth format, so take value from 2014 (this data is from 2014).
- If age is less than 150, but greater than 122 (longest known human lifespan), replace with NaN. 

In [174]:
users.loc[users.age < 18,'age'] = np.nan
users.loc[users.age > 150, 'age'] = 2014 - users.age
users.loc[users.age > 122, 'age'] = np.nan

In [178]:
users

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,,,facebook,0,en,direct,direct,,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,MALE,38.0,facebook,0,en,seo,google,,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,FEMALE,56.0,basic,3,en,direct,direct,,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,FEMALE,42.0,facebook,0,en,direct,direct,,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,,41.0,basic,0,en,direct,direct,,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,2010-01-01 21:56:19,,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,2010-01-02 01:25:58,FEMALE,46.0,basic,0,en,other,craigslist,,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,2010-01-03 19:19:05,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,2010-01-04 00:42:11,FEMALE,50.0,basic,0,en,other,craigslist,,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,2010-01-04 02:37:58,,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


### Remove users with 'No Destination Found'
We're interested in predicting first destinations, not whether or not they have booked yet.

In [188]:
users.country_destination.value_counts()

NDF      124543
US        62376
other     10094
FR         5023
IT         2835
GB         2324
ES         2249
CA         1428
DE         1061
NL          762
AU          539
PT          217
Name: country_destination, dtype: int64

In [189]:
users = users[users.country_destination != 'NDF']

In [190]:
users.country_destination.value_counts()

US       62376
other    10094
FR        5023
IT        2835
GB        2324
ES        2249
CA        1428
DE        1061
NL         762
AU         539
PT         217
Name: country_destination, dtype: int64

In [239]:
sessions.device_type.value_counts()

Mac Desktop                         3585886
Windows Desktop                     2648521
iPhone                              2096749
Android Phone                        835991
iPad Tablet                          681836
Android App Unknown Phone/Tablet     272820
-unknown-                            210059
Tablet                               139859
Linux Desktop                         27968
Chromebook                            22272
iPodtouch                              8186
Windows Phone                          2047
Blackberry                              979
Opera Phone                              68
Name: device_type, dtype: int64

## Load into PostgreSQL

In [236]:
from sqlalchemy import create_engine
cnx = create_engine('postgresql://emilygeller:p@54.173.47.58:5432/emilygeller')

In [194]:
#countries.to_sql(name = 'countries', con = cnx, if_exists = 'replace', index = False)
#sessions.to_sql(name = 'sessions', con=  cnx, if_exists = 'replace', index = False)
#users.to_sql(name = 'users', con = cnx, if_exists = 'replace', index = False)
#age_gender_bkts.to_sql(name = 'age_gender_bkts', con = cnx, if_exists = 'replace', index = False)

OperationalError: (psycopg2.OperationalError) server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
 [SQL: 'select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s'] [parameters: {'name': 'countries'}]

In [207]:
sessions = sessions[sessions.user_id.isnull()==False]

In [227]:
from tqdm import tqdm

In [229]:
#batch_size = 1000
#sessions_len = sessions.shape[0]
#for i in tqdm(range(sessions_len//batch_size)):
#    print('-----------')
#    print(i*batch_size, 'to', (i+1)*batch_size-1)
#    print('-----------')
#    sessions.iloc[i*batch_size:(i+1)*batch_size-1,:].to_sql(name = 'sessions', con = cnx, if_exists = 'append', index = False)
#    cursor = (i+1)*batch_size-1
#    print('batch done')
#
#sessions.iloc[cursor:sessions_len,:].to_sql(name = 'sessions', con = cnx, if_exists = 'append', index = False)
#print('done!')

In [216]:
sessions.shape[0]

10533241

In [231]:
sessions.to_csv('sessions_cleaned.csv')

In [232]:
users.to_csv('users_cleaned.csv')

In [234]:
sessions.dtypes

user_id           object
action            object
action_type       object
action_detail     object
device_type       object
secs_elapsed     float64
dtype: object

At this point I exported my dataframes to CSV, converted them to TAR, sent them to my AWS instance and created a PostgreSQL database with them. I tried to do it all from the Jupyter notebook but it was going to take 300 hours for one of the tables!

In [237]:
pd.read_sql('select * from users limit 5;', cnx)

Unnamed: 0,id,user_id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,FEMALE,56.0,basic,3,en,direct,direct,,Web,Windows Desktop,IE,US
1,3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,FEMALE,42.0,facebook,0,en,direct,direct,,Web,Mac Desktop,Firefox,other
2,4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,,41.0,basic,0,en,direct,direct,,Web,Mac Desktop,Chrome,US
3,5,osr2jwljor,2010-01-01,2010-01-01 21:56:19,,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
4,6,lsw9q7uk0j,2010-01-02,2010-01-02 01:25:58,FEMALE,46.0,basic,0,en,other,craigslist,,Web,Mac Desktop,Safari,US


In [238]:
pd.read_sql('select * from sessions limit 5;',cnx)

Unnamed: 0,id,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


## Other

In [75]:
train_users.country_destination.value_counts()

NDF      124543
US        62376
other     10094
FR         5023
IT         2835
GB         2324
ES         2249
CA         1428
DE         1061
NL          762
AU          539
PT          217
Name: country_destination, dtype: int64

In [41]:
cntry = dmatrix("country_destination", age_gender_bkts, return_type = 'dataframe')

In [44]:
age_gender_bkts = age_gender_bkts.join(cntry)

#### Sessions Actions Exploration

In [None]:
def transform_action
# take string, return string if value count > amount.. return other if <

In [73]:
sessions.action_type.value_counts()

view                3560902
data                2103770
click               1996183
-unknown-           1031170
submit               623357
message_post          87103
partner_callback      19132
booking_request       18773
modify                 1139
booking_response          4
Name: action_type, dtype: int64

In [72]:
sessions.action.value_counts()#['active']

show                           2768278
index                           843699
search_results                  725226
personalize                     706824
search                          536057
ajax_refresh_subtotal           487744
update                          365130
similar_listings                364624
social_connections              339000
reviews                         320591
active                          188036
similar_listings_v2             168788
lookup                          162041
create                          155887
dashboard                       152952
header_userpic                  141830
collections                     124417
edit                            109083
campaigns                       105028
track_page_view                  81117
unavailabilities                 78317
qt2                              64651
notifications                    59392
confirm_email                    58726
requested                        57034
identity                 