Goal: Create a cleaned development dataset you can use to complete the modeling step of your project.

Steps:
1) Create dummy or indicator features for categorical variables.
2) Standardize the magnitude of numeric features using a scaler.
3) Split into testing and training datasets.

In [25]:
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pprint
import numpy as np
import seaborn as sns
from scipy.stats import zscore
from sklearn import preprocessing
%matplotlib inline

In [26]:
# Load the raw booking records (path is relative to the notebook's working
# directory) and preview the first rows.
booking_csv = 'Passanger_booking_data.csv'
df = pd.read_csv(booking_csv)
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,1,Internet,RoundTrip,21,12,6,Tue,AKLHGH,Australia,0,0,0,7.21,1
1,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
2,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
3,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
4,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0


In [27]:
# (rows, columns) of the freshly loaded frame.
df.shape

(50002, 14)

In [28]:
# Column labels of the freshly loaded frame.
df.columns

Index(['num_passengers', 'sales_channel', 'trip_type', 'purchase_lead',
       'length_of_stay', 'flight_hour', 'flight_day', 'route',
       'booking_origin', 'wants_extra_baggage', 'wants_preferred_seat',
       'wants_in_flight_meals', 'flight_duration', 'booking_complete'],
      dtype='object')

# Step1): Create dummy or indicator features for categorical variables

Create dummy variables for **trip_type**. Add the dummies back to the dataframe and remove the original column for **trip_type**

In [29]:
# One-hot encode trip_type: build one indicator column per category, then
# replace the original text column with those indicators (appended on the right).
dfo = df['trip_type']
dummy_trip_type = pd.get_dummies(dfo)
df = pd.concat([df.drop(columns='trip_type'), dummy_trip_type], axis=1)
df.head()

Unnamed: 0,num_passengers,sales_channel,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,CircleTrip,OneWay,RoundTrip
0,1,Internet,21,12,6,Tue,AKLHGH,Australia,0,0,0,7.21,1,0,0,1
1,2,Internet,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0,0,0,1
2,1,Internet,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0,0,0,1
3,2,Internet,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0,0,0,1
4,1,Internet,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0,0,0,1


In [34]:
# One-hot encode sales_channel: build one indicator column per category, then
# replace the original text column with those indicators (appended on the right).
dfo = df['sales_channel']
dummy_sales_channel = pd.get_dummies(dfo)
df = pd.concat([df.drop(columns='sales_channel'), dummy_sales_channel], axis=1)
df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,CircleTrip,OneWay,RoundTrip,Internet,Mobile
0,1,21,12,6,Tue,AKLHGH,Australia,0,0,0,7.21,1,0,0,1,1,0
1,2,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0,0,0,1,1,0
2,1,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0,0,0,1,1,0
3,2,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0,0,0,1,1,0
4,1,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0,0,0,1,1,0


In [35]:
# One-hot encode flight_day: build one indicator column per weekday, then
# replace the original text column with those indicators (appended on the right).
dfo = df['flight_day']
dummy_flight_day = pd.get_dummies(dfo)
df = pd.concat([df.drop(columns='flight_day'), dummy_flight_day], axis=1)
df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,...,RoundTrip,Internet,Mobile,Fri,Mon,Sat,Sun,Thu,Tue,Wed
0,1,21,12,6,AKLHGH,Australia,0,0,0,7.21,...,1,1,0,0,0,0,0,0,1,0
1,2,262,19,7,AKLDEL,New Zealand,1,0,0,5.52,...,1,1,0,0,0,1,0,0,0,0
2,1,112,20,3,AKLDEL,New Zealand,0,0,0,5.52,...,1,1,0,0,0,1,0,0,0,0
3,2,243,22,17,AKLDEL,India,1,1,0,5.52,...,1,1,0,0,0,0,0,0,0,1
4,1,96,31,4,AKLDEL,New Zealand,0,0,1,5.52,...,1,1,0,0,0,1,0,0,0,0


In [37]:
# One-hot encode route: build one indicator column per distinct route code, then
# replace the original text column with those indicators (appended on the right).
dfo = df['route']
dummy_route = pd.get_dummies(dfo)
df = pd.concat([df.drop(columns='route'), dummy_route], axis=1)
df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,...,TGGXIY,TPETRZ,TPETWU,TPEURT,TPEVTE,TRZWUH,TRZXIY,TWUWUH,TWUXIY,URTXIY
0,1,21,12,6,Australia,0,0,0,7.21,1,...,0,0,0,0,0,0,0,0,0,0
1,2,262,19,7,New Zealand,1,0,0,5.52,0,...,0,0,0,0,0,0,0,0,0,0
2,1,112,20,3,New Zealand,0,0,0,5.52,0,...,0,0,0,0,0,0,0,0,0,0
3,2,243,22,17,India,1,1,0,5.52,0,...,0,0,0,0,0,0,0,0,0,0
4,1,96,31,4,New Zealand,0,0,1,5.52,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# One-hot encode booking_origin: build one indicator column per country, then
# replace the original text column with those indicators (appended on the right).
dfo = df['booking_origin']
dummy_booking_origin = pd.get_dummies(dfo)
df = pd.concat([df.drop(columns='booking_origin'), dummy_booking_origin], axis=1)
df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,CircleTrip,...,Timor-Leste,Tonga,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Vanuatu,Vietnam
0,1,21,12,6,0,0,0,7.21,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,262,19,7,1,0,0,5.52,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,112,20,3,0,0,0,5.52,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,243,22,17,1,1,0,5.52,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,96,31,4,0,0,1,5.52,0,0,...,0,0,0,0,0,0,0,0,0,0


# Step2): Standardize the magnitude of numeric features using a scaler.

In [31]:
# Standard scaler with default settings (centers each column and scales it to unit variance).
scaler = preprocessing.StandardScaler()

In [39]:
# Fit the scaler on every column currently in df and get the standardized values
# back (by default a NumPy array, so the column labels are not carried over).
# NOTE(review): at this point df still contains length_of_stay (used as y further
# down), booking_complete and all the 0/1 dummy columns, so those are standardized
# too — confirm that is intended.
scaled_df = scaler.fit_transform(df)

In [44]:
# Wrap the scaled values in a DataFrame that keeps df's column labels and row
# index; without them the frame gets anonymous integer labels 0..N-1.
# np.asarray makes the wrap purely positional, so re-running this cell on an
# already-wrapped frame does not re-align (and blank out) the values; a shape
# mismatch with df raises instead of passing silently.
scaled_df = pd.DataFrame(np.asarray(scaled_df), columns=df.columns, index=df.index)

In [49]:
# First pass at the feature matrix: everything except purchase_lead and
# length_of_stay, with length_of_stay kept aside as y. A StandardScaler is fit on
# those features and applied to them.
# `preprocessing` is already imported in the import cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): length_of_stay is treated as the response and purchase_lead is
# excluded from the features — confirm both choices against the modeling goal.
no_response_var_data = df.drop(columns=['purchase_lead', 'length_of_stay'])
y = df['length_of_stay']
scaler = preprocessing.StandardScaler().fit(no_response_var_data)
scaled = scaler.transform(no_response_var_data)

In [50]:
# Remove purchase_lead from df for good. errors='ignore' keeps the cell
# re-runnable: a second run is a no-op instead of a KeyError.
df = df.drop(columns=['purchase_lead'], errors='ignore')

# Response vector and feature matrix (all remaining columns except the response).
y = df['length_of_stay']
no_response_var_data = df.drop(columns=['length_of_stay'])

# NOTE: despite its name, `scaled` is the fitted StandardScaler object (name kept
# so existing references keep working); X_scaled is the standardized feature array.
scaled = preprocessing.StandardScaler().fit(no_response_var_data)
X_scaled = scaled.transform(no_response_var_data)

In [51]:
# Display the feature frame (pandas truncates the rendering of large frames).
no_response_var_data

Unnamed: 0,num_passengers,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,CircleTrip,OneWay,RoundTrip,...,Timor-Leste,Tonga,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Vanuatu,Vietnam
0,1,6,0,0,0,7.21,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,7,1,0,0,5.52,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,3,0,0,0,5.52,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2,17,1,1,0,5.52,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,4,0,0,1,5.52,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49997,2,9,1,0,1,5.62,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
49998,1,4,0,0,0,5.62,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
49999,1,22,0,0,1,5.62,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
50000,1,11,1,0,1,5.62,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Column labels of the feature frame.
no_response_var_data.columns

Index(['num_passengers', 'flight_hour', 'wants_extra_baggage',
       'wants_preferred_seat', 'wants_in_flight_meals', 'flight_duration',
       'booking_complete', 'CircleTrip', 'OneWay', 'RoundTrip',
       ...
       'Timor-Leste', 'Tonga', 'Tunisia', 'Turkey', 'Ukraine',
       'United Arab Emirates', 'United Kingdom', 'United States', 'Vanuatu',
       'Vietnam'],
      dtype='object', length=922)

In [53]:
# Summary of the feature frame: index range, column count, dtypes and memory use.
no_response_var_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50002 entries, 0 to 50001
Columns: 922 entries, num_passengers to Vietnam
dtypes: float64(1), int64(6), uint8(915)
memory usage: 46.3 MB


# Step3): Split into testing and training datasets.


In [54]:
from sklearn.model_selection import train_test_split

# Flatten the response to a 1-D NumPy array. np.asarray works whether y is still
# a pandas Series or already an array, and avoids Series.ravel(), which is
# deprecated in recent pandas releases.
y = np.asarray(y).ravel()

# Fixed random_state so the split is reproducible across kernel restarts.
# The fourth result is named y_test (it was previously bound to the typo `y_tes`).
# NOTE(review): X_scaled was produced by a scaler fit on all rows before this
# split, so the test rows contributed to the scaling statistics — consider
# fitting the scaler on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=42)
print(no_response_var_data.columns)

Index(['num_passengers', 'flight_hour', 'wants_extra_baggage',
       'wants_preferred_seat', 'wants_in_flight_meals', 'flight_duration',
       'booking_complete', 'CircleTrip', 'OneWay', 'RoundTrip',
       ...
       'Timor-Leste', 'Tonga', 'Tunisia', 'Turkey', 'Ukraine',
       'United Arab Emirates', 'United Kingdom', 'United States', 'Vanuatu',
       'Vietnam'],
      dtype='object', length=922)


# Conclusion:
Does my data set have any categorical data, such as Gender or day of the week?
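*Yes. The raw data contain categorical variables (sales_channel, trip_type, flight_day, route, booking_origin). Step 1 replaced each of them with dummy/indicator columns, so no categorical columns remain in the cleaned dataset.*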


Do my features have data values that range from 0 - 100 or 0-1 or both and more?
*Both.*