In [69]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import datetime

In [70]:
# Load the raw travel & tourism records; the path is resolved relative to
# the current working directory
csv = Path("travel_tourism_dataset.csv")
travel_df = pd.read_csv(filepath_or_buffer=csv)
# Preview the first five rows
travel_df.head(n=5)

Unnamed: 0,Trip #no.,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel
0,55,1,20111,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13
1,288,5,29681,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04
2,291,7,24950,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25
3,131,19,18676,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28
4,281,28,18899,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21


In [71]:
# Tally the non-null entries in every column; if nothing is missing, each
# column reports the same total (the number of rows)
travel_df.notna().sum()

Trip #no.                      3000
Duration                       3000
Cost of Travel(Entire Trip)    3000
Mode of Travel                 3000
Stay                           3000
First Name                     3000
Last Name                      3000
Date of Birth                  3000
Address                        3000
Age                            3000
Sex                            3000
Nationality                    3000
Date of Travel                 3000
dtype: int64

In [72]:
# Check datatypes: list the dtype pandas inferred for each column so that
# columns needing conversion (e.g. dates read in as `object`) can be spotted
travel_df.dtypes

Trip #no.                       int64
Duration                        int64
Cost of Travel(Entire Trip)     int64
Mode of Travel                 object
Stay                           object
First Name                     object
Last Name                      object
Date of Birth                  object
Address                        object
Age                             int64
Sex                            object
Nationality                    object
Date of Travel                 object
dtype: object

In [73]:
# Parse both date columns (read in as strings) into proper datetime values
for date_col in ("Date of Travel", "Date of Birth"):
    travel_df[date_col] = pd.to_datetime(travel_df[date_col])
# Confirm the two columns now carry a datetime dtype
travel_df.dtypes

Trip #no.                               int64
Duration                                int64
Cost of Travel(Entire Trip)             int64
Mode of Travel                         object
Stay                                   object
First Name                             object
Last Name                              object
Date of Birth                  datetime64[ns]
Address                                object
Age                                     int64
Sex                                    object
Nationality                            object
Date of Travel                 datetime64[ns]
dtype: object

In [74]:
# Remove the "Trip #no." column from the frame
travel_df = travel_df.drop(columns=["Trip #no."])
travel_df.head(n=5)

Unnamed: 0,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel
0,1,20111,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13
1,5,29681,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04
2,7,24950,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25
3,19,18676,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28
4,28,18899,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21


In [75]:
# Derive a per-day cost: the whole-trip cost divided element-wise by Duration
travel_df["Cost per Day"] = travel_df["Cost of Travel(Entire Trip)"].div(travel_df["Duration"])
travel_df.head(n=5)

Unnamed: 0,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel,Cost per Day
0,1,20111,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13,20111.0
1,5,29681,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04,5936.2
2,7,24950,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25,3564.285714
3,19,18676,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28,982.947368
4,28,18899,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21,674.964286


In [76]:
# Store the whole-trip cost as a float, matching the float "Cost per Day" column
travel_df["Cost of Travel(Entire Trip)"] = travel_df["Cost of Travel(Entire Trip)"].astype("float64")
# Render every float with two decimal places (a display-only, notebook-wide setting)
pd.set_option("display.float_format", "{:.2f}".format)
travel_df.head(n=5)

Unnamed: 0,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel,Cost per Day
0,1,20111.0,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13,20111.0
1,5,29681.0,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04,5936.2
2,7,24950.0,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25,3564.29
3,19,18676.0,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28,982.95
4,28,18899.0,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21,674.96


In [80]:
# Extract ZIP Codes from Address
# Anchor on the 5-digit ZIP at the end of the address instead of blindly
# slicing the last five characters: this tolerates trailing whitespace and an
# optional ZIP+4 suffix, and yields NaN (rather than an arbitrary text
# fragment) when the address does not end in a ZIP code.
# The result stays a string so leading zeros are preserved.
travel_df["ZIP"] = travel_df["Address"].str.extract(r"\b(\d{5})(?:-\d{4})?\s*$", expand=False)
travel_df.head()

Unnamed: 0,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel,Cost per Day,ZIP
0,1,20111.0,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13,20111.0,71371
1,5,29681.0,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04,5936.2,27800
2,7,24950.0,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25,3564.29,32510
3,19,18676.0,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28,982.95,26739
4,28,18899.0,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21,674.96,46541


In [81]:
# Persist the cleaned frame to disk; the row index is left out of the file
travel_df.to_csv(path_or_buf="cleaned_data.csv", index=False)