# Dependencies

In [16]:
# Import the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.model_selection import train_test_split


### Exploring and cleaning the dataset

In [17]:
# Location of the raw travel & tourism dataset inside the Resources folder
raw_data_path = './Resources/travel_tourism_dataset.csv'

# Load the CSV into a Pandas DataFrame
travel_df = pd.read_csv(raw_data_path)

# Preview the first rows to confirm the load worked
travel_df.head()


Unnamed: 0,Trip #no.,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel
0,55,1,20111,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13
1,288,5,29681,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04
2,291,7,24950,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25
3,131,19,18676,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28
4,281,28,18899,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21


In [18]:
# Check the dimensions of the raw DataFrame as (rows, columns)
travel_df.shape

(3000, 13)

In [19]:
# Break 'Date of Travel' (formatted YYYY-MM-DD) into its three string components
date_parts = travel_df['Date of Travel'].str.split('-', expand=True)
travel_df[['year', 'month', 'day']] = date_parts

# Store an integer copy of each component under a capitalised column name
for part in ('year', 'month', 'day'):
    travel_df[part.capitalize()] = travel_df[part].astype(int)

In [20]:
# Drop the 'Trip #no.' column and the lowercase string date parts
# ('year', 'month', 'day'); the integer 'Year', 'Month' and 'Day' columns are kept
columns_to_drop = ['Trip #no.', 'year', 'month', 'day']
cleaned_df = travel_df.drop(columns=columns_to_drop)

In [21]:
# Append the currency to the trip-cost column name
cost_column_renames = {
    'Cost of Travel(Entire Trip)': 'Cost of Travel(Entire Trip) in Euros',
}
cleaned_df = cleaned_df.rename(columns=cost_column_renames)

In [22]:
# Inspect the dtype of every column in the cleaned DataFrame
column_dtypes = cleaned_df.dtypes
print(column_dtypes)

Duration                                 int64
Cost of Travel(Entire Trip) in Euros     int64
Mode of Travel                          object
Stay                                    object
First Name                              object
Last Name                               object
Date of Birth                           object
Address                                 object
Age                                      int64
Sex                                     object
Nationality                             object
Date of Travel                          object
Year                                     int32
Month                                    int32
Day                                      int32
dtype: object


In [23]:
# Add column for cost per day.
# A trip with Duration == 0 would make the division silently return inf
# (or NaN for a zero cost); mask zero durations so those rows get NaN instead.
trip_days = cleaned_df["Duration"].where(cleaned_df["Duration"] != 0)
cleaned_df["Cost per Day"] = cleaned_df["Cost of Travel(Entire Trip) in Euros"] / trip_days

# Store the total trip cost as float so both cost columns share a dtype
cleaned_df["Cost of Travel(Entire Trip) in Euros"] = cleaned_df["Cost of Travel(Entire Trip) in Euros"].astype(float)

# Show floats with two decimals; this is a global pandas display option,
# so it affects every DataFrame rendered after this cell
pd.options.display.float_format = '{:.2f}'.format

In [24]:
# Extract ZIP Codes from Address.
# Match a 5-digit group anchored at the end of the address rather than blindly
# slicing the last five characters, so trailing whitespace or a ZIP+4 suffix
# does not produce a wrong code; addresses without a trailing ZIP become NaN.
cleaned_df["ZIP"] = cleaned_df["Address"].str.extract(r"(\d{5})(?:-\d{4})?\s*$", expand=False)

In [25]:
# TODO: add further cleaning steps here



In [26]:
# Preview the first five rows of the cleaned DataFrame
cleaned_df.head()

Unnamed: 0,Duration,Cost of Travel(Entire Trip) in Euros,Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel,Year,Month,Day,Cost per Day,ZIP
0,1,20111.0,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13,2022,9,13,20111.0,71371
1,5,29681.0,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04,2021,9,4,5936.2,27800
2,7,24950.0,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25,2021,1,25,3564.29,32510
3,19,18676.0,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28,2022,1,28,982.95,26739
4,28,18899.0,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21,2021,10,21,674.96,46541


### Export the cleaned dataset to a new CSV file

In [27]:
# Write the cleaned data to the Resources folder, leaving out the row index
cleaned_csv_path = './Resources/cleaned_df.csv'
cleaned_df.to_csv(cleaned_csv_path, index=False)

### Visualizations

### Model