In [58]:
# Import the modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import hvplot.pandas
from sklearn.linear_model import LinearRegression


In [59]:
# Load the travel and tourism dataset from the Resources folder into a Pandas DataFrame.
# The path is built with pathlib so the separator is handled by the platform.
csv_path = Path("Resources") / "travel_tourism_dataset.csv"
Greece_data_df = pd.read_csv(csv_path)


In [60]:
# Preview the first five rows of the DataFrame to check that the file loaded as expected
Greece_data_df.head(n=5)

Unnamed: 0,Trip #no.,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel
0,55,1,20111,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13
1,288,5,29681,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04
2,291,7,24950,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25
3,131,19,18676,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28
4,281,28,18899,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21


In [61]:
# Scatter plot of trip cost (y axis) against traveller age (x axis).
# The plot options are collected in one place and unpacked into the hvplot call.
scatter_options = {
    "x": "Age",
    "y": "Cost of Travel(Entire Trip)",
    "title": "Cost of Travel based on Age",
}
CostVersusAge_plot = Greece_data_df.hvplot.scatter(**scatter_options)
CostVersusAge_plot

In [62]:
# Build the independent variable X as a single-column 2-D array,
# which is the (n_samples, n_features) layout passed to the model below.
age_values = Greece_data_df["Age"].to_numpy()
X = age_values[:, np.newaxis]

# Show the first five rows as a sanity check
X[:5]

array([[74],
       [19],
       [36],
       [38],
       [38]], dtype=int64)

In [63]:
# Confirm the dimensions of X: one row per record and a single feature column
np.shape(X)

(3000, 1)

In [64]:
# Select the dependent variable y: the total trip cost column, kept as a pandas Series
target_column = "Cost of Travel(Entire Trip)"
y = Greece_data_df[target_column]

In [65]:
# Create the dependent variable y (a pandas Series holding the trip cost column).
# NOTE(review): this cell repeats the preceding cell's assignment verbatim; running it
# only rebinds y to the same column, so the cell is redundant and can be deleted.
y = Greece_data_df["Cost of Travel(Entire Trip)"]

In [66]:
# Build the linear regression model and use it to predict the cost of travel for an individual at age 25

In [67]:
# Create a scikit-learn LinearRegression estimator.
# No arguments are passed, so the estimator uses its default parameters.
model = LinearRegression()

In [68]:
# Fit the model on the full dataset: X is the single-column Age array and
# y is the "Cost of Travel(Entire Trip)" column. No train/test split is made,
# so every later prediction and metric in this notebook is in-sample.
model.fit(X, y)

In [69]:
# Show the fitted slope; coef_ is an array holding one coefficient per feature (one here)
fitted_coefficients = model.coef_
print(f"Model's slope: {fitted_coefficients}")

Model's slope: [-13.64494824]


In [70]:
# Show the fitted y-intercept, i.e. the predicted cost where the line crosses Age = 0
y_intercept = model.intercept_
print(f"Model's y-intercept: {y_intercept}")

Model's y-intercept: 18373.927203522006


In [71]:
# Print the best fit line in the form y = intercept + slope * X
intercept, slope = model.intercept_, model.coef_[0]
print(f"Model's formula: y = {intercept} + {slope}X")

Model's formula: y = 18373.927203522006 + -13.644948244833229X


In [72]:
# Age (in years) of the traveller whose trip cost is predicted.
# Defined once so the formula, the calculation and the message cannot drift apart.
age_to_predict = 25

# Display the formula used to predict the Cost of Travel (Entire Trip) for a person at that age
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {age_to_predict}")

# Predict the cost of the entire trip from the fitted line: y = intercept + slope * age
predicted_cost = model.intercept_ + model.coef_[0] * age_to_predict

# Backward-compatible alias: the result was previously stored under the name y_7
# (a leftover from a salary/experience example). Kept so any later cell that still
# reads y_7 keeps working.
y_7 = predicted_cost

# Display the prediction
print(f"Predicted Cost of entire trip for someone age {age_to_predict}: ${predicted_cost:.2f}")

Model's formula: y = 18373.927203522006 + -13.644948244833229 * 25
Predicted Cost of entire trip for someone age 25: $18032.80


In [73]:
# Make predictions using the X set.
# X is the same array the model was fitted on, so these are in-sample predictions
# (one predicted trip cost per row of the original data).
predicted_y_values = model.predict(X)

In [74]:
# Build a copy of the original data in which the "Cost of Travel(Entire Trip)" column
# holds the model's predicted costs instead of the observed ones. The original
# DataFrame is left untouched; assign() returns a new frame.
df_CostofTrip = Greece_data_df.assign(
    **{"Cost of Travel(Entire Trip)": predicted_y_values}
)

# Preview the first rows of the frame with predicted costs
df_CostofTrip.head()

Unnamed: 0,Trip #no.,Duration,Cost of Travel(Entire Trip),Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel
0,55,1,17364.201033,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13
1,288,5,18114.673187,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04
2,291,7,17882.709067,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25
3,131,19,17855.41917,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28
4,281,28,17855.41917,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21


In [75]:
# Create a line plot of the predicted Cost of Travel (y axis) versus Age (x axis).
# The rows are sorted by Age first: a line plot connects points in row order, and the
# raw data is not ordered by Age, so without sorting the line would be retraced back
# and forth across the same segment once per row.
best_fit_line = df_CostofTrip.sort_values("Age").hvplot.line(
    x="Age",
    y="Cost of Travel(Entire Trip)",
    color="red"
)
best_fit_line

In [76]:
# Overlay the best fit line on top of the scatter plot of the original data
cost_with_best_fit = CostVersusAge_plot * best_fit_line
cost_with_best_fit

In [77]:
# Show again the slope of the fitted model, i.e. of the best fit line drawn over CostVersusAge_plot
best_fit_slope = model.coef_
print(f"Model's slope: {best_fit_slope}")


Model's slope: [-13.64494824]


In [78]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [79]:
# Evaluate the fitted line on the same data it was trained on (in-sample metrics).
score = model.score(X, y)  # sample_weight is left at its default of None
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)  # same units as the trip cost
std = np.std(y)  # spread of the observed trip costs, for comparison with rmse

# Report each metric on its own line, in the order computed above.
for label, value in (
    ("score", score),
    ("r2", r2),
    ("mean squared error", mse),
    ("root mean squared error", rmse),
    ("standard deviation", std),
):
    print(f"The {label} is {value}.")

The score is 0.0009647973080727512.
The r2 is 0.0009647973080727512.
The mean squared error is 53003961.53471895.
The root mean squared error is 7280.381963518051.
The standard deviation is 7283.8965533327855.
