In [1]:
# Import the modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.model_selection import train_test_split

### Split the Data into Training and Testing Sets.

### Step 1: Read the csv data from the Resources folder into a Pandas DataFrame.

In [3]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_data_df = pd.read_csv('../Resources/cleaned_df.csv')

# Review the DataFrame
lending_data_df.head()

Unnamed: 0,Duration,Cost of Travel(Entire Trip) in Euros,Mode of Travel,Stay,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel,year,month,day,Year,Month,Day
0,1,20111,Flight,Hotel,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13,2022,9,13,2022,9,13
1,5,29681,Car,Hotel,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04,2021,9,4,2021,9,4
2,7,24950,Flight,Airbnb,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25,2021,1,25,2021,1,25
3,19,18676,Roadtrip,Airbnb,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28,2022,1,28,2022,1,28
4,28,18899,Flight,Airbnb,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21,2021,10,21,2021,10,21


### Step 2: Create the labels set (y) from the “Stay” column, and then create the features (X) DataFrame from the remaining columns.

In [4]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = lending_data_df['Stay']

# Separate the X variable, the features
X = lending_data_df.drop("Stay", axis=1)

In [5]:
# Review the y variable Series
y[:5]

0     Hotel
1     Hotel
2    Airbnb
3    Airbnb
4    Airbnb
Name: Stay, dtype: object

In [6]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Duration,Cost of Travel(Entire Trip) in Euros,Mode of Travel,First Name,Last Name,Date of Birth,Address,Age,Sex,Nationality,Date of Travel,year,month,day,Year,Month,Day
0,1,20111,Flight,Cristina,Fuentes,1914-06-14,"5143 Pope Camp Apt. 028\nLake Emily, UT 71371",74,Non-Binary,Lao People's Democratic Republic,2022-09-13,2022,9,13,2022,9,13
1,5,29681,Car,Patricia,Young,1923-09-25,716 Dominguez Row Suite 775\nSouth Tiffanyboro...,19,Female,Gabon,2021-09-04,2021,9,4,2021,9,4
2,7,24950,Flight,Christina,Kirby,1935-01-02,"PSC 4600, Box 1237\nAPO AE 32510",36,Female,Syrian Arab Republic,2021-01-25,2021,1,25,2021,1,25
3,19,18676,Roadtrip,Michael,Hudson,1982-04-21,"7013 Bryant Club Suite 695\nSouth Tina, WI 26739",38,Non-Binary,Afghanistan,2022-01-28,2022,1,28,2022,1,28
4,28,18899,Flight,Francisco,Hensley,1913-06-29,"074 Scott Brook Suite 700\nLukeville, VA 46541",38,Non-Binary,Uruguay,2021-10-21,2021,10,21,2021,10,21


In [7]:
# Check the balance of target values
y.value_counts()

Stay
Other      787
Airbnb     786
Hotel      724
Resorts    703
Name: count, dtype: int64

### Step 3: Split the data into training and testing datasets by using train_test_split.

In [8]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

### Create a Logistic Regression Model with the Original Data.

### Step 1: Fit a logistic regression model by using the training data (X_train and y_train).

In [9]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'Roadtrip'

### Step 2: Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Predictions": predictions,"Acutal": y_test})

#### Step 3: Evaluate the model’s performance by doing the following:
- Generate a confusion matrix.

- Print the classification report.

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))