# Decision Trees (Regression)

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the seaborn "tips" CSV directly from its GitHub URL
tips_url = "https://github.com/mwaskom/seaborn-data/blob/master/tips.csv?raw=True"
df = pd.read_csv(tips_url)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Delete smoker variable.
# Rebind `df` to a new frame instead of mutating in place, and tolerate the
# column already being gone, so re-running this cell is a harmless no-op
# rather than a KeyError.
df = df.drop(columns='smoker', errors='ignore')

In [4]:
# One-hot encode the non-numeric columns (one 0/1 integer column per category)
categorical_columns = ['sex', 'day', 'time']
df = pd.get_dummies(df, columns=categorical_columns, dtype=int)

In [5]:
# Assign X (features: everything except the tip) and y (target: the tip)
X = df.drop('tip', axis=1)
y = df['tip']

# Split data into test/train set (70/30 split) and shuffle.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the split, and anything computed from it, changes per run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((170, 10), (74, 10), (170,), (74,))

In [6]:
# Assign algorithm.
# random_state is fixed because the tree permutes features at each split, so
# ties between equally good splits could otherwise be broken differently on
# each run and yield a different fitted tree from the same training data.
model = DecisionTreeRegressor(random_state=42)

# Link algorithm to X and y variables
model.fit(X_train, y_train)

In [7]:
# Predict on both splits, then report the mean absolute error (MAE) of each
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

print(f"Train MAE: {train_mae:.3f}")
print(f"Test MAE: {test_mae:.3f}")

Train MAE: 0.006
Test MAE: 1.173


In [8]:
# Data point to predict (values listed in the same order as X_train's columns)
test_point = [
    21,  # total_bill
    3,  # size
    0,  # sex_Female
    1,  # sex_Male
    0,  # day_Fri
    1,  # day_Sat
    0,  # day_Sun
    0,  # day_Thur
    1,  # time_Dinner
    0,  # time_Lunch
]

# Wrap the point in a one-row DataFrame carrying the training column names.
# The model was fitted on a DataFrame, so predicting on a bare list makes
# scikit-learn warn that X lacks valid feature names; building the frame also
# raises immediately if the number of values does not match the columns.
test_row = pd.DataFrame([test_point], columns=X_train.columns)

# Make prediction
model.predict(test_row)



array([3.35])