In [3]:
# model.py

import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pickle

# Load the dataset (the "tips" example data fetched through seaborn).
data = sns.load_dataset('tips')

# Manual integer codes for each categorical feature.  The dictionaries are
# kept as module-level names so the same encoding can be reused when inputs
# are prepared for the saved model.
sex_mapping = {'Male': 0, 'Female': 1}
smoker_mapping = {'No': 0, 'Yes': 1}
day_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3}
time_mapping = {'Lunch': 0, 'Dinner': 1}


def _encode_column(values, mapping):
    """Translate category labels into integer codes.

    ``Series.map`` leaves NaN for every label that has no entry in
    ``mapping``, and when the input is a pandas categorical it can hand back
    another categorical instead of a numeric column.  Both cases are made
    explicit here so that bad inputs fail loudly instead of reaching the model.

    Args:
        values: pandas Series holding the raw labels.
        mapping: dict from label to integer code.

    Returns:
        A Series of dtype int64 with the same index as ``values``.

    Raises:
        ValueError: if any entry of ``values`` has no code in ``mapping``
            (this includes entries that were already missing).
    """
    encoded = values.map(mapping)
    missing = encoded.isna()
    if missing.any():
        unmapped = sorted(set(values[missing].astype(str)))
        raise ValueError(f"Values without an encoding: {unmapped}")
    return encoded.astype('int64')


data['sex'] = _encode_column(data['sex'], sex_mapping)
data['smoker'] = _encode_column(data['smoker'], smoker_mapping)
data['day'] = _encode_column(data['day'], day_mapping)
data['time'] = _encode_column(data['time'], time_mapping)

# Features and target variable: predict 'tip' from every other column.
X = data.drop(columns=['tip'])
y = data['tip']

# Split into training and testing data (80/20, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation on the held-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Save the model.
# NOTE: pickle files execute code on load; only unpickle 'model.pkl' from a
# trusted location.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


Mean Absolute Error: 0.7715816326530615


In [4]:
# Inspect the feature matrix built above: every column of `data` except 'tip'.
X

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,1,0,3,1,2
1,10.34,0,0,3,1,3
2,21.01,0,0,3,1,3
3,23.68,0,0,3,1,2
4,24.59,1,0,3,1,4
...,...,...,...,...,...,...
239,29.03,0,0,2,1,3
240,27.18,1,1,2,1,2
241,22.67,0,1,2,1,2
242,17.82,0,0,2,1,2
