### LINEAR REGRESSION

In [6]:
# write a function that tells whether someone is an adult, a teenager or a child

def age_group(age: int) -> str:
    """
    Classify a person as a child, teenager or adult based on their age.

    Parameters
    ----------
    age : int
        The person's age in years.

    Returns
    -------
    str
        'child' for ages below 13, 'teenager' for ages 13 through 19,
        and 'adult' for ages 20 and above.
    """
    # Return the label instead of printing it, so the result matches the
    # declared `-> str` contract and can be reused by calling code.
    if age < 13:
        return 'child'
    if age < 20:
        return 'teenager'
    return 'adult'


In [23]:
# libraries for eda 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# ml libraries
from sklearn.linear_model import LinearRegression # fit the model
from sklearn.preprocessing import LabelEncoder,StandardScaler # data preprocessing
from sklearn.metrics import root_mean_squared_error # model evaluation.
from sklearn.model_selection import train_test_split


In [38]:
# Load the tips sample dataset that ships with plotly express.
# Columns (see the output below): total_bill, tip, sex, smoker, day, time, size.

tips_df = px.data.tips()
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


#### DATA WRANGLING

In [39]:
# Label-encode every text column and keep each fitted encoder (keyed by
# column name) so the integer codes can be decoded or re-applied later.
# NOTE: this overwrites tips_df in place; re-running the cell on the already
# encoded frame finds no object columns and resets label_encoders to {}.
categorical_columns = tips_df.select_dtypes(include="object").columns

label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder().fit(tips_df[column])  # learn the label -> integer mapping
    tips_df[column] = encoder.transform(tips_df[column])
    label_encoders[column] = encoder


tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [40]:
# sanity check: decode the integer code 2 of the `day` column back to its original label
label_encoders['day'].inverse_transform([2])

array(['Sun'], dtype=object)

In [41]:
# separate the target (tip) from the remaining feature columns
y = tips_df['tip']
X = tips_df.drop(columns=['tip'])

# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

# confirm the row/column counts of each piece
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(195, 6)
(49, 6)
(195,)
(49,)


In [42]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
column_names = X_train.columns
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into frames with the original column names.
# NOTE: building the frames without an index resets it to 0..n-1, so
# X_train/X_test no longer share an index with y_train/y_test.
X_train = pd.DataFrame(data=scaler.transform(X_train), columns=column_names)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=column_names)

X_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,-0.168789,-1.43069,-0.825248,1.399223,-0.603023,-0.595055
1,0.326705,0.698963,1.211757,0.316491,-0.603023,-0.595055
2,-0.227595,0.698963,-0.825248,-0.766241,-0.603023,-0.595055
3,-0.579342,0.698963,1.211757,-0.766241,-0.603023,-0.595055
4,-0.262443,0.698963,1.211757,-0.766241,-0.603023,-0.595055


### Model Training

In [44]:
# create the linear regression model and fit it on the scaled training features
model = LinearRegression().fit(X_train, y_train)

# predictions for both splits, used below to compare train and test error
test_preds = model.predict(X_test)
train_preds = model.predict(X_train)

### Model Evaluation

In [45]:
# peek at the first five predicted tips for the training split
train_preds[:5]

array([2.70284574, 3.09485889, 2.76195015, 2.39936559, 2.66417991])

In [46]:
# the first five actual tips of the training split, for comparison with the predictions
y_train.values[:5]

array([3.  , 5.65, 3.76, 2.09, 3.08])

In [48]:
# root mean squared error on each split: training first, then test
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)

print(train_rmse, test_rmse, sep="\n")

1.032955450797039
0.9254946879968747


In [62]:
def make_prediction(**kwargs):
    """
    Predict the tip for a single bill using the fitted preprocessing and model.

    Pass exactly one keyword argument per feature the scaler was fitted on
    (the columns of the training features: total_bill, sex, smoker, day,
    time, size). Keywords may be given in any order. Categorical features
    take their original string labels (e.g. ``sex='Male'``); they are encoded
    with the fitted ``label_encoders``, then every feature is scaled with the
    fitted ``scaler`` before ``model.predict`` is called.

    Relies on the notebook-level objects ``categorical_columns``,
    ``label_encoders``, ``scaler`` and ``model`` created in earlier cells.

    Returns
    -------
    float
        The predicted tip.

    Raises
    ------
    ValueError
        If a required feature is missing, an unknown feature is supplied,
        or a categorical value was not seen when its encoder was fitted.
    """
    # The scaler remembers the column names (and their order) it was fitted on.
    expected = list(scaler.feature_names_in_)

    missing = [name for name in expected if name not in kwargs]
    unexpected = [name for name in kwargs if name not in expected]
    if missing or unexpected:
        raise ValueError(
            f"make_prediction expects exactly the features {expected}; "
            f"missing: {missing}, unexpected: {unexpected}"
        )

    # Build the single-row frame in the fitted column order, so the result
    # does not depend on the order in which the keywords were passed.
    data = pd.DataFrame({name: [kwargs[name]] for name in expected})

    # encode the categorical variables with the encoders fitted on the training data
    for column in categorical_columns:
        data[column] = label_encoders[column].transform(data[column])

    # scale the features, keeping the column names for the model
    scaled = pd.DataFrame(data=scaler.transform(data), columns=expected)

    # make prediction and unwrap the single value
    prediction = model.predict(scaled)
    return prediction.item()


# NOTE(review): total_bill=250 looks far larger than any bill shown in the dataset
# preview, so the linear model is presumably extrapolating here — treat the
# predicted tip with caution.
make_prediction(total_bill = 250,
                sex='Male',smoker='No',day='Fri',time='Dinner',
                size=5)


24.338456150860903