In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides library warnings (e.g. deprecations) that may
# matter later in the notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the "tips" example dataset through seaborn and display the resulting frame.
df = sns.load_dataset("tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# List the distinct values of the `time` column before any encoding.
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [4]:
# Encode the `time` column as integer class labels.
# The column is overwritten in `df`, so the original strings are replaced.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['time'])
df['time'] = encoder.transform(df['time'])

In [5]:
# Display the frame again: `time` now holds the integer codes written by the encoder.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [6]:
# Goal: predict `time` from the remaining columns.
df.time.unique()   # Dinner --> 0 and Lunch --> 1

array([0, 1])

In [7]:
# Separate the target (`time`) from the feature columns.
y = df['time']
X = df.drop(columns='time')

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [9]:
# Count the missing values in each column.
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [10]:
# Preprocessing building blocks used below:
#   - handling missing values
#   - data encoding
#   - feature scaling

from sklearn.impute import SimpleImputer    # for missing values
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Pipeline --> a sequence of data transformations
# ColumnTransformer --> applies each pipeline to its own group of columns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [11]:
# Separate the feature columns into categorical and numerical groups.

cat_cols = ['sex','smoker','day']
num_cols = ['total_bill','tip','size']

In [12]:
# Feature-engineering automation using Pipeline and ColumnTransformer.

# Numerical columns: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputation", SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes a category that was absent during fit encode as
# all zeros instead of raising an error at transform time (e.g. a rare category
# that only landed in the test split).
cat_pipeline = Pipeline(steps=[
    ("imputation", SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [13]:
# Route each column group through its matching pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

In [14]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split.
# NOTE(review): X_train / X_test are rebound to the transformed arrays, so running
# this cell a second time feeds already-transformed data back in — likely an error,
# since the column names the transformer selects by are gone. Re-run from the
# train/test split cell instead.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_train

array([[-0.28611937, -1.47443803, -0.57766863, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.02695905, -0.71612531,  1.47042924, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.3716196 ,  1.19880579,  1.47042924, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.23206267,  0.43283335, -0.57766863, ...,  0.        ,
         0.        ,  1.        ],
       [-1.06543688, -1.29060464, -0.57766863, ...,  1.        ,
         0.        ,  0.        ],
       [-0.29287646,  0.1034652 ,  0.44638031, ...,  1.        ,
         0.        ,  0.        ]])

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Candidate classifiers, keyed by the short label used in the results dictionary.
# The decision tree gets a fixed random_state (same seed as the train/test split)
# so repeated runs build the same tree and report the same score.
models = {
    "SVC": SVC(),
    "DTC": DecisionTreeClassifier(random_state=1),
    "Log_reg": LogisticRegression(),
}

In [19]:
from sklearn.metrics import accuracy_score

def model_train_eval(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and labels; predictions on ``X_test`` are compared
        with ``y_test`` using ``accuracy_score``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        ``{name: accuracy}``, in the iteration order of ``models``.

    Notes
    -----
    The estimator objects stored in ``models`` are fitted in place.
    """
    evaluation = {}
    # Walk the name/estimator pairs directly instead of rebuilding the key and
    # value lists on every iteration.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluation[name] = accuracy_score(y_test, y_pred)
    return evaluation

In [20]:
# Train and score every candidate model on the prepared splits.
model_train_eval(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

{'SVC': 0.9183673469387755,
 'DTC': 0.8979591836734694,
 'Log_reg': 0.9183673469387755}