In [None]:
# Import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Fetch seaborn's bundled "tips" example dataset and display it
df = sns.load_dataset(name="tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
total_bill,0
tip,0
sex,0
smoker,0
day,0
time,0
size,0


In [None]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep="first").sum()

np.int64(1)

In [None]:
# Drop duplicate rows and renumber the index 0..n-1.
# Why the reset: later cells rebuild the feature frames from NumPy arrays, which get a
# fresh 0..n-1 index, while `y` keeps df's index. Without the reset the dropped row
# leaves a gap, so feature rows and target rows would carry different labels.
# Reassigning (instead of inplace=True) also keeps this cell safe to re-run.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Sanity check: expect False now that repeated rows have been removed
df.duplicated(keep="first").any()

np.False_

In [None]:
# The tip amount is the target; every other column is a predictor
y = df["tip"]
X = df.drop(columns=["tip"])

In [None]:
# Names of the feature columns stored as object or pandas-categorical dtype
category = X.select_dtypes(include=["object", "category"]).columns
category

Index(['sex', 'smoker', 'day', 'time'], dtype='object')

In [None]:
# Names of the remaining feature columns (anything that is not object/category dtype)
num = X.select_dtypes(exclude=["object", "category"]).columns
num

Index(['total_bill', 'size'], dtype='object')

In [None]:
# Encoder used below to expand each categorical column into one indicator column per level
# (default settings; its output is converted with .toarray() in later cells)
encoder = OneHotEncoder()

In [None]:
# Fit the encoder on the categorical columns and encode them in the same call
X_encoded = encoder.fit_transform(X.loc[:, category])

In [None]:
# Generated names of the one-hot output columns, derived from the input column names
encoded_cols = encoder.get_feature_names_out(input_features=category)
encoded_cols

array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)

In [None]:
# Show the dense (NumPy array) form of the one-hot matrix for inspection
X_encoded.toarray()

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [None]:
# Wrap the dense one-hot matrix in a DataFrame labelled with the generated feature names
X_encoded_df = pd.DataFrame(
    data=X_encoded.toarray(),
    columns=encoded_cols,
)
X_encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# Scaler used below to standardise the numeric feature columns (default settings)
scaler = StandardScaler()

In [None]:
# Fit the scaler on the numeric columns of X and scale them in the same call.
# NOTE(review): this runs before the train/test split, so rows that later land in the
# test set also influence the fitted scaling statistics — consider fitting on the
# training rows only.
X_num_scaled = scaler.fit_transform(X.loc[:, num])

In [None]:
# Put the scaled values back into a DataFrame under the original numeric column names
X_num_scaled_df = pd.DataFrame(data=X_num_scaled, columns=num)

In [None]:
# Inspect the scaled numeric features
X_num_scaled_df

Unnamed: 0,total_bill,size
0,-0.314711,-0.600193
1,-1.063235,0.453383
2,0.137780,0.453383
3,0.438315,-0.600193
4,0.540745,1.506958
...,...,...
239,1.040511,0.453383
240,0.832275,-0.600193
241,0.324630,-0.600193
242,-0.221287,-0.600193


In [None]:
# Build the final feature matrix: one-hot columns first, scaled numeric columns after,
# joined side by side on the row index
X_df = pd.concat(
    objs=[X_encoded_df, X_num_scaled_df],
    axis="columns",
)
X_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,total_bill,size
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-0.314711,-0.600193
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.063235,0.453383
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.137780,0.453383
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.438315,-0.600193
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.540745,1.506958
...,...,...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.040511,0.453383
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.832275,-0.600193
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.324630,-0.600193
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.221287,-0.600193


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# k-nearest-neighbours regressor configured to use 3 neighbours
model = KNeighborsRegressor(n_neighbors=3)

In [None]:
# Fit the regressor on the training features and their tip amounts
model.fit(X=X_train, y=y_train)

In [None]:
# Predict tips for the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# Show the predicted tips for the held-out test rows
y_pred

array([3.35666667, 1.79333333, 4.05666667, 3.05666667, 2.17      ,
       3.71666667, 4.        , 1.81      , 2.01      , 2.6       ,
       3.01666667, 2.20333333, 1.83333333, 3.55      , 1.83333333,
       3.19333333, 3.66666667, 3.58333333, 2.47666667, 5.29      ,
       3.02333333, 3.13      , 1.94333333, 2.20333333, 3.5       ,
       2.07666667, 1.79333333, 2.87333333, 2.83333333, 7.47666667,
       4.18333333, 1.74666667, 2.71333333, 3.05666667, 2.33666667,
       3.78333333, 2.23333333, 4.13333333, 1.82666667, 3.46      ,
       2.20333333, 1.99333333, 3.51666667, 2.40333333, 2.00666667,
       1.74333333, 1.77666667, 2.59666667, 1.73666667])

In [None]:
# Score the fitted model on the held-out rows
# (the value shown matches the r2_score computed from y_pred further down)
model.score(X=X_test, y=y_test)

0.19945919959781178

In [None]:
# Mean squared error between the actual and predicted test tips
mean_squared_error(y_true=y_test, y_pred=y_pred)

1.0006530612244897

In [None]:
# R² between the actual and predicted test tips
r2_score(y_true=y_test, y_pred=y_pred)

0.19945919959781178

In [None]:
# A single hypothetical diner, given as one record with the raw (untransformed) feature values
new_data = pd.DataFrame(
    [
        {
            "sex": "Female",
            "smoker": "No",
            "day": "Sun",
            "time": "Dinner",
            "total_bill": 20,
            "size": 3,
        }
    ]
)
new_data

Unnamed: 0,sex,smoker,day,time,total_bill,size
0,Female,No,Sun,Dinner,20,3


In [None]:
# Reuse the already-fitted encoder (transform only, no refit) on the sample's categorical columns
new_encoded = encoder.transform(new_data.loc[:, category])

In [None]:
# Label the sample's dense one-hot row with the same feature names as the training frame
new_encodeddf = pd.DataFrame(
    data=new_encoded.toarray(),
    columns=encoded_cols,
)
new_encodeddf

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# Reuse the already-fitted scaler (transform only, no refit) on the sample's numeric columns
new_scaled = scaler.transform(new_data.loc[:, num])

In [None]:
# Label the scaled values with the numeric column names so they can be joined to the
# encoded columns, then display the labelled frame that was just built. The cell used to
# display the unlabelled `new_scaled` array instead, unlike the matching encoded-frame cell.
new_scaleddf = pd.DataFrame(new_scaled, columns=num)
new_scaleddf

array([[0.0240943 , 0.45338292]])

In [None]:
# Assemble the sample's feature row in the training column order: one-hot columns, then scaled numerics
new_df = pd.concat(
    objs=[new_encodeddf, new_scaleddf],
    axis="columns",
)
new_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,total_bill,size
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.024094,0.453383


In [None]:
# Predicted tip for the prepared sample row
model.predict(X=new_df)

array([3.16666667])