In [1]:
import pandas as ps

In [2]:
df = ps.read_csv("tips.csv")

# **1. Read the tips dataset from the provided source only. Handle the missing values with the appropriate techniques.**

In [3]:
df.isnull().sum()

Unnamed: 0     0
total_bill    39
tip            0
sex            0
smoker         0
day            0
time           0
size           0
dtype: int64

In [4]:
median1 = df.total_bill.median()
median1

19.44

In [5]:
df['total_bill'] = df.total_bill.fillna(median1)

In [6]:
df.isnull().sum()

Unnamed: 0    0
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

# **2. Handle the categorical data in the tips dataset with the relevant approaches such as label-encoding, one hot encoding, ordinal encoding.**

In [14]:
#Label Encoder
from sklearn.preprocessing import LabelEncoder

label1 = LabelEncoder()
df['sex'] = label1.fit_transform(df['sex'])
df['smoker'] = label1.fit_transform(df['smoker'])
df['day'] = label1.fit_transform(df['day'])
df['time'] = label1.fit_transform(df['time'])
df.head()

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0,16.99,1.01,0,0,2,0,2
1,1,19.44,1.66,1,0,2,0,3
2,2,21.01,3.5,1,0,2,0,3
3,3,23.68,3.31,1,0,2,0,2
4,4,24.59,3.61,0,0,2,0,4


In [16]:
#One Hot Encoder
from sklearn.preprocessing import OneHotEncoder
da = ps.read_csv("tips.csv")

In [15]:
categorical_columns = ['sex', 'smoker', 'day', 'time']

# Create one-hot encoder object
encoder = OneHotEncoder()

# Fit encoder to the selected categorical columns
encoder.fit(da[categorical_columns])

# Transform the selected categorical columns
onehot_encoded = encoder.transform(da[categorical_columns])

# Convert sparse matrix to array and convert to DataFrame
onehot_encoded_df = ps.DataFrame(onehot_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded DataFrame with the non-categorical columns
final_df = ps.concat([da.drop(columns=categorical_columns), onehot_encoded_df], axis=1)
final_df

Unnamed: 0.1,Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,0,16.99,1.01,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,,1.66,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2,21.01,3.50,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,3,23.68,3.31,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,4,24.59,3.61,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,239,29.03,5.92,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,240,27.18,2.00,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,241,22.67,2.00,2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,242,17.82,1.75,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [17]:
#Ordinal Encoder
from sklearn.preprocessing import OrdinalEncoder
ds = ps.read_csv("tips.csv")
enc = OrdinalEncoder()

In [18]:
ds['sex'] = enc.fit_transform(ds[['sex']])
ds['smoker'] = enc.fit_transform(ds[['smoker']])
ds['day'] = enc.fit_transform(ds[['day']])
ds['time'] = enc.fit_transform(ds[['time']])
ds['size'] = enc.fit_transform(ds[['size']])

In [19]:
ds.head()

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0,16.99,1.01,0.0,0.0,2.0,0.0,1.0
1,1,,1.66,1.0,0.0,2.0,0.0,2.0
2,2,21.01,3.5,1.0,0.0,2.0,0.0,2.0
3,3,23.68,3.31,1.0,0.0,2.0,0.0,1.0
4,4,24.59,3.61,0.0,0.0,2.0,0.0,3.0


# **3. Perform feature scaling techniques such as min-max normalization, standardization, z-score, on the tips dataset.**

In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the tips dataset
tips_df = pd.read_csv('tips.csv')

# Display the first few rows of the dataset
print("Original Tips Dataset:")
tips_df.head()


Original Tips Dataset:


Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0,16.99,1.01,Female,No,Sun,Dinner,2
1,1,,1.66,Male,No,Sun,Dinner,3
2,2,21.01,3.5,Male,No,Sun,Dinner,3
3,3,23.68,3.31,Male,No,Sun,Dinner,2
4,4,24.59,3.61,Female,No,Sun,Dinner,4


In [27]:
# Perform Min-Max normalization
min_max_scaler = MinMaxScaler()
min_max_scaled = min_max_scaler.fit_transform(tips_df[['total_bill', 'tip']])
min_max_df = pd.DataFrame(min_max_scaled, columns=['total_bill_minmax', 'tip_minmax'])

# Perform Standardization (Z-score normalization)
standard_scaler = StandardScaler()
standard_scaled = standard_scaler.fit_transform(tips_df[['total_bill', 'tip']])
standard_df = pd.DataFrame(standard_scaled, columns=['total_bill_standard', 'tip_standard'])

# Concatenate the scaled features with the original DataFrame
scaled_tips_df = pd.concat([tips_df, min_max_df, standard_df], axis=1)

# Display the scaled DataFrame
print("\nScaled Tips Dataset:")
scaled_tips_df.head()


Scaled Tips Dataset:


Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_minmax,tip_minmax,total_bill_standard,tip_standard
0,0,16.99,1.01,Female,No,Sun,Dinner,2,0.291579,0.001111,-0.494024,-1.439947
1,1,,1.66,Male,No,Sun,Dinner,3,,0.073333,,-0.969205
2,2,21.01,3.5,Male,No,Sun,Dinner,3,0.375786,0.277778,-0.039587,0.363356
3,3,23.68,3.31,Male,No,Sun,Dinner,2,0.431713,0.256667,0.26224,0.225754
4,4,24.59,3.61,Female,No,Sun,Dinner,4,0.450775,0.29,0.36511,0.44302


In [29]:
##standardization
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
pd.DataFrame(df_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.724967,-0.499244,-1.439947,-1.343353,-0.784789,0.279158,-0.621582,-0.600193
1,-1.710769,-0.198219,-0.969205,0.744406,-0.784789,0.279158,-0.621582,0.453383
2,-1.696572,-0.005318,0.363356,0.744406,-0.784789,0.279158,-0.621582,0.453383
3,-1.682375,0.322738,0.225754,0.744406,-0.784789,0.279158,-0.621582,-0.600193
4,-1.668178,0.434547,0.443020,-1.343353,-0.784789,0.279158,-0.621582,1.506958
...,...,...,...,...,...,...,...,...
239,1.668178,0.980077,2.115963,0.744406,-0.784789,-0.802024,-0.621582,0.453383
240,1.682375,0.752773,-0.722971,-1.343353,1.274228,-0.802024,-0.621582,-0.600193
241,1.696572,0.198642,-0.722971,0.744406,1.274228,-0.802024,-0.621582,-0.600193
242,1.710769,-0.397264,-0.904026,0.744406,-0.784789,-0.802024,-0.621582,-0.600193


In [30]:
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
df_minmax = pd.DataFrame(min_max.fit_transform(df), columns= df.columns)
df_minmax.head()

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0.0,0.291579,0.001111,0.0,0.0,0.666667,0.0,0.2
1,0.004115,0.342899,0.073333,1.0,0.0,0.666667,0.0,0.4
2,0.00823,0.375786,0.277778,1.0,0.0,0.666667,0.0,0.4
3,0.012346,0.431713,0.256667,1.0,0.0,0.666667,0.0,0.2
4,0.016461,0.450775,0.29,0.0,0.0,0.666667,0.0,0.6


# **4. Create a new feature representing the average tip percentage for each dining party size.**

In [32]:
df['tip_percentage'] = df['tip'] / df['total_bill'] * 100
average_tip_percentage = df['tip_percentage'].mean()
print("Average tip percentage:", average_tip_percentage)
df

Average tip percentage: 14.995471884027804


Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,0,16.99,1.01,0,0,2,0,2,5.944673
1,1,19.44,1.66,1,0,2,0,3,8.539095
2,2,21.01,3.50,1,0,2,0,3,16.658734
3,3,23.68,3.31,1,0,2,0,2,13.978041
4,4,24.59,3.61,0,0,2,0,4,14.680765
...,...,...,...,...,...,...,...,...,...
239,239,29.03,5.92,1,0,1,0,3,20.392697
240,240,27.18,2.00,0,1,1,0,2,7.358352
241,241,22.67,2.00,1,1,1,0,2,8.822232
242,242,17.82,1.75,1,0,1,0,2,9.820426


# **5. Create a new feature based on total bill and tips if total bill is greater than 10dollars and tips is the greatest than 30dollars  mark as Highest-bills-with-tips otherwise Normal-bills**

In [34]:
df['bill_level'] = 'Normal-bills'
df.loc[(df['total_bill'] > 10) & (df['tip'] > 3), 'bill_level'] = 'Highest-bills-with-tips'
df

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,bill_level
0,0,16.99,1.01,0,0,2,0,2,5.944673,Normal-bills
1,1,19.44,1.66,1,0,2,0,3,8.539095,Normal-bills
2,2,21.01,3.50,1,0,2,0,3,16.658734,Highest-bills-with-tips
3,3,23.68,3.31,1,0,2,0,2,13.978041,Highest-bills-with-tips
4,4,24.59,3.61,0,0,2,0,4,14.680765,Highest-bills-with-tips
...,...,...,...,...,...,...,...,...,...,...
239,239,29.03,5.92,1,0,1,0,3,20.392697,Highest-bills-with-tips
240,240,27.18,2.00,0,1,1,0,2,7.358352,Normal-bills
241,241,22.67,2.00,1,1,1,0,2,8.822232,Normal-bills
242,242,17.82,1.75,1,0,1,0,2,9.820426,Normal-bills
