 Task 5 : Preprocess Data for Machine Learning

 Task: Clean and prepare data for modeling.
 Details:
 Handle missing values, normalize features, and encode categorical variables using scikit-learn.
 Split data into training and testing sets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

Data preprocessing and data Uploaded

In [2]:
df = pd.read_csv('ad_click_dataset.csv')
df

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...
9995,8510,User8510,,,Mobile,Top,Education,,0
9996,7843,User7843,,Female,Desktop,Bottom,Entertainment,,0
9997,3914,User3914,,Male,Mobile,Side,,Morning,0
9998,7924,User7924,,,Desktop,,Shopping,Morning,1


In [3]:
df.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0


In [4]:
df.tail()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
9995,8510,User8510,,,Mobile,Top,Education,,0
9996,7843,User7843,,Female,Desktop,Bottom,Entertainment,,0
9997,3914,User3914,,Male,Mobile,Side,,Morning,0
9998,7924,User7924,,,Desktop,,Shopping,Morning,1
9999,3056,User3056,44.0,Male,Tablet,Top,Social Media,Morning,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB


In [6]:
df.describe()

Unnamed: 0,id,age,click
count,10000.0,5234.0,10000.0
mean,5060.2114,40.197363,0.65
std,2861.758265,13.12642,0.476993
min,5.0,18.0,0.0
25%,2529.0,29.0,0.0
50%,5218.0,39.5,1.0
75%,7466.0,52.0,1.0
max,10000.0,64.0,1.0


In [7]:
df.drop_duplicates(inplace= True)

In [8]:
df.isnull().sum()

id                     0
full_name              0
age                 3476
gender              3400
device_type         1728
ad_position         1743
browsing_history    3480
time_of_day         1712
click                  0
dtype: int64

Fill the all missing values

In [9]:
df['age'] = df['age'].mean()
df.fillna(df['age'], inplace = True)


df['gender'] = df['gender'].mode()[0]
df.fillna(df['gender'], inplace = True)


df['device_type'] = df['device_type'].mode()[0]
df.fillna(df['device_type'], inplace = True)


df['ad_position'] = df['ad_position'].mode()[0]
df.fillna(df['ad_position'], inplace = True)


df['browsing_history'] = df['browsing_history'].mode()[0]
df.fillna(df['browsing_history'], inplace = True)


df['time_of_day'] = df['time_of_day'].mode()[0]
df.fillna(df['time_of_day'], inplace = True)

In [10]:
df.isnull().sum()

id                  0
full_name           0
age                 0
gender              0
device_type         0
ad_position         0
browsing_history    0
time_of_day         0
click               0
dtype: int64

In [11]:
df.shape

(7147, 9)

In [12]:
df['device_type'].unique()

array(['Desktop'], dtype=object)

In [13]:
df['ad_position'].unique()

array(['Bottom'], dtype=object)

In [14]:
df['browsing_history'].unique()

array(['Entertainment'], dtype=object)

In [15]:
df['time_of_day'].unique()

array(['Morning'], dtype=object)

Encode the categorical dataset into a numerical data

In [16]:
ohe = OneHotEncoder(sparse_output= False)
df['gender'] = ohe.fit_transform(df[['gender']])

oe_device = OrdinalEncoder(categories= [['Mobile', 'Tablet', 'Desktop']])
df['device_type'] = oe_device.fit_transform(df[['device_type']])


oe_ads = OrdinalEncoder(categories= [['Top', 'Side', 'Bottom']])
df['ad_position'] = oe_ads.fit_transform(df[['ad_position']])


oe_history = OrdinalEncoder(categories= [['Education', 'News', 'Social Media', 'Shopping', 'Entertainment']])
df['browsing_history'] = oe_history.fit_transform(df[['browsing_history']])


oe_time = OrdinalEncoder(categories= [['Morning', 'Afternoon', 'Evening', 'Night']])
df['time_of_day'] = oe_time.fit_transform(df[['time_of_day']])

in full_name feature col have both numerical and categorical value so we can split this it into the numeric and categorical col

In [17]:
df['full_name_numeric'] = df['full_name'].str.extract('(\d+)')
df['full_name_numeric'] = pd.to_numeric(df['full_name_numeric'], errors= 'coerce')

df['full_name_categorical'] = df['full_name'].str.extract('([a-zA-Z]+)')

df

  df['full_name_numeric'] = df['full_name'].str.extract('(\d+)')


Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click,full_name_numeric,full_name_categorical
0,670,User670,40.339417,1.0,2.0,2.0,4.0,0.0,1,670,User
1,3044,User3044,40.339417,1.0,2.0,2.0,4.0,0.0,1,3044,User
2,5912,User5912,40.339417,1.0,2.0,2.0,4.0,0.0,1,5912,User
3,5418,User5418,40.339417,1.0,2.0,2.0,4.0,0.0,1,5418,User
4,9452,User9452,40.339417,1.0,2.0,2.0,4.0,0.0,0,9452,User
...,...,...,...,...,...,...,...,...,...,...,...
9995,8510,User8510,40.339417,1.0,2.0,2.0,4.0,0.0,0,8510,User
9996,7843,User7843,40.339417,1.0,2.0,2.0,4.0,0.0,0,7843,User
9997,3914,User3914,40.339417,1.0,2.0,2.0,4.0,0.0,0,3914,User
9998,7924,User7924,40.339417,1.0,2.0,2.0,4.0,0.0,1,7924,User


In [18]:
df['full_name_categorical'].unique()

array(['User'], dtype=object)

In [19]:
ohe_user = OneHotEncoder(sparse_output= False)
df['full_name_categorical'] = ohe_user.fit_transform(df[['full_name_categorical']])

In [20]:
df = df.drop(columns = 'full_name')
df

Unnamed: 0,id,age,gender,device_type,ad_position,browsing_history,time_of_day,click,full_name_numeric,full_name_categorical
0,670,40.339417,1.0,2.0,2.0,4.0,0.0,1,670,1.0
1,3044,40.339417,1.0,2.0,2.0,4.0,0.0,1,3044,1.0
2,5912,40.339417,1.0,2.0,2.0,4.0,0.0,1,5912,1.0
3,5418,40.339417,1.0,2.0,2.0,4.0,0.0,1,5418,1.0
4,9452,40.339417,1.0,2.0,2.0,4.0,0.0,0,9452,1.0
...,...,...,...,...,...,...,...,...,...,...
9995,8510,40.339417,1.0,2.0,2.0,4.0,0.0,0,8510,1.0
9996,7843,40.339417,1.0,2.0,2.0,4.0,0.0,0,7843,1.0
9997,3914,40.339417,1.0,2.0,2.0,4.0,0.0,0,3914,1.0
9998,7924,40.339417,1.0,2.0,2.0,4.0,0.0,1,7924,1.0


Initialize the feature cols and target col

In [21]:
X = df.drop(columns = 'click')
Y = df['click']

split the data into train and test data

In [22]:
xtrain , xtest, ytrain, ytest = train_test_split(X, Y, test_size= 0.2, random_state= 2)

create the model's object then train the model and predicted it

In [23]:
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
y_pred = lr.predict(xtest)
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

random input data

In [24]:
y_pred_input = lr.predict([[7924,	40.339417,	1.0,	2.0,	2.0,	4.0,	0.0, 7924,	1.0]])
y_pred_input



array([1], dtype=int64)

Check the performance of the model

In [25]:
acc_score = accuracy_score(ytest, y_pred)
print("The accuracy_score of the logistic regressor is", acc_score)

The accuracy_score of the logistic regressor is 0.4993006993006993


In [26]:
f1 = f1_score(ytest, y_pred)
print("the f1 score of the logistic regressor is", f1)

the f1 score of the logistic regressor is 0.6660447761194029


evaluate the K fold Cross val

In [27]:
kf = KFold(n_splits= 5)

cross_val_lr = cross_val_score(lr, X, Y, cv= kf)
cross_val_lr_mean = cross_val_lr.sum()/ len(cross_val_lr)
print("The cross validation score of logistic regressor is", cross_val_lr_mean)

The cross validation score of logistic regressor is 0.46355180159238946


Data can may have a more missing values when we fill the missing values by mean / median/ mode it can affect on the variance, correlation, co-variance and statistics etce

Check the  co- variance, correlation and variance of the transform data

In [28]:
X.var()

id                       8.212268e+06
age                      2.019767e-28
gender                   0.000000e+00
device_type              0.000000e+00
ad_position              0.000000e+00
browsing_history         0.000000e+00
time_of_day              0.000000e+00
full_name_numeric        8.212268e+06
full_name_categorical    0.000000e+00
dtype: float64

In [29]:
X.cov()

Unnamed: 0,id,age,gender,device_type,ad_position,browsing_history,time_of_day,full_name_numeric,full_name_categorical
id,8212268.0,-1.348538e-26,0.0,0.0,0.0,0.0,0.0,8212268.0,0.0
age,-1.348538e-26,2.019767e-28,0.0,0.0,0.0,0.0,0.0,-1.348538e-26,0.0
gender,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
device_type,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ad_position,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
browsing_history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
time_of_day,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
full_name_numeric,8212268.0,-1.348538e-26,0.0,0.0,0.0,0.0,0.0,8212268.0,0.0
full_name_categorical,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
X.corr()

Unnamed: 0,id,age,gender,device_type,ad_position,browsing_history,time_of_day,full_name_numeric,full_name_categorical
id,1.0,,,,,,,1.0,
age,,,,,,,,,
gender,,,,,,,,,
device_type,,,,,,,,,
ad_position,,,,,,,,,
browsing_history,,,,,,,,,
time_of_day,,,,,,,,,
full_name_numeric,1.0,,,,,,,1.0,
full_name_categorical,,,,,,,,,
