## An Ad- Agency analyzed a dataset of online ads and used a machine learning model to predict whether a user would click on an ad or not.

In [48]:
import pandas as pd
import kaggle
import gzip
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [3]:
dataset = 'avazu-ctr-prediction'

kaggle.api.competition_download_file(dataset, file_name='train.gz')

Downloading train.gz to C:\jupyter\iNeuron Assignments\Machine Learning


100%|█████████████████████████████████████████████████████████████████████████████| 1.04G/1.04G [01:41<00:00, 11.1MB/s]







In [31]:
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')

with gzip.open(f'Data/train.gz', 'rb') as zip_ref:
    df = pd.read_csv(zip_ref, parse_dates=['hour'], date_parser=parse_date)

  parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')


In [32]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,2014-10-21,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [33]:
df.shape

(40428967, 24)

In [34]:
df['click'].value_counts()

0    33563901
1     6865066
Name: click, dtype: int64

#### Since it's a huge dataset, we will do 2 things...

1. Take sample of 5Million rows
2. Since data is highly imbalanced, we will use stratify method

In [35]:
X = df.drop('click', axis=1)
y = df['click']

# Use stratified sampling to obtain a balanced sample of 5 million rows
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=5000000, stratify=y, random_state=0)

# Combine the sampled features and target variable into a DataFrame
new_df = pd.concat([X_sample, y_sample], axis=1)

### Data Preparation

In [36]:
new_df['month'] = new_df['hour'].dt.month
new_df['dayofweek'] = new_df['hour'].dt.dayofweek
new_df['day'] = new_df['hour'].dt.day
new_df['hour_time'] = new_df['hour'].dt.hour
new_df.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C17,C18,C19,C20,C21,click,month,dayofweek,day,hour_time
35477700,2.357702e+18,2014-10-29 17:00:00,1002,0,294411bf,c4e18dd6,50e219e0,ecad2386,7801e8d9,07d7df22,...,2291,3,35,100200,43,0,10,2,29,17
27157201,6.872195e+18,2014-10-28 00:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1722,0,35,-1,79,1,10,1,28,0
3341989,1.260099e+19,2014-10-21 17:00:00,1010,1,85f751fd,c4e18dd6,50e219e0,ffc6ffd0,7801e8d9,0f2161f8,...,2513,3,35,-1,68,1,10,1,21,17
23794119,1.120772e+19,2014-10-26 23:00:00,1005,0,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,07d7df22,...,1863,3,39,-1,23,0,10,6,26,23
33781004,5.90887e+18,2014-10-29 08:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,2667,0,35,-1,221,0,10,2,29,8


In [37]:
# Dealing with outliers by capping

col = ['C15', 'C16', 'C19', 'C21']
for col in col:
    percentiles = new_df[col].quantile(0.98)
    if new_df[col].quantile(0.98) < 0.5 * new_df[col].max():
        new_df[col][new_df[col] >= percentiles] = percentiles

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col][new_df[col] >= percentiles] = percentiles
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col][new_df[col] >= percentiles] = percentiles


In [39]:
# segrigating numerical and categorical variables.

numerical = []
categorical = []

for col in (new_df.columns):
    if new_df[col].dtype == "object":
        categorical.append(col)
    else:
        numerical.append(col)
print("numerical columns = ",numerical)
print("\ncategorical columns = ",categorical)

numerical columns =  ['id', 'hour', 'C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'click', 'month', 'dayofweek', 'day', 'hour_time']

categorical columns =  ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model']


In [40]:
new_df.drop(['id', 'hour'], axis = 1, inplace = True) 
new_df.rename(columns={'hour_time': 'hour'},
          inplace=True, errors='raise')

In [41]:
X = new_df.drop(['click'], axis=1)
y = new_df['click']

In [44]:
target_encoder = ce.TargetEncoder()
X = target_encoder.fit_transform(X, y)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y, 
                                                    test_size= 0.25, 
                                                    random_state= 42)

In [49]:
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

### Model Training

In [50]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [51]:
y_pred = rf.predict(X_test)

In [52]:
from sklearn.metrics  import accuracy_score, precision_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Accuracy:", accuracy)
print("F1-score:", f1)

Precision: 0.7341620061562092
Accuracy: 0.890688
F1-score: 0.6343786792250883
