In [12]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import classification_report

In [13]:
# Read the processed dataset from the project's data directory, then print
# a per-column summary (non-null counts and dtypes) as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='../data/2425_QC_processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  990 non-null    float64
 1   Age                       990 non-null    float64
 2   Area Income               990 non-null    float64
 3   Daily Internet Usage      990 non-null    float64
 4   Ad Topic Line             990 non-null    object 
 5   City                      990 non-null    object 
 6   Male                      990 non-null    int64  
 7   Country                   990 non-null    object 
 8   Timestamp                 990 non-null    object 
 9   Clicked on Ad             990 non-null    int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 77.5+ KB


In [14]:
# Keep only the numeric columns. The object-typed columns (Ad Topic Line,
# City, Country, Timestamp) are discarded, so no text/categorical feature
# reaches the model.
df = df.select_dtypes(include=['number'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  990 non-null    float64
 1   Age                       990 non-null    float64
 2   Area Income               990 non-null    float64
 3   Daily Internet Usage      990 non-null    float64
 4   Male                      990 non-null    int64  
 5   Clicked on Ad             990 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 46.5 KB


In [15]:
# Separate the target column from the predictor columns.
y = df['Clicked on Ad']
X = df.drop(columns='Clicked on Ad')

# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Partition the feature columns by dtype so each group can be routed to its
# own transformer. X is built from the numeric-only frame, so
# categorical_fts is expected to be an empty list here.
numerical_fts = list(X.select_dtypes(include="number").columns)
categorical_fts = list(X.select_dtypes(include="object").columns)

In [17]:
# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numerical ones. Each entry is (name, transformer, columns).
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(), categorical_fts),
        ("num", StandardScaler(), numerical_fts),
    ]
)

In [18]:
# Random forest pipeline: preprocessing first, then the classifier. The
# classifier seed is fixed for reproducibility.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)


In [19]:
# Hyperparameter grid for the random forest. Keys use the "<step>__<param>"
# form so GridSearchCV routes each value to the pipeline's "classifier" step.
param_grid = {
    'classifier__n_estimators': [50, 200, 500],
    'classifier__max_depth': [5, 10, 20]
}

# Tune the full pipeline rather than the bare classifier step, so the
# preprocessor is fitted on each CV training fold and applied to the data
# before the classifier sees it (both during search and at predict time).
best_rf_model = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

# Run the 5-fold grid search on the training data.
best_rf_model.fit(X_train, y_train)

# Predict on the held-out test set with the best configuration found.
y_pred = best_rf_model.predict(X_test)

# Per-class precision / recall / F1 on the test set.
report = classification_report(y_test, y_pred)

print("Random Forest:")
print(report)

Random Forest:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        99
           1       0.97      0.97      0.97        99

    accuracy                           0.97       198
   macro avg       0.97      0.97      0.97       198
weighted avg       0.97      0.97      0.97       198



In [20]:
# Gradient boosting pipeline: same preprocessing, different classifier. This
# rebinds `pipeline`, replacing the random forest pipeline defined earlier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(random_state=42)),
    ]
)

In [21]:
# Hyperparameter grid for gradient boosting. Keys use the "<step>__<param>"
# form so GridSearchCV routes each value to the pipeline's "classifier" step.
param_grid = {
    'classifier__n_estimators': [50, 200, 500],
    'classifier__max_depth': [5, 10, 20]
}

# Tune the full pipeline rather than the bare classifier step, so the
# preprocessor is fitted on each CV training fold and applied to the data
# before the classifier sees it (both during search and at predict time).
best_gb_model = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

# Run the 5-fold grid search on the training data.
best_gb_model.fit(X_train, y_train)

# Predict on the held-out test set with the best configuration found.
y_pred = best_gb_model.predict(X_test)

# Per-class precision / recall / F1 on the test set.
report = classification_report(y_test, y_pred)

print("Gradient Boosting:")
print(report)

Gradient Boosting:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97        99
           1       0.98      0.97      0.97        99

    accuracy                           0.97       198
   macro avg       0.97      0.97      0.97       198
weighted avg       0.97      0.97      0.97       198

