In [3]:
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('E:/Coding/DatabricksETL/ad_10000records.csv')
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,61.660757,35.9401,53840.047721,177.759831,0.4917
std,15.704142,8.572973,13343.708718,40.820951,0.499956
min,32.6,19.0,13996.5,105.22,0.0
25%,48.86,29.0,44052.3025,140.15,0.0
50%,59.59,35.0,56180.93,178.92,0.0
75%,76.58,42.0,61840.26,212.67,1.0
max,90.97,60.0,79332.33,269.96,1.0


In [4]:
from sklearn.model_selection import train_test_split

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [5]:
import pandas as pd

# Get sorted list of unique countries
countries = sorted(df['Country'].unique())

# Make mapping dict: {country_name: index}
country_to_idx = {country: i+1 for i, country in enumerate(countries)}  # start at 1

# Apply mapping
df['CountryIndex'] = df['Country'].map(country_to_idx)

# Confirm
print(df[['Country', 'CountryIndex']].drop_duplicates().sort_values('Country'))

                Country  CountryIndex
37          Afghanistan             1
74              Albania             2
45              Algeria             3
71       American Samoa             4
1945            Andorra             5
...                 ...           ...
1575  Wallis and Futuna           203
19       Western Sahara           204
272               Yemen           205
3                Zambia           206
227            Zimbabwe           207

[207 rows x 2 columns]


In [6]:
# Features: drop target + original Country
X = df.drop(columns=['Clicked on Ad', 'Country'])
y = df['Clicked on Ad']
""
# Split with same random state to align
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score
from sklearn.preprocessing import StandardScaler

# df is your original DataFrame; target column is 'Clicked on Ad' (0/1)
def prepare_data(df, target_col='Clicked on Ad', country_col='Country', test_size=0.2, random_state=42):
    # 1) map countries alphabetically to indices starting at 1
    countries_sorted = sorted(df[country_col].dropna().unique())
    country_to_idx = {c: i+1 for i, c in enumerate(countries_sorted)}
    df = df.copy()
    df['CountryIndex'] = df[country_col].map(country_to_idx)

    # 2) optionally coerce numeric columns
    # (coerce non-numeric into NaN, then you can impute or drop)
    numeric_cols = df.select_dtypes(include=['number', 'bool']).columns.tolist()
    # ensure CountryIndex and target are included
    if 'CountryIndex' not in numeric_cols:
        numeric_cols = numeric_cols + ['CountryIndex']
    numeric_cols = [c for c in numeric_cols if c != target_col]

    # 3) split
    train_df, val_df = train_test_split(df, test_size=test_size, random_state=random_state, shuffle=True, stratify=df[target_col])

    # 4) scale numeric features (save scaler if needed)
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train_df[numeric_cols])
    X_val_num   = scaler.transform(val_df[numeric_cols])

    y_train = train_df[target_col].values
    y_val   = val_df[target_col].values

    return {
        'train_df': train_df,
        'val_df': val_df,
        'X_train_num': X_train_num,
        'X_val_num': X_val_num,
        'y_train': y_train,
        'y_val': y_val,
        'numeric_cols': numeric_cols,
        'country_to_idx': country_to_idx,
        'scaler': scaler
    }

In [48]:
from pyffm import PyFFM

# pyFFM expects a DataFrame with a label column (default 'click')
# rename target to 'click' for pyFFM or pass label name if supported
prep = prepare_data(df)
train_df = prep['train_df'].copy()
val_df   = prep['val_df'].copy()

# pyFFM expects the label column named 'click' by default — rename
train_df = train_df.rename(columns={'Clicked on Ad': 'click'})
val_df   = val_df.rename(columns={'Clicked on Ad': 'click'})

# if you want to use only the numeric columns (including CountryIndex), pass only those
features_to_use = prep['numeric_cols']  # e.g., ['Age','Daily Time Spent on Site', 'CountryIndex', ...]
train_input = train_df[features_to_use + ['click']].reset_index(drop=True)
val_input   = val_df[features_to_use + ['click']].reset_index(drop=True)

In [24]:
import numpy as np

def model_size(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    return sum([np.prod(p.size()) for p in model_parameters])

In [61]:
from sklearn.metrics import log_loss, f1_score

# instantiate: you can choose model='ffm' or 'fm'
training_params = {'epochs': 110, 'reg_lambda': 5e-3, 'learn_rate': 5e-3, 'num_latent': 10}
pyffm = PyFFM(model='ffm', training_params=training_params)
pyffm.train(train_input)

preds = pyffm.predict(val_input.drop(columns='click'))  # returns predictions (probabilities/scores)

# evaluate
y_val_proba = np.array(preds).ravel()
val_logloss = log_loss(val_input['click'].values, y_val_proba)
y_val_pred = (y_val_proba >= 0.5).astype(int)
val_f1 = f1_score(val_input['click'].values, y_val_pred) # trains; expects a pandas DataFrame

print("pyFFM val logloss:", val_logloss)
print("pyFFM val F1:", val_f1)

[2025-08-17 18:48:41,794.794] -     INFO: pyffm.pyffm -- Formatting dataframe
[2025-08-17 18:48:42,005.005] -     INFO: pyffm.engine.ffm_engine -- Creating ffm model with 5 fields and 1613 features.
[2025-08-17 18:48:42,007.007] -     INFO: pyffm.engine.ffm_engine -- Epoch 0
[2025-08-17 18:48:42,008.008] -     INFO: pyffm.engine.ffm_engine -- Training on 7200 rows.
[2025-08-17 18:48:42,079.079] -     INFO: pyffm.engine.ffm_engine -- Full train done, took 0.1s
[2025-08-17 18:48:42,080.080] -     INFO: pyffm.engine.ffm_engine -- Calculating logloss
[2025-08-17 18:48:42,083.083] -     INFO: pyffm.engine.ffm_engine -- Logloss: 34.24879870054765, 
Took 0.0s
[2025-08-17 18:48:42,085.085] -     INFO: pyffm.engine.ffm_engine -- Epoch 1
[2025-08-17 18:48:42,089.089] -     INFO: pyffm.engine.ffm_engine -- Training on 7200 rows.
[2025-08-17 18:48:42,184.184] -     INFO: pyffm.engine.ffm_engine -- Full train done, took 0.1s
[2025-08-17 18:48:42,184.184] -     INFO: pyffm.engine.ffm_engine -- Calcu

In [63]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import CategoricalNB

# Suppose features_to_use contains both categorical and numeric columns
X_train = train_df[features_to_use]
y_train = train_df['click']
X_val = val_df[features_to_use]
y_val = val_df['click']

# Identify numeric columns
numeric_cols = X_train.select_dtypes(include='number').columns
categorical_cols = [c for c in features_to_use if c not in numeric_cols]

# Bin numeric columns
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X_train[numeric_cols] = kbins.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = kbins.transform(X_val[numeric_cols])

# Encode all categorical columns (including binned numeric ones)
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)

# Train Naive Bayes
nb = CategoricalNB()
nb.fit(X_train_encoded, y_train)

y_pred = nb.predict(X_val[numeric_cols])

y_val_pred = (y_pred >= 0.5).astype(int)
val_f1 = f1_score(val_input['click'].values, y_val_pred)

print("Categorical NB val logloss:", val_logloss)
print("Categorical NB val F1:", val_f1)

Categorical NB val logloss: 8.794651426944585
Categorical NB val F1: 0.7025316455696202


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numeric_cols] = kbins.fit_transform(X_train[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[numeric_cols] = kbins.transform(X_val[numeric_cols])


In [64]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train[numeric_cols], y_train)

y_pred = gnb.predict(X_val[numeric_cols])

y_val_pred = (y_pred >= 0.5).astype(int)
val_f1 = f1_score(val_input['click'].values, y_val_pred)

print("Gaussian NB val logloss:", val_logloss)
print("Gaussian NB val F1:", val_f1)

Gaussian NB val logloss: 8.794651426944585
Gaussian NB val F1: 0.6853994490358126


In [65]:
import numpy as np

# Suppose y_val contains your true labels
classes, counts = np.unique(y_val, return_counts=True)
class_probs = counts / counts.sum()
print("Class probabilities:", dict(zip(classes, class_probs)))

y_random = np.random.choice([0, 1], size=len(y_val))
accuracy = f1_score(y_random, y_val)
print("Random uniform F1:", accuracy)

y_random_weighted = np.random.choice(classes, size=len(y_val), p=class_probs)
accuracy = f1_score(y_random_weighted, y_val)
print("Random weighted F1:", accuracy)

Class probabilities: {np.int64(0): np.float64(0.5085), np.int64(1): np.float64(0.4915)}
Random uniform F1: 0.5056566650270536
Random weighted F1: 0.4820253164556962
