In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('KAG_conversion_data.csv')

In [3]:
df.head()

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion
0,708746,916,103916,30-34,M,15,7350,1,1.43,2,1
1,708749,916,103917,30-34,M,16,17861,2,1.82,2,0
2,708771,916,103920,30-34,M,20,693,0,0.0,1,0
3,708815,916,103928,30-34,M,28,4259,1,1.25,1,0
4,708818,916,103928,30-34,M,28,4133,1,1.29,1,1


In [4]:
# Add a binary column for Approved_Conversion
df['Approved_Conversion_Binary'] = df['Approved_Conversion'].apply(lambda x: 1 if x > 0 else 0)

In [5]:
df.head()

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,Approved_Conversion_Binary
0,708746,916,103916,30-34,M,15,7350,1,1.43,2,1,1
1,708749,916,103917,30-34,M,16,17861,2,1.82,2,0,0
2,708771,916,103920,30-34,M,20,693,0,0.0,1,0,0
3,708815,916,103928,30-34,M,28,4259,1,1.25,1,0,0
4,708818,916,103928,30-34,M,28,4133,1,1.29,1,1,1


In [6]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values

ad_id                         0
xyz_campaign_id               0
fb_campaign_id                0
age                           0
gender                        0
interest                      0
Impressions                   0
Clicks                        0
Spent                         0
Total_Conversion              0
Approved_Conversion           0
Approved_Conversion_Binary    0
dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label encoding for 'gender'
label_encoder = LabelEncoder()
df['gender_encoded'] = label_encoder.fit_transform(df['gender'])

# One-hot encoding for 'age'
one_hot_encoder = OneHotEncoder(sparse=False)
age_one_hot = one_hot_encoder.fit_transform(df[['age']])
age_one_hot_df = pd.DataFrame(age_one_hot, columns=[f"age_{cat}" for cat in one_hot_encoder.categories_[0]])

# Concatenate the one-hot encoded columns with the original dataframe
df = pd.concat([df.drop(columns=['age']), age_one_hot_df], axis=1)



In [8]:
df = df.drop(columns=['gender', 'ad_id', 'xyz_campaign_id', 'fb_campaign_id', 'Approved_Conversion'])

In [9]:
df.head()

Unnamed: 0,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion_Binary,gender_encoded,age_30-34,age_35-39,age_40-44,age_45-49
0,15,7350,1,1.43,2,1,1,1.0,0.0,0.0,0.0
1,16,17861,2,1.82,2,0,1,1.0,0.0,0.0,0.0
2,20,693,0,0.0,1,0,1,1.0,0.0,0.0,0.0
3,28,4259,1,1.25,1,0,1,1.0,0.0,0.0,0.0
4,28,4133,1,1.29,1,1,1,1.0,0.0,0.0,0.0


# Black box implementation

## kernel = linear

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define the features and target variable
X = df.drop(columns=['Approved_Conversion_Binary'])
y = df['Approved_Conversion_Binary']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the SVM model
svm_model = SVC(kernel='linear') 

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 58.31%


In [12]:
# Detailed classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.48      0.53       168
           1       0.58      0.69      0.63       175

    accuracy                           0.58       343
   macro avg       0.58      0.58      0.58       343
weighted avg       0.58      0.58      0.58       343



## Cross-validation

In [13]:
from sklearn.model_selection import cross_val_score

# Define the features and target variable
X = df.drop(columns=['Approved_Conversion_Binary'])
y = df['Approved_Conversion_Binary']

# Initialize the SVM model
svm_model = SVC(kernel='linear')

scores = cross_val_score(svm_model, X, y, cv=5, scoring='accuracy')

print(f'Cross-Validation Accuracy Scores: {scores}')
print(f'Average Accuracy: {scores.mean() * 100:.2f}%')
print(f'Standard Deviation: {scores.std() * 100:.2f}%')


Cross-Validation Accuracy Scores: [0.51091703 0.54585153 0.86899563 0.51754386 0.53070175]
Average Accuracy: 59.48%
Standard Deviation: 13.76%


In [14]:
from sklearn.model_selection import cross_validate

# Define the features and target variable
X = df.drop(columns=['Approved_Conversion_Binary'])
y = df['Approved_Conversion_Binary']

# Initialize the SVM model
svm_model = SVC(kernel='linear')

# Perform cross-validation
cv_results = cross_validate(svm_model, X, y, cv=5, return_estimator=True)

# Iterate over each fold and print the classification report
for i, estimator in enumerate(cv_results['estimator']):
    y_pred = estimator.predict(X)
    print(f'Fold {i + 1} Classification Report:')
    print(classification_report(y, y_pred))


Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.64      0.63       559
           1       0.64      0.63      0.64       584

    accuracy                           0.63      1143
   macro avg       0.63      0.63      0.63      1143
weighted avg       0.63      0.63      0.63      1143

Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.54      0.57       559
           1       0.61      0.67      0.64       584

    accuracy                           0.61      1143
   macro avg       0.61      0.61      0.61      1143
weighted avg       0.61      0.61      0.61      1143

Fold 3 Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.50      0.56       559
           1       0.60      0.71      0.65       584

    accuracy                           0.61      1143
   macro avg       0.61      0.61     