In [1]:
import pandas as pd
import numpy as np
import gurobipy as gbtype(y_transformed)
from gurobipy import *
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('KAG_conversion_data.csv')

In [3]:
data.head()

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion
0,708746,916,103916,30-34,M,15,7350,1,1.43,2,1
1,708749,916,103917,30-34,M,16,17861,2,1.82,2,0
2,708771,916,103920,30-34,M,20,693,0,0.0,1,0
3,708815,916,103928,30-34,M,28,4259,1,1.25,1,0
4,708818,916,103928,30-34,M,28,4133,1,1.29,1,1


In [6]:
# Add a binary column for Approved_Conversion
data['Approved_Conversion_Binary'] = data['Approved_Conversion'].apply(lambda x: 1 if x > 0 else 0)

In [7]:
data.head()

Unnamed: 0,ad_id,xyz_campaign_id,fb_campaign_id,age,gender,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion,Approved_Conversion_Binary
0,708746,916,103916,30-34,M,15,7350,1,1.43,2,1,1
1,708749,916,103917,30-34,M,16,17861,2,1.82,2,0,0
2,708771,916,103920,30-34,M,20,693,0,0.0,1,0,0
3,708815,916,103928,30-34,M,28,4259,1,1.25,1,0,0
4,708818,916,103928,30-34,M,28,4133,1,1.29,1,1,1


In [8]:
# Check for missing values
missing_values = data.isnull().sum()
missing_values

ad_id                         0
xyz_campaign_id               0
fb_campaign_id                0
age                           0
gender                        0
interest                      0
Impressions                   0
Clicks                        0
Spent                         0
Total_Conversion              0
Approved_Conversion           0
Approved_Conversion_Binary    0
dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label encoding for 'gender'
label_encoder = LabelEncoder()
data['gender_encoded'] = label_encoder.fit_transform(data['gender'])

# One-hot encoding for 'age'
one_hot_encoder = OneHotEncoder(sparse=False)
age_one_hot = one_hot_encoder.fit_transform(data[['age']])
age_one_hot_df = pd.DataFrame(age_one_hot, columns=[f"age_{cat}" for cat in one_hot_encoder.categories_[0]])

# Concatenate the one-hot encoded columns with the original dataframe
data = pd.concat([data.drop(columns=['age']), age_one_hot_df], axis=1)



In [10]:
data = data.drop(columns=['gender', 'ad_id', 'xyz_campaign_id', 'fb_campaign_id', 'Approved_Conversion'])

In [11]:
data.head()

Unnamed: 0,interest,Impressions,Clicks,Spent,Total_Conversion,Approved_Conversion_Binary,gender_encoded,age_30-34,age_35-39,age_40-44,age_45-49
0,15,7350,1,1.43,2,1,1,1.0,0.0,0.0,0.0
1,16,17861,2,1.82,2,0,1,1.0,0.0,0.0,0.0
2,20,693,0,0.0,1,0,1,1.0,0.0,0.0,0.0
3,28,4259,1,1.25,1,0,1,1.0,0.0,0.0,0.0
4,28,4133,1,1.29,1,1,1,1.0,0.0,0.0,0.0


## Initialising the predictor and target variables

In [12]:
X = data.drop(['Approved_Conversion_Binary'],axis=1)
y = data['Approved_Conversion_Binary']

In [13]:
# transform y from a series of 0 and 1 to -1 and 1
# 
y_transformed = [1 if yi == 1 else -1 for yi in y]

In [14]:
type(y_transformed)

list

## Building the Linear SVM Optimizer using a Linear Optimization Model

For a binary classifier problem, a support vector machine (SVM) finds the best separating hyperplane $\alpha^T x + \beta = 0$ using two supporting hyperplanes (one for each group), where $\alpha$ is the coefficient matrix and $\beta$ is the constant vector. 

Formulating the linear classification problem as such:

For $y_i = 1$: $\alpha^T x + \beta \geq 1$

Conversely, for $y_i = -1$: $\alpha^T x + \beta \leq 1$

The use of values 1 and -1 is equivalent to using $k$ and $-k$ as we may scale $\alpha$ and $\beta$.

Introducing two arbitrary vectors $x_1$ and $x_2$, where $x_1$ falls on the supporting hyperplane:  $\alpha^T x + \beta = 1$

and $x_2$ fall on the supporting hyperplane: $\alpha^T x + \beta = -1$

Maximizing the distance between the two supporting hyperplanes is equivalent to the projection of $x_1 - x_2$ onto the normal vector, $\alpha$. Therefore the length is:

$$ \frac {(x_1 - x_2)^T \alpha}{||\alpha||} $$

Given that $x_1$ and $x_2$ are supporting, $(x_1 - x_2)^T \alpha = 2 $ and therefore the length becomes: 

$$ \frac {2}{||\alpha||} $$

This form is equivalent to minimizing $ \frac {1}{2} ||\alpha||$ which is the same as minimizing $\frac{1}{2} \sum_{k=1}^{m} \alpha^2_i$.

Thereafter, the optimization problem can be re-evaluated as:

$$ \min \frac{1}{2} \sum_{k=1}^{m} \alpha^2_i $$

subject to: 

$$ y_i(\alpha^T x_i + \beta) \geq 1 \quad \forall i \in \{1, \ldots, n\} $$

### For imperfect separation

Introducing errors into objective function, the SVM problem then becomes: 

$$ \min \frac{1}{2} \sum_{k=1}^{m} \alpha^2_i + C\sum_{i=1}^{n} \gamma_i $$

subject to: 
$$ y_i(\alpha^T x_i + \beta) \geq 1 - \gamma_i \quad \forall i \in \{1, \ldots, n\} $$
$$ \gamma_i \geq 0 \quad \forall i \in \{1, \ldots, n\} $$

where $\gamma_i$ denotes the degree of violation (difference from a data point) and $C \geq 0 $ accounts for the penalty incurred with classification. 

In [15]:
num_features = X.shape[1]
num_samples = len(y_transformed)

# Create gb model
SVM = gb.Model("Linear Classifier")

# Decision Variables
alpha = SVM.addVars(num_features, vtype=gb.GRB.CONTINUOUS, lb=-gb.GRB.INFINITY, ub=gb.GRB.INFINITY, name=["weight {}".format(i) for i in range(num_features)])
bias = SVM.addVar(vtype=gb.GRB.CONTINUOUS, ub=gb.GRB.INFINITY, name="bias")
gamma = SVM.addVars(num_samples, lb=0, vtype=gb.GRB.CONTINUOUS, name="violation")
C = 1  # Regularization parameter

# Objective Function
objective = 0.5 * gb.quicksum(alpha[i]*alpha[i] for i in range(num_features)) + C * gb.quicksum(gamma[i] for i in range(num_samples))
SVM.setObjective(objective, gb.GRB.MINIMIZE)

# Constraints
for i in range(num_samples):
    SVM.addConstr(y_transformed[i] * (sum(alpha[j] * X.iloc[i, j] for j in range(num_features)) + bias) >= 1 - gamma[i])

# Optimize
SVM.optimize()

# Retrieve and print the solution
solution = {var.VarName: var.X for var in alpha.values()}
solution['bias'] = bias.X
print("Optimal solution:", solution)

Restricted license - for non-production use only - expires 2024-10-28
Gurobi Optimizer version 10.0.2 build v10.0.2rc0 (mac64[x86])

CPU model: Intel(R) Core(TM) i5-8257U CPU @ 1.40GHz
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads



GurobiError: Model too large for size-limited license; visit https://www.gurobi.com/free-trial for a full license

## Confusion Matrix for Comparisons with Actual Data

In [None]:
# Convert alpha to a numpy array for easier manipulation
alpha_values = np.array([alpha[i].X for i in range(num_features)])

# Compute predictions
predictions = []
for i in range(num_samples):
    decision_value = sum(alpha_values[j] * X.iloc[i, j] for j in range(num_features)) + bias.X
    predicted_class = 1 if decision_value >= 0 else -1
    predictions.append(predicted_class)

# Actual labels
actual_labels = y_transformed  # Assuming y_transformed is a numpy array or similar structure

# Construct confusion matrix
conf_matrix = confusion_matrix(actual_labels, predictions, labels=[1, -1])

# Print confusion matrix
print("Confusion Matrix:\n", conf_matrix)

# Plotting the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=[1, -1], yticklabels=[1, -1])
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(actual_labels, predictions)

print(f'Accuracy: {accuracy:.4f}')

## Cross-Validation to determine best C (a form of hyperparameter)

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import gurobipy as gb

# Assuming X, y_transformed, and other necessary libraries are already defined

# Define range of C values to test
C_values = [0.01, 0.1, 1, 10, 100]

# Initialize k-fold cross-validation
kf = KFold(n_splits=5)  # Example: 5-fold cross-validation

best_C = None
highest_accuracy = 0

all_accuracies = []  # To store mean accuracies for all C values

for C in C_values:
    accuracies = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        
        # Use list comprehension to index y_transformed when it's a list
        y_train = [y_transformed[i] for i in train_index]
        y_val = [y_transformed[i] for i in val_index]
        
        num_features = X_train.shape[1]
        num_samples = len(y_train)
        
        # Create gb model for current C
        SVM = gb.Model("Linear Classifier")
        
        # Suppress Gurobi output
        SVM.setParam('OutputFlag', 0)
        
        # Decision Variables
        alpha = SVM.addVars(num_features, vtype=gb.GRB.CONTINUOUS, lb=-gb.GRB.INFINITY, ub=gb.GRB.INFINITY, name=["weight {}".format(i) for i in range(num_features)])
        bias = SVM.addVar(vtype=gb.GRB.CONTINUOUS, ub=gb.GRB.INFINITY, name="bias")
        gamma = SVM.addVars(num_samples, lb=0, vtype=gb.GRB.CONTINUOUS, name="violation")
        
        # Objective Function
        objective = 0.5 * gb.quicksum(alpha[i]*alpha[i] for i in range(num_features)) + C * gb.quicksum(gamma[i] for i in range(num_samples))
        SVM.setObjective(objective, gb.GRB.MINIMIZE)
        
        # Constraints
        for i in range(num_samples):
            SVM.addConstr(y_train[i] * (sum(alpha[j] * X_train.iloc[i, j] for j in range(num_features)) + bias) >= 1 - gamma[i])
        
        # Optimize
        SVM.optimize()
        
        # Evaluate the model on validation set
        correct_predictions = 0
        for i in range(len(X_val)):
            prediction = sum(alpha[j].X * X_val.iloc[i, j] for j in range(num_features)) + bias.X
            if (prediction > 0 and y_val[i] == 1) or (prediction <= 0 and y_val[i] == -1):
                correct_predictions += 1
        accuracy = correct_predictions / len(X_val)
        accuracies.append(accuracy)
    
    # Compute average accuracy for this C and store
    avg_accuracy = np.mean(accuracies)
    all_accuracies.append((C, avg_accuracy))  # Store C and its mean accuracy
    
    if avg_accuracy > highest_accuracy:
        highest_accuracy = avg_accuracy
        best_C = C

print(f"Best C: {best_C} with highest accuracy: {highest_accuracy}")

# Print mean accuracy for each C
for C, avg_acc in all_accuracies:
    print(f"C = {C}: Mean Accuracy = {avg_acc}")

**Extracted from ChatGPT:** 

The C value in the context of Support Vector Machines (SVM) and many other machine learning models acts as a regularization parameter. Specifically, in SVMs, C plays a crucial role in determining the trade-off between achieving a low training error and maintaining a low complexity of the model, which, in essence, affects the model's generalization to unseen data.

When C is small (C=0.1): The model increases the regularization strength, which encourages the model to be simpler, potentially at the cost of misclassifying more training examples. A smaller C value leads to a wider margin in SVM decision boundaries but allows more misclassifications. This can be beneficial for datasets with noisy observations and can help prevent overfitting, as it forces the model to focus on the broader trend rather than fitting to the noise.

When C is large: It signifies a lower regularization strength, prioritizing a lower training error, even if it means a more complex model boundary. A larger C tries to classify all training examples correctly, which might lead to a narrower margin if the data is linearly separable or not very noisy. However, this can make the model more sensitive to outliers and potentially lead to overfitting, especially if the data has a lot of noise or if the feature space is very high-dimensional.

# Black box implementation

## kernel = linear

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define the features and target variable
X = data.drop(columns=['Approved_Conversion_Binary'])
y = data['Approved_Conversion_Binary']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the SVM model
svm_model = SVC(kernel='linear') 

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 58.31%


In [12]:
# Detailed classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.48      0.53       168
           1       0.58      0.69      0.63       175

    accuracy                           0.58       343
   macro avg       0.58      0.58      0.58       343
weighted avg       0.58      0.58      0.58       343



## Cross-validation

In [13]:
from sklearn.model_selection import cross_val_score

# Define the features and target variable
X = data.drop(columns=['Approved_Conversion_Binary'])
y = data['Approved_Conversion_Binary']

# Initialize the SVM model
svm_model = SVC(kernel='linear')

scores = cross_val_score(svm_model, X, y, cv=5, scoring='accuracy')

print(f'Cross-Validation Accuracy Scores: {scores}')
print(f'Average Accuracy: {scores.mean() * 100:.2f}%')
print(f'Standard Deviation: {scores.std() * 100:.2f}%')


Cross-Validation Accuracy Scores: [0.51091703 0.54585153 0.86899563 0.51754386 0.53070175]
Average Accuracy: 59.48%
Standard Deviation: 13.76%


In [14]:
from sklearn.model_selection import cross_validate

# Define the features and target variable
X = data.drop(columns=['Approved_Conversion_Binary'])
y = data['Approved_Conversion_Binary']

# Initialize the SVM model
svm_model = SVC(kernel='linear')

# Perform cross-validation
cv_results = cross_validate(svm_model, X, y, cv=5, return_estimator=True)

# Iterate over each fold and print the classification report
for i, estimator in enumerate(cv_results['estimator']):
    y_pred = estimator.predict(X)
    print(f'Fold {i + 1} Classification Report:')
    print(classification_report(y, y_pred))


Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.64      0.63       559
           1       0.64      0.63      0.64       584

    accuracy                           0.63      1143
   macro avg       0.63      0.63      0.63      1143
weighted avg       0.63      0.63      0.63      1143

Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.54      0.57       559
           1       0.61      0.67      0.64       584

    accuracy                           0.61      1143
   macro avg       0.61      0.61      0.61      1143
weighted avg       0.61      0.61      0.61      1143

Fold 3 Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.50      0.56       559
           1       0.60      0.71      0.65       584

    accuracy                           0.61      1143
   macro avg       0.61      0.61     