In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import StackingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


### Creating

In [18]:


# Load the dataset
df = pd.read_csv("/auction_verification.csv")

# Create new features
df["bid_start_ratio"] = (df["process.b1.capacity"]/df["property.price"])


# Scale the features
scaler = StandardScaler()
X = scaler.fit_transform(df.drop("verification.result", axis=1))
y = df["verification.result"]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.176, random_state=42)









In [19]:
# Train a baseline model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
acc = accuracy_score(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)
print(f"Baseline model accuracy: {acc}")
print(f"Baseline model confusion matrix: \n{cm}")

# Perform hyperparameter tuning
param_grid = {"penalty": ["l1", "l2"], "C": [0.01, 0.1, 1, 10, 100], "solver": ["liblinear", "lbfgs", "sag", "saga"]}
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)
print(f"Best model parameters: {best_params}")
print(f"Best model accuracy: {acc}")
print(f"Best model confusion matrix: \n{cm}")

# Train different classifiers

Baseline model accuracy: 0.8660130718954249
Baseline model confusion matrix: 
[[254   3]
 [ 38  11]]




Best model parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best model accuracy: 0.8758169934640523
Best model confusion matrix: 
[[254   3]
 [ 35  14]]


50 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solv

In [20]:
# Stacking classifier
estimators = [("dt", DecisionTreeClassifier()), ("knn", KNeighborsClassifier()), ("svm", SVC())]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"Stacking classifier accuracy: {acc}")
print(f"Stacking classifier confusion matrix: \n{cm}")

Stacking classifier accuracy: 0.9771986970684039
Stacking classifier confusion matrix: 
[[259   2]
 [  5  41]]


In [22]:
# Boosting classifier
boosting = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100)
boosting.fit(X_train, y_train)
y_pred = boosting.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"Boosting classifier accuracy: {acc}")
print(f"Boosting classifier confusion matrix: \n{cm}")

Boosting classifier accuracy: 0.9837133550488599
Boosting classifier confusion matrix: 
[[260   1]
 [  4  42]]




In [24]:
# Bagging classifier
bagging = RandomForestClassifier(n_estimators=100)
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"Bagging classifier accuracy: {acc}")
print(f"Bagging classifier confusion matrix: \n{cm}")

Bagging classifier accuracy: 0.993485342019544
Bagging classifier confusion matrix: 
[[261   0]
 [  2  44]]
