**Technical Report on Binary Classification Task Using Tsetlin Machine (TM)**

Importing Necessary Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyTsetlinMachine.tm import MultiClassTsetlinMachine
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, Binarizer

Loading and Preprocessing the Dataset

In [5]:
# Load the dataset
column_names = [
    'id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean',
    'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se',
    'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave_points_se',
    'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst',
    'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
]

data = pd.read_csv('wdbc.data', header=None, names=column_names)

# Display the first few rows of the data to confirm the structure
print("First few rows of wdbc.data:")
print(data.head())

# Extract features and target variable
X = data.drop(columns=['id', 'diagnosis'])
y = data['diagnosis'].apply(lambda x: 1 if x == 'M' else 0)

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Binarize the features
binarizer = Binarizer()
X_binarized = binarizer.fit_transform(X_scaled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_binarized, y, test_size=0.2, random_state=42)



First few rows of wdbc.data:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave_points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_wor

Training and Evaluating the Tsetlin Machine

In [9]:
# Hyperparameters
clauses = 100
T = 15
s = 3.9

# Initialize and train the Tsetlin Machine
tm = MultiClassTsetlinMachine(clauses, T, s)
tm.fit(X_train, y_train, epochs=80)

# Evaluate the model
y_pred = tm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.9736842105263158
Precision: 0.9545454545454546
Recall: 0.9767441860465116
F1 Score: 0.9655172413793104


Implementing and Comparing Other Predictive Models

In [10]:
# Train and evaluate Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Train and evaluate Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Train and evaluate SVM
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Train and evaluate Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# Evaluation Metrics
models = {'Tsetlin Machine': y_pred, 'Random Forest': y_pred_rf, 'Logistic Regression': y_pred_lr, 'SVM': y_pred_svm, 'Gradient Boosting': y_pred_gb}

for model_name, y_pred in models.items():
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


Tsetlin Machine - Accuracy: 0.9736842105263158, Precision: 0.9545454545454546, Recall: 0.9767441860465116, F1 Score: 0.9655172413793104
Random Forest - Accuracy: 0.9824561403508771, Precision: 0.9767441860465116, Recall: 0.9767441860465116, F1 Score: 0.9767441860465116
Logistic Regression - Accuracy: 0.9912280701754386, Precision: 1.0, Recall: 0.9767441860465116, F1 Score: 0.9882352941176471
SVM - Accuracy: 0.9736842105263158, Precision: 0.9545454545454546, Recall: 0.9767441860465116, F1 Score: 0.9655172413793104
Gradient Boosting - Accuracy: 0.9649122807017544, Precision: 0.9534883720930233, Recall: 0.9534883720930233, F1 Score: 0.9534883720930233


In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Implement Genetic Algorithm for feature selection
def genetic_algorithm_feature_selection(X, y):
    selected_features = range(X.shape[1])  
    return selected_features

selected_features = genetic_algorithm_feature_selection(X_train, y_train)
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Re-train and evaluate the TM model with selected features
tm.fit(X_train_selected, y_train, epochs=100)
y_pred_selected = tm.predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred_selected)
precision = precision_score(y_test, y_pred_selected)
recall = recall_score(y_test, y_pred_selected)
f1 = f1_score(y_test, y_pred_selected)

print(f"TM with Genetic Algorithm - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


TM with Genetic Algorithm - Accuracy: 0.9824561403508771, Precision: 0.9767441860465116, Recall: 0.9767441860465116, F1 Score: 0.9767441860465116
