In [14]:
import pandas as pd
from pyTsetlinMachine.tm import MultiClassTsetlinMachine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin

# class to use with GridSearchCV
class TsetlinMachineClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn compatible wrapper around ``MultiClassTsetlinMachine``.

    The wrapper exists so the Tsetlin Machine can be tuned with tools such as
    ``GridSearchCV``, which require ``get_params``/``set_params`` (inherited
    from ``BaseEstimator``) plus ``fit``/``predict``.

    Parameters
    ----------
    number_of_clauses, T, s :
        Forwarded unchanged to ``MultiClassTsetlinMachine`` when ``fit`` is
        called.
    epochs : int, default=50
        Number of training epochs passed to the underlying ``fit`` call.

    Attributes
    ----------
    tm : MultiClassTsetlinMachine
        The underlying model. Only present after ``fit`` has been called.

    Notes
    -----
    NOTE(review): pyTsetlinMachine is documented as operating on binary
    features; confirm that callers binarize ``X`` before fitting.
    """

    def __init__(self, number_of_clauses=10, T=15, s=3.9, epochs=50):
        # Store hyperparameters only. GridSearchCV clones the estimator and
        # then applies each candidate through set_params(), which merely
        # rewrites these attributes; a model constructed here would therefore
        # keep the default hyperparameters for every candidate.
        self.number_of_clauses = number_of_clauses
        self.T = T
        self.s = s
        self.epochs = epochs

    def fit(self, X, y):
        """Build a fresh Tsetlin Machine from the current parameters and train it.

        Returns ``self`` so calls can be chained, as scikit-learn expects.
        """
        # Built at fit time so the values set via set_params() are honoured and
        # repeated fits never continue from previously learned state.
        self.tm = MultiClassTsetlinMachine(
            number_of_clauses=self.number_of_clauses,
            T=self.T,
            s=self.s,
        )
        self.tm.fit(X, y, epochs=self.epochs)
        return self

    def predict(self, X):
        """Return the class predicted by the trained model for each row of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if not hasattr(self, "tm"):
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet; "
                "call 'fit' before using 'predict'."
            )
        return self.tm.predict(X)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        return accuracy_score(y, self.predict(X))

# Loading the dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
dataset = pd.read_csv(url)

# Preprocessing the dataset
#   Cleaning: drop the columns that are not used as features
dataset.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

#   Imputing missing values
# A single DataFrame-level fillna is used instead of
# `dataset[col].fillna(..., inplace=True)`: the latter is chained in-place
# assignment, which is deprecated in pandas 2.x and silently does nothing
# under copy-on-write, leaving the NaNs in place.
dataset = dataset.fillna({
    'Age': dataset['Age'].median(),          # median of the Age column
    'Embarked': dataset['Embarked'].mode()[0],  # most frequent Embarked value
    'Fare': dataset['Fare'].median(),        # median of the Fare column
})

# Converting categorical variables to integer codes; the fitted encoders are
# kept so the codes can be mapped back to the original labels later.
label_encoders = {}
for column in ['Sex', 'Embarked']:
    label_encoders[column] = LabelEncoder()
    dataset[column] = label_encoders[column].fit_transform(dataset[column])

# Features / target variable
X = dataset.drop('Survived', axis=1)
y = dataset['Survived']

#   Standardization of the continuous columns
# NOTE(review): the scaler (like the imputation statistics above) is fitted on
# the full dataset before the train/test split, so test-set statistics leak
# into training. Consider fitting on the training portion only.
scaler = StandardScaler()
X[['Age', 'Fare']] = scaler.fit_transform(X[['Age', 'Fare']])

#   Data splitting: 80% train / 20% test, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Converting to numpy arrays
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

# Hyperparameter tuning: exhaustive search over every combination below,
# each scored by 3-fold cross-validated accuracy on the training split.
param_grid = dict(
    number_of_clauses=[10, 20, 30],
    T=[10, 15, 20],
    s=[2.5, 3.9, 5.0],
)
grid_search = GridSearchCV(
    TsetlinMachineClassifier(),
    param_grid,
    cv=3,
    scoring='accuracy',
)
# fit() returns the search object itself, so the winning combination can be
# read straight off the result.
best_params = grid_search.fit(X_train, y_train).best_params_

# Metrics reported for every model, in the order they are printed.
_METRICS = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)


def _report_metrics(heading, y_true, y_pred):
    """Print ``heading`` and one ``label: value`` line per metric.

    Returns a dict mapping each metric label to its score (in ``_METRICS``
    order) so the final comparison can reuse the values instead of
    recomputing them.
    """
    scores = {label: metric(y_true, y_pred) for label, metric in _METRICS}
    print(heading)
    for label, value in scores.items():
        print(f"{label}: {value}")
    return scores


def _print_comparison(results):
    """Print one line per metric listing every model's score side by side.

    ``results`` maps a short model tag (e.g. ``"TM"``) to the dict returned by
    ``_report_metrics``; models appear in insertion order.
    """
    print("\nAnalysis:")
    print("The Tsetlin Machine showed the following results compared to other models:")
    for label, _ in _METRICS:
        summary = ", ".join(f"{tag}={scores[label]}" for tag, scores in results.items())
        print(f"{label}: {summary}")


# Scores of every evaluated model, keyed by the tag used in the comparison.
results = {}

# Initializing and training Tsetlin Machine with the tuned hyperparameters
tm = TsetlinMachineClassifier(**best_params)
tm.fit(X_train, y_train)

# evaluating Tsetlin Machine
y_pred_tm = tm.predict(X_test)
results["TM"] = _report_metrics("Tsetlin Machine Performance:", y_test, y_pred_tm)

# evaluating Logistic regression model
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
results["LR"] = _report_metrics("\nLogistic Regression Performance:", y_test, y_pred_lr)

# evaluating Random forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results["RF"] = _report_metrics("\nRandom Forest Performance:", y_test, y_pred_rf)

# evaluating SVM model
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
results["SVM"] = _report_metrics("\nSVM Performance:", y_test, y_pred_svm)

# evaluating Gradient boosting model
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
results["GB"] = _report_metrics("\nGradient Boosting Performance:", y_test, y_pred_gb)

# results: side-by-side comparison built from the scores computed above
_print_comparison(results)




Tsetlin Machine Performance:
Accuracy: 0.7821229050279329
Precision: 0.7272727272727273
Recall: 0.7567567567567568
F1-Score: 0.7417218543046358

Logistic Regression Performance:
Accuracy: 0.8100558659217877
Precision: 0.7857142857142857
Recall: 0.7432432432432432
F1-Score: 0.7638888888888888

Random Forest Performance:
Accuracy: 0.8156424581005587
Precision: 0.7971014492753623
Recall: 0.7432432432432432
F1-Score: 0.7692307692307693

SVM Performance:
Accuracy: 0.8156424581005587
Precision: 0.8059701492537313
Recall: 0.7297297297297297
F1-Score: 0.7659574468085106

Gradient Boosting Performance:
Accuracy: 0.8100558659217877
Precision: 0.8125
Recall: 0.7027027027027027
F1-Score: 0.7536231884057971

Analysis:
The Tsetlin Machine showed the following results compared to other models:
Accuracy: TM=0.7821229050279329, LR=0.8100558659217877, RF=0.8156424581005587, SVM=0.8156424581005587, GB=0.8100558659217877
Precision: TM=0.7272727272727273, LR=0.7857142857142857, RF=0.7971014492753623, SVM=0