<a href="https://colab.research.google.com/github/Rome0607/Tsetlin-machine-project/blob/main/TsetlinMachine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyTsetlinMachine
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
pip install pyTsetlinMachine
import nltk
nltk.download('punkt')

import pandas as pd
import re
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyTsetlinMachine.tm import MultiClassTsetlinMachine
import numpy as np
from imblearn.over_sampling import SMOTE
from joblib import Parallel, delayed
import multiprocessing

# Build the slang -> formal translation lookup table from the CSV file.
slang_df = pd.read_csv("final_Uk_slang.csv")
slang_dict = dict(zip(slang_df["slang"], slang_df["formal_translation"]))

# Define text cleaning function
def clean_text(text):
    """Return *text* lowercased, without URLs or punctuation.

    The steps run in this order: lowercase, remove every ``http...`` run up
    to the next whitespace, delete every character that is not ``a-z``,
    ``0-9`` or whitespace, and finally trim leading/trailing whitespace.
    Interior whitespace is left untouched.
    """
    cleaned = text.lower()
    # URLs must go first: stripping punctuation first would mangle them into
    # ordinary-looking words that the URL pattern could no longer remove whole.
    for pattern in (r"http\S+", r"[^a-z0-9\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Define normalization function with refined handling
def normalize_text(text, slang_dict):
    """Replace each whitespace-separated word of *text* found in *slang_dict*.

    Words without an entry are kept unchanged. The result is re-joined with
    single spaces, so any run of whitespace in the input collapses to one
    space and leading/trailing whitespace disappears.
    """
    return ' '.join(slang_dict.get(token, token) for token in text.split())

# Load your sentiment dataset
data = pd.read_csv("sentiment_data.csv")  # Assuming you have a CSV file with columns 'text' and 'sentiment'

# Clean the raw text, then swap slang words for their formal translation
data['cleaned_text'] = data['text'].map(clean_text)
data['normalized_text'] = data['cleaned_text'].apply(normalize_text, args=(slang_dict,))

# Tokenize the normalized text, one joblib task per row
tokenize_jobs = (delayed(word_tokenize)(sentence) for sentence in data['normalized_text'])
data['tokens'] = Parallel(n_jobs=multiprocessing.cpu_count())(tokenize_jobs)

# Number of tokens kept per text: longer token sequences are truncated and
# shorter ones are padded with '' so every row has the same length.
fixed_length = 10

# Pad or truncate token sequences to the fixed length
def pad_or_truncate(tokens, length):
    """Return a copy of *tokens* that is exactly *length* items long.

    Sequences longer than *length* are cut to their first *length* items;
    shorter ones are right-padded with empty strings.
    """
    shortfall = length - len(tokens)
    if shortfall < 0:
        return tokens[:length]
    return tokens + [''] * shortfall

# Bring every token list to exactly `fixed_length` entries.
data['padded_tokens'] = data['tokens'].apply(pad_or_truncate, args=(fixed_length,))

# One-hot encode tokens with improved encoding method
def one_hot_encode(tokens, encoder=None):
    """Encode *tokens* with *encoder*, one single-feature sample per token.

    Args:
        tokens: Iterable of tokens; each one is wrapped in its own
            one-element list before being handed to the encoder.
        encoder: Object exposing ``transform`` (the script passes a fitted
            ``OneHotEncoder``). It is required; the ``None`` default is kept
            only so the signature stays backward-compatible.

    Returns:
        Whatever ``encoder.transform`` returns for the column of tokens.

    Raises:
        ValueError: If *encoder* is ``None``.
    """
    if encoder is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object has no attribute 'transform'".
        raise ValueError("one_hot_encode() requires an encoder; got None")
    return encoder.transform([[token] for token in tokens])

# Initialize OneHotEncoder with dense output.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit encoder on the entire vocabulary. The '' padding token is left out, so
# handle_unknown='ignore' encodes padding positions as all-zero rows.
vocab = sorted({token for sublist in data['padded_tokens'] for token in sublist if token})
encoder.fit([[token] for token in vocab])

# Apply one-hot encoding to all tokenized texts, one joblib task per row
data['binary_features'] = Parallel(n_jobs=multiprocessing.cpu_count())(
    delayed(one_hot_encode)(tokens, encoder) for tokens in data['padded_tokens']
)

# Flatten each per-text matrix into one row and stack the rows into X.
# A real list is handed to np.vstack rather than a pandas Series.
X = np.vstack([features.flatten() for features in data['binary_features']])
y = data['sentiment'].values  # Ensure this is numeric, e.g., 0 for negative, 1 for positive

# Initialize StratifiedKFold (seeded shuffle keeps the folds reproducible)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to train and evaluate model on a fold
def train_and_evaluate(train_index, test_index, X, y, encoder, data):
    """Train a Tsetlin Machine on one cross-validation fold and score it.

    Args:
        train_index: Positions of the training rows in ``X`` and ``y``.
        test_index: Positions of the held-out rows in ``X``, ``y`` and ``data``.
        X: Feature matrix, indexed with the fold index arrays.
        y: Class labels aligned with ``X``.
        encoder: Unused; kept so existing callers do not have to change.
        data: DataFrame providing the ``normalized_text`` column.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, translated_texts, y_pred,
        test_index)``. Precision, recall and F1 are weighted averages;
        ``translated_texts`` holds the normalized text of the test rows.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Balance classes using SMOTE on the training split only, so the test
    # fold is never resampled. `n_jobs` is deliberately not passed: it is
    # deprecated in imbalanced-learn, does not change the resampled data, and
    # would oversubscribe cores when folds already run in parallel.
    # NOTE(review): SMOTE interpolates between samples, so synthetic rows may
    # not be strictly 0/1 - confirm that is acceptable Tsetlin Machine input.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Train the model with optimized parameters
    tm = MultiClassTsetlinMachine(
        number_of_clauses=500,  # Adjust the number of clauses
        T=50,                   # Adjust the threshold
        s=10.0                  # Adjust specificity
    )
    tm.fit(X_resampled, y_resampled, epochs=30)  # Adjust epochs if needed

    # Predict on the test data
    y_pred = tm.predict(X_test)

    # Get the translated results (normalized text)
    translated_texts = data.iloc[test_index]['normalized_text'].values

    # Calculate metrics for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return accuracy, precision, recall, f1, translated_texts, y_pred, test_index

# Run every cross-validation fold as its own joblib task
fold_jobs = (
    delayed(train_and_evaluate)(train_index, test_index, X, y, encoder, data)
    for train_index, test_index in skf.split(X, y)
)
results = Parallel(n_jobs=multiprocessing.cpu_count())(fold_jobs)

# Transpose the per-fold result tuples into per-metric sequences and report means
accuracy_list, precision_list, recall_list, f1_list, all_translated_texts, all_predictions, all_test_indices = zip(*results)
print(f"Average Accuracy: {np.mean(accuracy_list)}")
print(f"Average Precision: {np.mean(precision_list)}")
print(f"Average Recall: {np.mean(recall_list)}")
print(f"Average F1 Score: {np.mean(f1_list)}")

# Print, fold by fold, the original text, its normalized form and the predicted label
per_fold_outputs = zip(all_translated_texts, all_predictions, all_test_indices)
for fold_number, (fold_texts, fold_predictions, fold_rows) in enumerate(per_fold_outputs, start=1):
    print(f"Fold {fold_number} Translations and Sentiment Predictions:")
    for translated, predicted, row_position in zip(fold_texts, fold_predictions, fold_rows):
        # Any prediction other than 1 is reported as 'Negative'.
        sentiment_label = 'Positive' if predicted == 1 else 'Negative'
        print(f"Original: {data.iloc[row_position]['text']}")
        print(f"Translated: {translated}")
        print(f"Predicted Sentiment: {sentiment_label}")
        print("------")




Average Accuracy: 0.8959999999999999
Average Precision: 0.9004586495083517
Average Recall: 0.8959999999999999
Average F1 Score: 0.8954229022459149
Fold 1 Translations and Sentiment Predictions:
Original: I'm feeling knackered, mate! What a dodgy day.
Translated: im feeling tired friend what a suspicious day
Predicted Sentiment: Negative
------
Original: This nosh is absolutely scrummy!
Translated: this food is absolutely delicious
Predicted Sentiment: Positive
------
Original: He's a proper bloke, always helping out.
Translated: hes a proper man always helping out
Predicted Sentiment: Negative
------
Original: I'm feeling a bit off-colour today, might be coming down with something.
Translated: im feeling a bit offcolour today might be coming down with something
Predicted Sentiment: Negative
------
Original: The service was naff, definitely not going back there.
Translated: the service was uncool definitely not going back there
Predicted Sentiment: Negative
------
Original: The plan wen