<a href="https://colab.research.google.com/github/NITHIN-KANDI/PR_PROJECT/blob/main/hmm_smoothing_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Define the HMM model components with Laplace smoothing
class HMM:
    def __init__(self, n_states, n_observations):
        self.n_states = n_states  # Number of hidden states
        self.n_observations = n_observations  # Number of possible observations

        # Initialize the transition probabilities (states to states)
        self.transition_probs = np.full((n_states, n_states), 1.0 / n_states)

        # Initialize the emission probabilities (states to observations)
        self.emission_probs = np.full((n_states, n_observations), 1.0 / n_observations)

        # Initialize start probabilities
        self.start_probs = np.full(n_states, 1.0 / n_states)

    def train(self, sequences, state_sequences, alpha=1.0):
        # Calculate start probabilities with smoothing
        for state_seq in state_sequences:
            self.start_probs[state_seq[0]] += 1
        self.start_probs += alpha  # Laplace smoothing
        self.start_probs /= self.start_probs.sum()

        # Calculate transition probabilities with smoothing
        for state_seq in state_sequences:
            for i in range(len(state_seq) - 1):
                self.transition_probs[state_seq[i], state_seq[i + 1]] += 1
        self.transition_probs += alpha  # Laplace smoothing
        self.transition_probs /= self.transition_probs.sum(axis=1, keepdims=True)

        # Calculate emission probabilities with smoothing
        for seq, state_seq in zip(sequences, state_sequences):
            for obs, state in zip(seq, state_seq):
                self.emission_probs[state, obs] += 1
        self.emission_probs += alpha  # Laplace smoothing
        self.emission_probs /= self.emission_probs.sum(axis=1, keepdims=True)

    def viterbi(self, sequence):
        # Initialize matrices
        T = len(sequence)
        viterbi_probs = np.zeros((self.n_states, T))
        backpointer = np.zeros((self.n_states, T), dtype=int)

        # Initialize the first column
        for s in range(self.n_states):
            viterbi_probs[s, 0] = self.start_probs[s] * self.emission_probs[s, sequence[0]]
            backpointer[s, 0] = 0

        # Fill the Viterbi matrix
        for t in range(1, T):
            for s in range(self.n_states):
                probabilities = viterbi_probs[:, t - 1] * self.transition_probs[:, s] * self.emission_probs[s, sequence[t]]
                viterbi_probs[s, t] = np.max(probabilities)
                backpointer[s, t] = np.argmax(probabilities)

        # Backtrace to get the most probable state sequence
        best_path = np.zeros(T, dtype=int)
        best_path[-1] = np.argmax(viterbi_probs[:, T - 1])
        for t in range(T - 2, -1, -1):
            best_path[t] = backpointer[best_path[t + 1], t + 1]

        return best_path


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Load filtered datasets
train_data = pd.read_csv('/content/filtered_test_data.csv')
test_data = pd.read_csv('/content/filtered_test_data.csv')

# Define state mapping and vocabulary
state_mapping = {'truth': 0, 'partial truth': 1, 'partial lie': 2, 'lie': 3}
n_states = len(state_mapping)

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and stem
    tokens = [stemmer.stem(word) for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    return tokens

# Apply preprocessing and create vocabulary
train_data['tokens'] = train_data['statement'].apply(preprocess_text)
all_tokens = [token for tokens in train_data['tokens'] for token in tokens]
vocab = list(set(all_tokens))
word_to_index = {word: i for i, word in enumerate(vocab)}
n_observations = len(vocab)

# Encode tokens and labels
def encode_tokens(tokens):
    return [word_to_index.get(token, -1) for token in tokens if token in word_to_index]

encoded_train_sequences = [encode_tokens(tokens) for tokens in train_data['tokens']]
encoded_state_sequences = [[state_mapping[label]] for label in train_data['label']]


In [22]:
# Initialize and train the HMM with smoothing
hmm_model = HMM(n_states=n_states, n_observations=n_observations)
hmm_model.train(encoded_train_sequences, encoded_state_sequences, alpha=1.0)  # Alpha can be adjusted for smoothing
print("HMM model training completed.")


HMM model training completed.


In [23]:
def classify_sentence(sentence, hmm_model, word_to_index, state_mapping):
    # Tokenize and encode the sentence
    tokens = preprocess_text(sentence)
    encoded_tokens = [word_to_index.get(token, -1) for token in tokens if token in word_to_index]
    if not encoded_tokens:
        return "Unknown"  # If no tokens are recognized

    # Use the Viterbi algorithm to find the best state path
    best_path = hmm_model.viterbi(encoded_tokens)
    return list(state_mapping.keys())[best_path[0]]

# Test with a sample sentence
sample_sentence = "This is an entirely true statement."
classification = classify_sentence(sample_sentence, hmm_model, word_to_index, state_mapping)
print(f"The sentence classification is: {classification}")


The sentence classification is: truth


In [25]:
# Preprocess the test set
test_data['tokens'] = test_data['statement'].apply(preprocess_text)
encoded_test_sequences = [encode_tokens(tokens) for tokens in test_data['tokens']]
true_labels = [state_mapping[label] for label in test_data['label']]
# Predict labels using the model, with error handling for empty sequences
most_common_label = max(state_mapping, key=list(state_mapping.values()).count)  # Default label if needed
predicted_labels = []

for seq in encoded_test_sequences:
    if len(seq) > 0:  # Check if the sequence is non-empty
        predicted_state = hmm_model.viterbi(seq)[0]
    else:
        predicted_state = state_mapping[most_common_label]  # Default label for empty sequences
    predicted_labels.append(predicted_state)




# Evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.52
Precision: 0.62
Recall: 0.52
F1 Score: 0.47
