# Viterbi Encoding and HMM Implementation

### Task 1 : Creating vocab.txt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

import json
from collections import Counter

In [None]:
data_directory = "/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/"

In [None]:
import json
from collections import Counter

# Define the path to the JSON training data file
train_file = "/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/train.json"
# Define the threshold for rare words
threshold = 2

# Initialize a Counter to keep track of word frequencies
word_counts = Counter()

# Read the JSON training data file
with open(train_file, "r", encoding="utf-8") as file:
    training_data = json.load(file)

# Extract words from the "sentence" field of each JSON object and update word counts
for entry in training_data:
    words = entry["sentence"]
    word_counts.update(words)

# Calculate the cumulative frequency for words that don't meet the threshold
unk_frequency = sum(freq for word, freq in word_counts.items() if freq < threshold)

# Initialize the vocabulary with the special token "< unk >" and assign it the cumulative frequency
vocabulary = {"< unk >": unk_frequency}

# Create a list of (word, frequency) pairs for words that meet the threshold
word_freq_pairs = [(word, freq) for word, freq in word_counts.items() if freq >= threshold]

# Sort the list of word frequency pairs by frequency in descending order
word_freq_pairs.sort(key=lambda x: x[1], reverse=True)

# Assign an index to each word based on its frequency, starting from 1
index = 1
for word, freq in word_freq_pairs:
    vocabulary[word] = index
    index += 1

# Save the vocabulary to the "vocab.txt" file
with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/vocab.txt", "w", encoding="utf-8") as vocab_file:
    # Write "< unk >" as the first line with its cumulative frequency
    vocab_file.write(f"< unk > 0 {unk_frequency}\n")
    # Write the sorted vocabulary, excluding "< unk >"
    for word, index in vocabulary.items():
        if word != "< unk >":
            freq = word_counts.get(word, 0)
            vocab_file.write(f"{word} {index} {freq}\n")

In [None]:
len(vocabulary)

23183

### Task 2 - Model Learning from HMM

In [None]:
import numpy as np

In [None]:
# Collect states and words from the training data
states = set()
words = set()
for entry in training_data:
    states.update(entry["labels"])
    words.update(entry["sentence"])


In [None]:

# Create dictionaries to store counts and probabilities
transition_counts = {s1: {s2: 0 for s2 in states} for s1 in states}
emission_counts = {s: {w: 0 for w in words} for s in states}
state_counts = {s: 0 for s in states}
print(transition_counts)


{'.': {'.': 0, 'NNP': 0, "''": 0, 'VBP': 0, 'POS': 0, '-RRB-': 0, 'FW': 0, 'CD': 0, 'SYM': 0, 'IN': 0, 'RB': 0, 'DT': 0, 'PRP$': 0, ',': 0, 'WP': 0, 'TO': 0, 'LS': 0, ':': 0, '-LRB-': 0, 'WDT': 0, '``': 0, 'JJS': 0, 'NNPS': 0, 'CC': 0, 'RBR': 0, 'VBD': 0, 'UH': 0, 'PDT': 0, 'NN': 0, 'MD': 0, 'PRP': 0, 'EX': 0, 'VB': 0, '#': 0, 'JJR': 0, '$': 0, 'VBZ': 0, 'RP': 0, 'VBN': 0, 'JJ': 0, 'VBG': 0, 'RBS': 0, 'WP$': 0, 'NNS': 0, 'WRB': 0}, 'NNP': {'.': 0, 'NNP': 0, "''": 0, 'VBP': 0, 'POS': 0, '-RRB-': 0, 'FW': 0, 'CD': 0, 'SYM': 0, 'IN': 0, 'RB': 0, 'DT': 0, 'PRP$': 0, ',': 0, 'WP': 0, 'TO': 0, 'LS': 0, ':': 0, '-LRB-': 0, 'WDT': 0, '``': 0, 'JJS': 0, 'NNPS': 0, 'CC': 0, 'RBR': 0, 'VBD': 0, 'UH': 0, 'PDT': 0, 'NN': 0, 'MD': 0, 'PRP': 0, 'EX': 0, 'VB': 0, '#': 0, 'JJR': 0, '$': 0, 'VBZ': 0, 'RP': 0, 'VBN': 0, 'JJ': 0, 'VBG': 0, 'RBS': 0, 'WP$': 0, 'NNS': 0, 'WRB': 0}, "''": {'.': 0, 'NNP': 0, "''": 0, 'VBP': 0, 'POS': 0, '-RRB-': 0, 'FW': 0, 'CD': 0, 'SYM': 0, 'IN': 0, 'RB': 0, 'DT': 0, 'PRP$'

In [None]:
print(emission_counts)
# print(state_counts)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:

print(state_counts)

{'.': 0, 'NNP': 0, "''": 0, 'VBP': 0, 'POS': 0, '-RRB-': 0, 'FW': 0, 'CD': 0, 'SYM': 0, 'IN': 0, 'RB': 0, 'DT': 0, 'PRP$': 0, ',': 0, 'WP': 0, 'TO': 0, 'LS': 0, ':': 0, '-LRB-': 0, 'WDT': 0, '``': 0, 'JJS': 0, 'NNPS': 0, 'CC': 0, 'RBR': 0, 'VBD': 0, 'UH': 0, 'PDT': 0, 'NN': 0, 'MD': 0, 'PRP': 0, 'EX': 0, 'VB': 0, '#': 0, 'JJR': 0, '$': 0, 'VBZ': 0, 'RP': 0, 'VBN': 0, 'JJ': 0, 'VBG': 0, 'RBS': 0, 'WP$': 0, 'NNS': 0, 'WRB': 0}


In [None]:

# Count state transitions and emissions
for entry in training_data:
    sentence = entry["sentence"]
    labels = entry["labels"]

    for i in range(len(labels)):
        state = labels[i]
        word = sentence[i]
        state_counts[state] += 1
        emission_counts[state][word] += 1

        if i < len(labels) - 1:
            next_state = labels[i + 1]
            transition_counts[state][next_state] += 1


In [None]:
# Initialize dictionaries to store transition and emission parameters
from collections import defaultdict
transition_params = defaultdict(float)  # Initialize with float for probabilities
emission_params = defaultdict(float)    # Initialize with float for probabilities

# Initialize dictionaries to count state and state transition frequencies
state_counts = defaultdict(int)
state_transition_counts = defaultdict(lambda: defaultdict(int))

In [None]:

# Process training data and count state and state transition frequencies
for entry in training_data:
    sentence = entry["sentence"]
    labels = entry["labels"]

    # Add start state to the beginning of labels
    labels.insert(0, "<s>")

    for i in range(len(labels)):
        state = labels[i]
        state_counts[state] += 1

        # Count state transitions
        if i < len(labels) - 1:
            next_state = labels[i + 1]
            state_transition_counts[state][next_state] += 1



# Calculate transition probabilities
for state in state_transition_counts:
    total_transitions = sum(state_transition_counts[state].values())
    for next_state, count in state_transition_counts[state].items():
        transition_prob = count / total_transitions
        transition_params[(state, next_state)] = transition_prob

# Calculate emission probabilities
for entry in training_data:
    sentence = entry["sentence"]
    labels = entry["labels"]

    for state, word in zip(labels, sentence):
        emission_count = state_transition_counts[state][word]
        emission_prob = emission_count / state_counts[state]
        emission_params[(state, word)] = emission_prob


In [None]:
print(transition_params)


defaultdict(<class 'float'>, {('NNP', 'NNP'): 0.3789629683508051, ('NNP', ','): 0.13902552119795977, ('NNP', 'CD'): 0.019247729277096386, ('NNP', 'VBZ'): 0.039095529471664836, ('NNP', 'VBG'): 0.0017547528344258867, ('NNP', 'NN'): 0.057243128733395765, ('NNP', 'WDT'): 0.0009182903434042205, ('NNP', 'NNS'): 0.02436651604280506, ('NNP', 'IN'): 0.04147762917435697, ('NNP', 'CC'): 0.04135034140398411, ('NNP', 'POS'): 0.054861029030703626, ('NNP', '.'): 0.05469737332593852, ('NNP', 'VBD'): 0.06515315446370935, ('NNP', 'MD'): 0.01150135925154791, ('NNP', 'TO'): 0.007682726140362043, ('NNP', 'VBP'): 0.004318692209079255, ('NNP', ':'): 0.006891723567330685, ('NNP', 'RB'): 0.009128351532453836, ('NNP', 'JJ'): 0.008473728713393401, ('NNP', "''"): 0.002473019538672752, ('NNP', 'NNPS'): 0.01595643121459809, ('NNP', 'DT'): 0.002318455817505705, ('NNP', 'JJR'): 0.00010910380317673907, ('NNP', 'PRP'): 0.000709174720648804, ('NNP', 'VBN'): 0.000772818605835235, ('NNP', '-RRB-'): 0.003673161373616882, (

In [None]:


# Create the HMM model dictionary
hmm_model = {"transition": dict(transition_params), "emission": dict(emission_params)}

# Save the HMM model as a JSON file
with open("hmm.json", "w", encoding="utf-8") as model_file:
    json.dump(hmm_model, model_file, indent=4)


print(hmm_model)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



TypeError: ignored

##recechek task 2

In [None]:

import json
from collections import defaultdict
train = "/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/train.json"

transition_params = defaultdict(float)
emission_params = defaultdict(float)
initial_probabilities = defaultdict(float)

state = defaultdict(int)
state_transition = defaultdict(lambda: defaultdict(int))
state_emission = defaultdict(lambda: defaultdict(int))

# Read the training data
with open(train, "r", encoding="utf-8") as file:
    t = json.load(file)



In [None]:

# Processing training data to find count state and state transition frequenci

for entry in t:
    sentence = entry['sentence']
    labels = entry["labels"]

    # Extract the first label as the initial state
    initial_state = '$S'
    initial_probabilities[initial_state] += 1

    # for transition probab
    for i in range(len(labels)):
        state = labels[i]
        state_counts[state] += 1
        if i < len(labels) - 1:
            next_state = labels[i + 1]
            state_transition_counts[state][next_state] += 1

    print(len(sentence))
    print(len(labels))
    # for emission probab
    for i in range(len(sentence)):
        word = sentence[i]
        state = labels[i]
        state_emission_counts[state][word] += 1


print(state_transition_counts)
print(state_emission_counts)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
59
23
23
23
23
31
31
22
22
21
21
15
15
19
19
14
14
19
19
19
19
23
23
50
50
36
36
36
36
30
30
34
34
26
26
35
35
41
41
23
23
15
15
30
30
19
19
11
11
12
12
31
31
26
26
29
29
26
26
24
24
17
17
22
22
22
22
12
12
11
11
19
19
15
15
10
10
39
39
18
18
41
41
20
20
26
26
19
19
10
10
25
25
21
21
17
17
11
11
21
21
8
8
25
25
14
14
13
13
23
23
14
14
17
17
24
24
25
25
9
9
29
29
12
12
23
23
19
19
14
14
19
19
18
18
21
21
16
16
28
28
26
26
21
21
12
12
14
14
22
22
19
19
43
43
24
24
27
27
15
15
19
19
17
17
34
34
22
22
32
32
8
8
21
21
54
54
27
27
33
33
27
27
28
28
33
33
21
21
16
16
48
48
57
57
43
43
8
8
17
17
19
19
23
23
21
21
15
15
18
18
20
20
37
37
72
72
19
19
29
29
11
11
31
31
28
28
12
12
46
46
40
40
37
37
31
31
34
34
11
11
14
14
17
17
26
26
34
34
42
42
29
29
22
22
29
29
13
13
16
16
34
34
27
27
32
32
33
33
20
20
25
25
28
28
27
27
32
32
42
42
41
41
22
22
18
18
40
40
53
53
16
16
34
34
26
26
33
33
22
22
8
8
43
43
20
20
42
42
10
10
28
28
33
33


KeyboardInterrupt: ignored

In [None]:

# Calculate transition probabilities
for state in state_transition_counts:
    total_transitions = sum(state_transition_counts[state].values())
    for next_state, count in state_transition_counts[state].items():
        transition_prob = count / total_transitions
        transition_params[(state, next_state)] = transition_prob

transition_params

defaultdict(float,
            {('NNP', 'NNP'): 0.37875307160409166,
             ('NNP', ','): 0.1386479227384422,
             ('NNP', 'CD'): 0.0192010972055546,
             ('NNP', 'VBZ'): 0.039247957026115776,
             ('NNP', 'VBG'): 0.0017715298017029544,
             ('NNP', 'NN'): 0.057649008514772275,
             ('NNP', 'WDT'): 0.000982913309331962,
             ('NNP', 'NNS'): 0.024458540488027886,
             ('NNP', 'IN'): 0.04100805760329162,
             ('NNP', 'CC'): 0.041202354420252586,
             ('NNP', 'POS'): 0.05488313617921024,
             ('NNP', '.'): 0.054654551688667924,
             ('NNP', 'VBD'): 0.06488370764043659,
             ('NNP', 'MD'): 0.011177781587519286,
             ('NNP', 'TO'): 0.007543288187896451,
             ('NNP', 'VBP'): 0.00430881764672267,
             ('NNP', ':'): 0.007040402308703355,
             ('NNP', 'RB'): 0.009234813417909596,
             ('NNP', 'JJ'): 0.008503343048174182,
             ('NNP', "''"): 0.0025

In [None]:

# Calculate emission probabilities
for entry in training_data:
    sentence = entry["sentence"]
    labels = entry["labels"]
    for state, word in zip(labels, sentence):
        emission_count = state_emission_counts[state][word]
        emission_prob = emission_count / state_counts[state]
        emission_params[(state, word)] = emission_prob


emission_params

defaultdict(float,
            {('NNP', 'Pierre'): 6.84868961738654e-05,
             ('NNP', 'Vinken'): 2.2828965391288468e-05,
             (',', ','): 0.9999139414802065,
             ('CD', '61'): 0.0007168253240050465,
             ('NNS', 'years'): 0.019530237301024905,
             ('JJ', 'old'): 0.003613599348534202,
             ('MD', 'will'): 0.3138709335593939,
             ('VB', 'join'): 0.0015693044058221193,
             ('DT', 'the'): 0.5016439225642653,
             ('NN', 'board'): 0.0023287907538381922,
             ('IN', 'as'): 0.0353954283543342,
             ('DT', 'a'): 0.2341478895588702,
             ('JJ', 'nonexecutive'): 0.00010179153094462541,
             ('NN', 'director'): 0.002422883309548826,
             ('NNP', 'Nov.'): 0.0026709889507807506,
             ('CD', '29'): 0.0021218029590549374,
             ('.', '.'): 0.9886228651373967,
             ('NNP', 'Mr.'): 0.044014245274404167,
             ('VBZ', 'is'): 0.3208940997045086,
             ('

In [None]:

# Calculate initial probabilities
total_sentences = len(training_data)
for state, count in initial_probabilities.items():
    initial_probabilities[state] = count / total_sentences
initial_probabilities

defaultdict(float,
            {'NNP': 0.19789104610393007,
             'DT': 0.21911141347009264,
             'IN': 0.1288398137003506,
             'PRP': 0.06148935056779528,
             'EX': 0.004238840337013972,
             '``': 0.07472918520069077,
             'CD': 0.011225077188759224,
             'RBR': 0.0020932544874143074,
             'NNS': 0.041237113402061855,
             'NN': 0.0411847820398765,
             'JJ': 0.041708095661730074,
             'JJR': 0.0017007692710241248,
             'RB': 0.05604688890051808,
             'WRB': 0.00609660369459417,
             'CC': 0.05691035637657648,
             'VBG': 0.012010047621539588,
             'WDT': 0.0008111361138730441,
             'VBN': 0.005834946883667382,
             '-LRB-': 0.003427704223140928,
             'VB': 0.0030613846878434245,
             'WP': 0.003113716050028782,
             'PRP$': 0.007797372965618295,
             'TO': 0.0035323669475116437,
             'JJS': 0.00248573

In [None]:

# Convert keys to strings in transition_params and emission_params
transition_params = {str(key): value for key, value in transition_params.items()}
emission_params = {str(key): value for key, value in emission_params.items()}

# Create the HMM model dictionary
hmm_model = {
    "transition": dict(transition_params),
    "emission": dict(emission_params),
    "initial": dict(initial_probabilities)
}

# # Save the HMM model as a JSON file
# with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/hmm.json", "w", encoding="utf-8") as model_file:
#     json.dump(hmm_model, model_file, indent=4)

In [None]:
# Save the HMM model as a JSON file
with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/hmm.json", "w", encoding="utf-8") as model_file:
    json.dump(hmm_model, model_file, indent=4)

### Task 3 - Greedy Decoding of HMM

In [None]:
import json

# Load the HMM model
with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/hmm.json", "r", encoding="utf-8") as model_file:
    hmm_model = json.load(model_file)

# Extract transition, emission, and initial probabilities
transition_probs = hmm_model["transition"]
emission_probs = hmm_model["emission"]
initial_probs = hmm_model["initial"]

In [None]:
def greedy_decode(sentence, transition_probs, emission_probs, initial_probs):
    # Initialize the list to store the predicted tags
    predicted_tags = []

    # Calculate the set of all possible tags (S)
    S = set(initial_probs.keys())

    # Calculate the number of words in the sentence
    T = len(sentence)

    # Calculate the initial state probabilities for each tag
    initial_probabilities = {s: initial_probs[s] + emission_probs.get((s, sentence[0]), 1e-10) for s in S}

    # Initialize y1 based on the initial probabilities
    y1 = max(initial_probabilities, key=initial_probabilities.get)
    predicted_tags.append(y1)

    # Iterate from the second word to the last word
    for i in range(1, T):
        wi = sentence[i]
        yi_minus_1 = predicted_tags[i - 1]

        # Calculate the tag with the maximum probability for the current word
        max_probability = -float("inf")
        max_tag = None

        for s in S:
            transition_prob = transition_probs.get((s, yi_minus_1), 1e-10)
            emission_prob = emission_probs.get((s, wi), 1e-10)
            probability = transition_prob * emission_prob

            if probability > max_probability:
                max_probability = probability
                max_tag = s

        predicted_tags.append(max_tag)

    return predicted_tags


In [None]:
dev_file = "/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/dev.json"  # Replace with the path to your development data file

# Initialize a list to store predictions
predictions = []

# Load the development data
with open(dev_file, "r", encoding="utf-8") as file:
    dev_data = json.load(file)

# Perform greedy decoding on each sentence in the development data
for entry in dev_data:
    sentence = entry["sentence"]
    predicted_labels = greedy_decode(sentence, transition_probs, emission_probs, initial_probs)
    predictions.append({
        "index": entry["index"],
        "sentence": sentence,
        "labels": predicted_labels
    })

In [None]:
correct_count = 0
total_count = 0

for entry in dev_data:
    true_labels = entry["labels"]
    predicted_labels = predictions[entry["index"]]["labels"]

    for true_label, predicted_label in zip(true_labels, predicted_labels):
        total_count += 1
        if true_label == predicted_label:
            correct_count += 1

accuracy = correct_count / total_count
print("Accuracy on dev data:", accuracy)

with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/greedy.json", "w", encoding="utf-8") as output_file:
    json.dump(predictions, output_file, indent=4)

Accuracy on dev data: 0.009076558800315706


In [None]:
sentence_accuracies = []

for entry in dev_data:
    true_labels = entry["labels"]
    predicted_labels = predictions[entry["index"]]["labels"]

    correct_count = sum(1 for true, predicted in zip(true_labels, predicted_labels) if true == predicted)
    sentence_accuracy = correct_count / len(true_labels)
    sentence_accuracies.append(sentence_accuracy)

average_accuracy = sum(sentence_accuracies) / len(sentence_accuracies)

print("Average accuracy on dev data:", average_accuracy)

Average accuracy on dev data: 0.012025555211643255


In [None]:
emission_probs = hmm_model["emission"]
def greedy_decode(hmm_model, observations):
    # Initialize the sequence of predicted states
    predicted_states = []

    # Extract transition and emission probabilities from the model
    transition_probs = hmm_model["transition"]
    emission_probs = hmm_model["emission"]
    initial_probs = hmm_model.get("initial", {})

    # Initialize the initial state probabilities
    current_state_probs = initial_probs

    for observation in observations:
        # Calculate the probability of each state given the observation
        state_probabilities = {}
        for state in current_state_probs.keys():
            max_prob = -float("inf")

            # Find the maximum probability for transitioning to the current state
            for prev_state in current_state_probs.keys():
                transition_key = (prev_state, state)
                emission_key = (state, observation)

                transition_prob = transition_probs.get(transition_key, 1e-10)
                emission_prob = emission_probs.get(emission_key, 1e-10)

                prob = current_state_probs[prev_state] * transition_prob * emission_prob
                if prob > max_prob:
                    max_prob = prob

            state_probabilities[state] = max_prob

        # Select the state with the highest probability
        predicted_state = max(state_probabilities, key=state_probabilities.get)
        predicted_states.append(predicted_state)

        # Update current state probabilities for the next observation
        current_state_probs = {state: state_probabilities[state] for state in predicted_states}

    return predicted_states

In [None]:
import json

# Load the HMM model from the saved JSON file
with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/hmm.json", "r", encoding="utf-8") as model_file:
    hmm_model = json.load(model_file)

# Load the development data
with open("/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/dev.json", "r", encoding="utf-8") as dev_file:
    dev_data = json.load(dev_file)

# Initialize variables to calculate accuracy
total_sentences = len(dev_data)
correct_predictions = 0

# Initialize a list to store the predictions
predictions = []


In [None]:
for entry in dev_data:
    sentence = entry["sentence"]
    actual_labels = entry["labels"]

    # Predict part-of-speech tags using greedy decoding
    predicted_labels = greedy_decode(hmm_model, sentence)

    # Compare predicted and actual labels to calculate accuracy for this sentence
    correct_predictions += sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred == actual)

    # Append the predictions to the list
    predictions.append({
        "index": entry["index"],
        "sentence": sentence,
        "predicted_labels": predicted_labels
    })

In [None]:
# Calculate accuracy
accuracy = correct_predictions / (total_sentences * len(actual_labels))

# Save the predictions in "greedy.json"
with open("greedy.json", "w", encoding="utf-8") as output_file:
    json.dump(predictions, output_file, indent=4)

# Print accuracy
print(f"Accuracy on dev data: {accuracy}")

Accuracy on dev data: 0.14030587809576517


### rechck task 3

In [None]:
import json

# Load the development data from the JSON file
with open('/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/dev.json', 'r') as dev_data_file:
    development_data = json.load(dev_data_file)

# Initialize an empty set to collect unique tags
unique_tags = set()

# Iterate over data points and collect unique tags
for data_point in development_data:
    labels = data_point.get('labels', [])
    unique_tags.update(labels)

# Convert the set of unique tags back to a list if needed
unique_tags_list = list(unique_tags)

# Now, unique_tags_list contains all unique tags from the development data
print("Unique Tags:", unique_tags_list)

Unique Tags: ['JJS', 'RP', 'JJR', 'PRP$', 'NNPS', 'MD', 'VBN', '-LRB-', 'IN', 'SYM', '.', '``', '#', 'RBR', 'WRB', 'LS', 'VBG', 'PDT', 'NNS', 'VBZ', '-RRB-', 'CC', 'VBP', 'RBS', 'VBD', ':', 'NN', '$', ',', 'PRP', 'WDT', 'JJ', 'RB', 'TO', 'NNP', 'POS', 'CD', 'VB', 'WP', "''", 'FW', 'DT', 'WP$', 'EX', 'UH']


In [None]:
import numpy as np

def extract_probabilities_from_json(json_data):
    transition_prob = json_data.get('transition', {})
    emission_prob = json_data.get('emission', {})
    initial_prob = json_data.get('initial', {})

    return transition_prob, emission_prob, initial_prob

def json_to_matrix(prob_dict, tags):
    num_tags = len(tags)
    prob_matrix = np.zeros((num_tags, num_tags))

    for i, from_tag in enumerate(tags):
        for j, to_tag in enumerate(tags):
            key = "('{}', '{}')".format(from_tag, to_tag)
            prob_matrix[i, j] = prob_dict.get(key, 0.0)

    return prob_matrix

In [None]:
correct_predictions = 0
total_predictions = 0
# Create a mapping from words to tag indices based on your dataset's vocabulary
word_to_index = {word: index for index, word in enumerate(unique_tags_list)}

# Iterate over sentences and corresponding tags in the development data
for data_point in development_data:
    sentence = data_point['sentence']
    actual_labels = data_point['labels']  # Get the actual labels for the sentence

    # Convert the sentence to word indices using the word_to_index mapping
    observations = [word_to_index.get(word, -1) for word in sentence]

    # Check for words not in the vocabulary and handle them appropriately
    if -1 in observations:
        print("Warning: Some words in the sentence are not in the vocabulary.")
        continue  # Skip this sentence or handle it as needed

    # Perform greedy decoding with the precomputed matrices
    best_path = greedy_decode(observations, emission_prob_matrix, transition_prob_matrix, initial_prob)

    # Calculate accuracy for this sentence
    for predicted_tag, actual_tag in zip(best_path, actual_labels):
        if predicted_tag == actual_tag:
            correct_predictions += 1
        total_predictions += 1

# Calculate overall accuracy if total_predictions is not zero
if total_predictions > 0:
    accuracy = correct_predictions / total_predictions
else:
    accuracy = 0.0

print("Accuracy:", accuracy)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Accuracy: 0.0


### new


In [None]:
import json
import numpy as np

# Load the HMM parameters from the hmm.json file
with open('/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/hmm.json', 'r') as hmm_file:
    hmm_data = json.load(hmm_file)

# Extract the transition, emission, and initial probabilities from the JSON
transition_prob = np.array(hmm_data['transition'])
emission_prob = np.array(hmm_data['emission'])
initial_prob = np.array(hmm_data['initial'])


In [None]:
def greedy_decode(observations, initial_prob, transition_prob, emission_prob):
    num_states = initial_prob.shape[0]
    num_observations = len(observations)

    # Initialize the sequence of states (tags)
    sequence = []

    # Find the most likely initial state (tag)
    current_state = np.argmax(initial_prob * emission_prob[:, observations[0]])
    sequence.append(current_state)

    # Iterate over observations and choose the most likely state (tag) at each step
    for t in range(1, num_observations):
        scores = transition_prob[:, current_state] * emission_prob[:, observations[t]]
        current_state = np.argmax(scores)
        sequence.append(current_state)

    return sequence

In [None]:
# Load the development data from the dev.json file
with open('/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/dev.json', 'r') as dev_file:
    dev_data = json.load(dev_file)

# Initialize counters for correct and total predictions
correct_predictions = 0
total_predictions = 0

In [None]:
def words_to_indices(words, word_to_index):
    return [word_to_index.get(word, -1) for word in words]

In [None]:
for data_point in dev_data:
    sentence = data_point['sentence']
    actual_labels = data_point['labels']

    # Convert the sentence to word indices using the word_to_index mapping
    observations = words_to_indices(sentence, word_to_index)

    # Check for words not in the vocabulary and handle them appropriately
    if -1 in observations:
        print("Warning: Some words in the sentence are not in the vocabulary.")
        continue  # Skip this sentence or handle it as needed

    # Perform greedy decoding with the precomputed matrices
    best_path = greedy_decode(observations, initial_prob, transition_prob, emission_prob)

    # Calculate accuracy for this sentence
    for predicted_tag, actual_tag in zip(best_path, actual_labels):
        if predicted_tag == actual_tag:
            correct_predictions += 1
        total_predictions += 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
def calculate_accuracy(dev_data, initial_prob, transition_prob, emission_prob):
    correct_predictions = 0
    total_predictions = 0

    for data_point in dev_data:
        sentence = data_point['sentence']
        actual_labels = data_point['labels']

        # Perform greedy decoding with the precomputed matrices
        best_path = greedy_decode(sentence, initial_prob, transition_prob, emission_prob)

        # Calculate accuracy for this sentence
        for predicted_tag, actual_tag in zip(best_path, actual_labels):
            if predicted_tag == actual_tag:
                correct_predictions += 1
            total_predictions += 1

    # Calculate overall accuracy if total_predictions is not zero
    if total_predictions > 0:
        accuracy = correct_predictions / total_predictions
    else:
        accuracy = 0.0

    return accuracy


In [None]:
accuracy = calculate_accuracy(dev_data, initial_prob, transition_prob, emission_prob)

print("Accuracy:", accuracy)

IndexError: ignored

In [None]:
initial_prob.shape

()

In [None]:
initial_prob

array({'NNP': 0.19789104610393007, 'DT': 0.21911141347009264, 'IN': 0.1288398137003506, 'PRP': 0.06148935056779528, 'EX': 0.004238840337013972, '``': 0.07472918520069077, 'CD': 0.011225077188759224, 'RBR': 0.0020932544874143074, 'NNS': 0.041237113402061855, 'NN': 0.0411847820398765, 'JJ': 0.041708095661730074, 'JJR': 0.0017007692710241248, 'RB': 0.05604688890051808, 'WRB': 0.00609660369459417, 'CC': 0.05691035637657648, 'VBG': 0.012010047621539588, 'WDT': 0.0008111361138730441, 'VBN': 0.005834946883667382, '-LRB-': 0.003427704223140928, 'VB': 0.0030613846878434245, 'WP': 0.003113716050028782, 'PRP$': 0.007797372965618295, 'TO': 0.0035323669475116437, 'JJS': 0.00248573970380449, 'NNPS': 0.0020409231252289496, 'VBZ': 0.001517609503375373, 'VBD': 0.0007588047516876865, 'LS': 0.0009157988382437595, "''": 0.0003663195352975038, ':': 0.002799727876916636, 'VBP': 0.0003663195352975038, 'PDT': 0.0007326390705950076, 'UH': 0.0006279763462242922, 'MD': 0.0005494793029462557, '$': 0.0008634674760

In [None]:

import json

# Define the greedy_decode function
def greedy_decode(sentence, initial_prob, transition_prob, emission_prob):
    sequence = []
    current_state = None

    # Iterate through the words in the sentence
    for word in sentence:
        if current_state is None:
            # If it's the first word, find the initial state (tag)
            current_state = max(initial_prob, key=initial_prob.get)
        else:
            # Find the most likely transition state (tag) from the previous state
            current_state = max(transition_prob.get(current_state, {}), key=transition_prob.get(current_state, {}).get)

        # Find the most likely emission state (tag) for the current word
        if word in emission_prob:
            emission_state = max(emission_prob[word], key=emission_prob[word].get)
        else:
            # Handle unknown words by assigning a default emission state
            emission_state = "UNKNOWN"

        # Append the current state to the sequence
        sequence.append(emission_state)

    return sequence

# Load the HMM parameters from the hmm.json file
with open('/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/hmm.json', 'r') as hmm_file:
    hmm_data = json.load(hmm_file)

# Extract the transition, emission, and initial probabilities from the JSON
transition_prob = hmm_data['transition']
emission_prob = hmm_data['emission']
initial_prob = hmm_data['initial']

# Define a function to calculate accuracy
def calculate_accuracy(dev_data, initial_prob, transition_prob, emission_prob):
    correct_predictions = 0
    total_predictions = 0

    for data_point in dev_data:
        sentence = data_point['sentence']
        actual_labels = data_point['labels']

        # Perform greedy decoding
        predicted_labels = greedy_decode(sentence, initial_prob, transition_prob, emission_prob)

        # Calculate accuracy for this sentence
        for predicted_label, actual_label in zip(predicted_labels, actual_labels):
            if predicted_label == actual_label:
                correct_predictions += 1
            total_predictions += 1

    # Calculate overall accuracy if total_predictions is not zero
    if total_predictions > 0:
        accuracy = correct_predictions / total_predictions
    else:
        accuracy = 0.0

    return accuracy

# Load the development data from the dev.json file
with open('/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/dev.json', 'r') as dev_file:
    dev_data = json.load(dev_file)

# Calculate accuracy for the development data
accuracy = calculate_accuracy(dev_data, initial_prob, transition_prob, emission_prob)

print("Accuracy:", accuracy)

ValueError: ignored

In [None]:
transiti

{"('NNP', 'NNP')": 0.37875307160409166,
 "('NNP', ',')": 0.1386479227384422,
 "('NNP', 'CD')": 0.0192010972055546,
 "('NNP', 'VBZ')": 0.039247957026115776,
 "('NNP', 'VBG')": 0.0017715298017029544,
 "('NNP', 'NN')": 0.057649008514772275,
 "('NNP', 'WDT')": 0.000982913309331962,
 "('NNP', 'NNS')": 0.024458540488027886,
 "('NNP', 'IN')": 0.04100805760329162,
 "('NNP', 'CC')": 0.041202354420252586,
 "('NNP', 'POS')": 0.05488313617921024,
 "('NNP', '.')": 0.054654551688667924,
 "('NNP', 'VBD')": 0.06488370764043659,
 "('NNP', 'MD')": 0.011177781587519286,
 "('NNP', 'TO')": 0.007543288187896451,
 "('NNP', 'VBP')": 0.00430881764672267,
 "('NNP', ':')": 0.007040402308703355,
 "('NNP', 'RB')": 0.009234813417909596,
 "('NNP', 'JJ')": 0.008503343048174182,
 '(\'NNP\', "\'\'")': 0.002560146294073947,
 "('NNP', 'NNPS')": 0.016720955483170468,
 "('NNP', 'DT')": 0.0023201325790045144,
 "('NNP', 'JJR')": 0.00011429224527115835,
 "('NNP', 'PRP')": 0.0007200411452082976,
 "('NNP', 'VBN')": 0.0008114749

In [None]:
type(transition_prob)

dict

In [None]:
# Load the development data from dev.json
with open('/content/drive/MyDrive/Colab Notebooks/ANLP/HW2/CSCI544_HW2/data/dev.json', 'r') as dev_file:
    dev = json.load(dev_file)

dev

[{'index': 0,
  'sentence': ['The',
   'Arizona',
   'Corporations',
   'Commission',
   'authorized',
   'an',
   '11.5',
   '%',
   'rate',
   'increase',
   'at',
   'Tucson',
   'Electric',
   'Power',
   'Co.',
   ',',
   'substantially',
   'lower',
   'than',
   'recommended',
   'last',
   'month',
   'by',
   'a',
   'commission',
   'hearing',
   'officer',
   'and',
   'barely',
   'half',
   'the',
   'rise',
   'sought',
   'by',
   'the',
   'utility',
   '.'],
  'labels': ['DT',
   'NNP',
   'NNP',
   'NNP',
   'VBD',
   'DT',
   'CD',
   'NN',
   'NN',
   'NN',
   'IN',
   'NNP',
   'NNP',
   'NNP',
   'NNP',
   ',',
   'RB',
   'JJR',
   'IN',
   'VBN',
   'JJ',
   'NN',
   'IN',
   'DT',
   'NN',
   'NN',
   'NN',
   'CC',
   'RB',
   'PDT',
   'DT',
   'NN',
   'VBN',
   'IN',
   'DT',
   'NN',
   '.']},
 {'index': 1,
  'sentence': ['The',
   'ruling',
   'follows',
   'a',
   'host',
   'of',
   'problems',
   'at',
   'Tucson',
   'Electric',
   ',',
   'including'

In [None]:
transition_params

defaultdict(float,
            {('NNP', 'NNP'): 0.37875307160409166,
             ('NNP', ','): 0.1386479227384422,
             ('NNP', 'CD'): 0.0192010972055546,
             ('NNP', 'VBZ'): 0.039247957026115776,
             ('NNP', 'VBG'): 0.0017715298017029544,
             ('NNP', 'NN'): 0.057649008514772275,
             ('NNP', 'WDT'): 0.000982913309331962,
             ('NNP', 'NNS'): 0.024458540488027886,
             ('NNP', 'IN'): 0.04100805760329162,
             ('NNP', 'CC'): 0.041202354420252586,
             ('NNP', 'POS'): 0.05488313617921024,
             ('NNP', '.'): 0.054654551688667924,
             ('NNP', 'VBD'): 0.06488370764043659,
             ('NNP', 'MD'): 0.011177781587519286,
             ('NNP', 'TO'): 0.007543288187896451,
             ('NNP', 'VBP'): 0.00430881764672267,
             ('NNP', ':'): 0.007040402308703355,
             ('NNP', 'RB'): 0.009234813417909596,
             ('NNP', 'JJ'): 0.008503343048174182,
             ('NNP', "''"): 0.0025

In [None]:
tag_frequencies={}
for data_point in dev:
    actual_labels = data_point['labels']

    # Count the occurrences of each tag in the sentence
    for tag in actual_labels:
        if tag in tag_frequencies:
            tag_frequencies[tag] += 1
        else:
            tag_frequencies[tag] = 1

# Find the tag with the highest frequency
most_frequent_tag = max(tag_frequencies, key=tag_frequencies.get)

print("Most frequently used tag:", most_frequent_tag)
print("Frequency:", tag_frequencies[most_frequent_tag])

Most frequently used tag: NN
Frequency: 18538


In [None]:

# Extract transition, emission, and initial probabilities
transition_prob =transition_params
emission_prob =emission_params
initial_prob =initial_probabilities


unique_states = list(transition_prob.keys())

correct_pred =0
total_pred= 0

for data in dev:
    sentence = data['sentence']
    actual_labels = data['labels']
    # print(sentence)
    pred = []


    # init state for each sent
    current_state = None

    for word in sentence:

        if current_state is None:
            # init max prob
            current_state = max(initial_prob, key=initial_prob.get)
        else:
            # Find the most likely transition state (tag) from the previous state using dictionary operations
            max_transition_prob = 0.0
            transition_state = None
            for state in unique_states:
                transition_state_prob = transition_prob.get((current_state, state), 0.0)
                if transition_state_prob > max_transition_prob:
                    max_transition_prob = transition_state_prob
                    transition_state = state
            current_state = transition_state

        max_emission_prob = 0.0
        emission_state = None

        for state in unique_tags_list:
            # print(state)
            word_emission_key = f"('{state}', '{word}')"
            # print(f"('{state}', '{word}')")
            if word_emission_key in emission_prob:
                current_emission_prob = transition_prob[current_state][state] * emission_prob[f"('{state}', '{word}')"]
                 #for the particular state, which state is best(trans) and for that word, which word is best(emm)
                if current_emission_prob > max_emission_prob:
                    max_emission_prob = current_emission_prob
                    emission_state = state
        # print(emission_state)
        if max_emission_prob == 0:
           emission_state = most_frequent_tag
        pred.append(emission_state)

    #  comparing predicted and actual labels
    for pred_label, actual_label in zip(pred, actual_labels):
        if pred_label == actual_label:
            correct_predictions += 1
        total_predictions += 1

print(correct_predictions)
accuracy = correct_predictions / total_predictions
print("Accuracy:", accuracy)

18538
Accuracy: 0.036963335752625996


In [None]:
transition_prob

defaultdict(float,
            {('NNP', 'NNP'): 0.37875307160409166,
             ('NNP', ','): 0.1386479227384422,
             ('NNP', 'CD'): 0.0192010972055546,
             ('NNP', 'VBZ'): 0.039247957026115776,
             ('NNP', 'VBG'): 0.0017715298017029544,
             ('NNP', 'NN'): 0.057649008514772275,
             ('NNP', 'WDT'): 0.000982913309331962,
             ('NNP', 'NNS'): 0.024458540488027886,
             ('NNP', 'IN'): 0.04100805760329162,
             ('NNP', 'CC'): 0.041202354420252586,
             ('NNP', 'POS'): 0.05488313617921024,
             ('NNP', '.'): 0.054654551688667924,
             ('NNP', 'VBD'): 0.06488370764043659,
             ('NNP', 'MD'): 0.011177781587519286,
             ('NNP', 'TO'): 0.007543288187896451,
             ('NNP', 'VBP'): 0.00430881764672267,
             ('NNP', ':'): 0.007040402308703355,
             ('NNP', 'RB'): 0.009234813417909596,
             ('NNP', 'JJ'): 0.008503343048174182,
             ('NNP', "''"): 0.0025

In [None]:
emission_params[('NNP', 'Delbert')]

1.1414482695644234e-05

In [None]:
initial_probabilities

defaultdict(float,
            {'NNP': 0.19789104610393007,
             'DT': 0.21911141347009264,
             'IN': 0.1288398137003506,
             'PRP': 0.06148935056779528,
             'EX': 0.004238840337013972,
             '``': 0.07472918520069077,
             'CD': 0.011225077188759224,
             'RBR': 0.0020932544874143074,
             'NNS': 0.041237113402061855,
             'NN': 0.0411847820398765,
             'JJ': 0.041708095661730074,
             'JJR': 0.0017007692710241248,
             'RB': 0.05604688890051808,
             'WRB': 0.00609660369459417,
             'CC': 0.05691035637657648,
             'VBG': 0.012010047621539588,
             'WDT': 0.0008111361138730441,
             'VBN': 0.005834946883667382,
             '-LRB-': 0.003427704223140928,
             'VB': 0.0030613846878434245,
             'WP': 0.003113716050028782,
             'PRP$': 0.007797372965618295,
             'TO': 0.0035323669475116437,
             'JJS': 0.00248573