<a href="https://colab.research.google.com/github/psriraj17/NLP-3/blob/main/semi_supervised_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Mount Google Drive at /content/drive (Colab-only helper); the training file path
# configured in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
import torch
from math import log
import numpy as np
# Show 10 decimal places when printing tensors.
torch.set_printoptions(precision=10)

# Prefer GPU index 2 when the host actually exposes it; otherwise use the first GPU,
# or the CPU when CUDA is unavailable. A hard-coded 'cuda:2' names a non-existent
# device on hosts with fewer than three GPUs.
PREFERRED_GPU_INDEX = 2
if torch.cuda.is_available():
    _gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = torch.device('cuda:{}'.format(_gpu_index))
else:
    device = torch.device('cpu')

In [33]:
# Tab-separated training file on the mounted Drive.
train_file = '/content/drive/MyDrive/train_semi.tsv'
# Share of the loaded rows held out for validation; the remaining (1 - val_split) is used for fitting.
val_split = 0.95

# Hidden-state labels, each mapped to an integer value.
states = dict([('0', 0), ('1', 1)])

In [34]:
def load_data(file):
  """Read a tab-separated text file into a list of rows.

  Args:
      file: Path of the text file to read.

  Returns:
      A list with one entry per line of the file; each entry is the list of fields
      obtained by stripping the trailing newline and splitting the line on tabs.
  """
  print("Loading data from file {}...".format(file))
  # The context manager guarantees the handle is closed, even if reading fails part-way.
  with open(file, 'r') as handle:
      data = [line.rstrip("\n").split("\t") for line in handle]
  print("Loaded {} sentences".format(len(data)))
  return data


In [35]:
# Read every row of the TSV; each row is a list of the line's tab-separated fields.
train_data = list(load_data(train_file))

Loading data from file /content/drive/MyDrive/train_semi.tsv...
Loaded 12812 sentences


In [36]:
# Keep the leading (1 - val_split) share of rows for training and hold out the rest.
# NOTE: train_data is rebound to the shortened list, so re-running this cell would
# split the already-truncated data again.
print("Splitting data...")
num_train_samples = int(len(train_data) * (1 - val_split))
train_data, val_data = train_data[:num_train_samples], train_data[num_train_samples:]
print(len(val_data),'validation set')
print(len(train_data),"training set")

Splitting data...
12172 validation set
640 training set


In [32]:
# Peek at the first training row to check the parsed field layout.
print(train_data[0])

['konsiltan', 'k-on-s-i-l-t-an']


In [38]:
def compute_probabilities_from_counts(counts_dict):
    """Normalise a mapping of counts so that its values sum to one.

    Args:
        counts_dict: Mapping from an identifier to its numeric count.

    Returns:
        A new dict with the same keys, each count divided by the total of all counts.

    Raises:
        ZeroDivisionError: If the mapping is non-empty and its counts add up to zero.
        AssertionError: If the normalised values do not sum to 1.0 after rounding
            to two decimals (this includes an empty mapping).
    """
    total = sum(counts_dict.values())
    probabilities_dict = {key: value / total for key, value in counts_dict.items()}
    # Sanity check: tolerate float noise by comparing at two decimal places.
    probability_mass = round(sum(probabilities_dict.values()), 2)
    assert probability_mass == 1.0, "All probabilities should sum to 1 but got {}".format(probability_mass)
    return probabilities_dict

In [39]:
def key_with_max_val(d):
    """Return the key of `d` whose value is largest; on ties the earliest such key wins.

    Approach adapted from
    https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
    """
    return max(d, key=d.get)

In [40]:
def generate_initial_state_probabilities(data):
    """Build a uniform distribution over the start states.

    Args:
        data: Not used by this function; kept so the signature matches the
            other generate_* helpers that `fit` calls.

    Returns:
        Dict mapping every key of the module-level `states` to the same probability.
    """
    # Give each state a single count so all start states are equally likely.
    # Copying `states` instead would seed the counts with the mapped values (0 and 1),
    # which skews the result to 1/3 vs 2/3 rather than an equal split.
    initial_state_counts = {state: 1 for state in states}
    return compute_probabilities_from_counts(initial_state_counts)

In [41]:
def generate_transition_state_probabilities(data):
    """Estimate state-to-state transition probabilities from consecutive rows.

    Args:
        data: Sequence of rows; index 1 of each row is read as that row's state.

    Returns:
        Dict keyed by previous state. Each value is a dict over the keys of the
        module-level `states` plus 'STOP', normalised to sum to one.
    """
    # One row of counts per previous state.
    # NOTE(review): every row starts as a copy of `states`, so the initial counts are
    # the mapped values (0 and 1) rather than zeros; confirm this seeding is intended.
    counts = {}
    for state in states:
        counts[state] = states.copy()

    # Walk over adjacent rows; pairs whose states are not known keys are ignored.
    for previous_row, current_row in zip(data, data[1:]):
        source = previous_row[1]
        target = current_row[1]
        row = counts.get(source)
        if row is not None and target in row:
            row[target] += 1

    # Every state gets a STOP count of 1 so that entry is never zero.
    for row in counts.values():
        row['STOP'] = 1

    return {source: compute_probabilities_from_counts(row) for source, row in counts.items()}

In [42]:
def generate_emission_probabilities(data, all_observations):
    """Estimate per-state emission probabilities.

    Args:
        data: Iterable of (observation, state) pairs. Pairs whose state is not a key
            of the module-level `states` are ignored.
        all_observations: Iterable of observations that must appear in every state's
            distribution; each starts with a count of one so none ends up at zero.

    Returns:
        Dict mapping each state to a dict of observation -> probability, normalised
        per state.
    """
    # Seed every listed observation with a count of one (first-seen order, duplicates merged).
    vocab = dict.fromkeys(all_observations, 1)
    # Each state needs its OWN copy of the seed counts. Sharing a single dict object
    # would make a count recorded for one state show up in every other state as well.
    emission_counts_by_state = {state: dict(vocab) for state in states}
    for word, state in data:
        state_counts = emission_counts_by_state.get(state)
        if state_counts is not None:
            # Observations outside the seed vocabulary start from zero for this state.
            state_counts[word] = state_counts.get(word, 0) + 1
    return {
        state: compute_probabilities_from_counts(state_counts)
        for state, state_counts in emission_counts_by_state.items()
    }

In [43]:
def fit(data, vocab):
    """Estimate the three HMM parameter tables from `data`.

    Args:
        data: Rows passed unchanged to each generate_* helper.
        vocab: Observations forwarded to generate_emission_probabilities.

    Returns:
        Tuple (initial_state_probabilities, transition_probabilities, emission_probabilities).
    """
    print("Fitting model to provided dataset...")
    start_table = generate_initial_state_probabilities(data)
    transition_table = generate_transition_state_probabilities(data)
    emission_table = generate_emission_probabilities(data, vocab)
    print("Model ready.")
    return start_table, transition_table, emission_table

In [49]:
# Separate the validation rows into their first field (observation) and second field (label).
val_sequence = []
val_labels = []
for val_pair in val_data:
    val_sequence.append(val_pair[0])
    val_labels.append(val_pair[1])

# Fit on the training rows; the validation observations are passed as the vocabulary.
initial, transition, emission = fit(train_data, val_sequence)

Fitting model to provided dataset...
Model ready.


In [46]:
def get_forward_prob(observations):
    """Run the forward recursion over a sequence of observations.

    Reads the module-level `initial`, `transition` and `emission` tables.

    Args:
        observations: Sequence of observations to score.

    Returns:
        Dict mapping each state in `initial` to a 1-D DoubleTensor holding one forward
        probability per observation. The dict is empty when `observations` is empty.
    """
    forward_probabilities = {}
    for idx, observation in enumerate(observations):
        for state in initial:
            # An observation missing from a state's emission table contributes zero
            # probability at EVERY position. Previously only the first position was
            # guarded and later positions raised KeyError.
            emission_probability = emission[state].get(observation, 0)
            if idx == 0:
                probability = initial[state] * emission_probability
                forward_probabilities[state] = torch.DoubleTensor([probability])
            else:
                # Sum over all predecessor states, always reading column idx-1 so that
                # values already appended for this idx are not picked up.
                probability = 0
                for prev_state in transition:
                    probability += forward_probabilities[prev_state][idx-1]*transition[prev_state][state]
                probability *= emission_probability
                forward_probabilities[state] = torch.cat((forward_probabilities[state], torch.DoubleTensor([probability])))
    return forward_probabilities