In [None]:
import os
from google.colab import drive
# Mount the user's Google Drive inside the Colab VM at /content/drive
drive.mount('/content/drive')
# Work from the project folder so the relative file names used below
# (EngToASLPairs.txt, asl_training_data.json) resolve inside it
%cd /content/drive/My Drive/Capstone/ASLParser

In [None]:
import spacy
import json
import string
import random
import json
from spacy.training import Example
import re
!pip install Levenshtein
import Levenshtein

In [None]:
# Read the parallel corpus. After blank lines are dropped, the file is read as
# alternating lines: an English sentence followed by its ASL gloss.
# Explicit encoding so the result does not depend on the platform default
# (assumes the file is UTF-8 — TODO confirm).
with open("EngToASLPairs.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace and skip empty lines while reading
    lines = [line.strip() for line in f if line.strip()]

# An odd number of lines means the final line has no partner. It is still
# ignored, as before, but no longer silently.
if len(lines) % 2:
    print(f"Warning: {len(lines)} non-empty lines found; "
          f"the last one is unpaired and will be ignored: {lines[-1]!r}")

# Group consecutive lines into {"english", "asl"} pairs
data = [
    {"english": english, "asl": asl}
    for english, asl in zip(lines[0::2], lines[1::2])
]


In [None]:
# Preview the first few pairs rather than rendering the whole corpus as cell output
data[:5]

In [None]:
# Load the `en_core_web_sm` English pipeline. It is used below both to parse the
# English sentences (POS / dependency labels) and to lemmatize single tokens.
nlp_en = spacy.load("en_core_web_sm")

In [None]:
def preprocess_asl_token(asl_token_text):
    r"""Return ``asl_token_text`` with punctuation removed.

    Every character that is neither a word character (``\w``) nor
    whitespace (``\s``) is deleted; whitespace itself is left untouched, so
    the result can still be split into tokens.
    """
    return re.sub(r'[^\w\s]', '', asl_token_text)

In [None]:
# Lemmas already computed by nlp_en, keyed by the lowercased input text.
# Running the full pipeline is the expensive step and the same words recur
# constantly, so each distinct word is lemmatized only once.
_LEMMA_CACHE = {}


def _cached_lemma(text):
    """Return the lemma nlp_en gives the first token of ``text.lower()`` (memoized)."""
    lowered = text.lower()
    if lowered not in _LEMMA_CACHE:
        _LEMMA_CACHE[lowered] = nlp_en(lowered)[0].lemma_
    return _LEMMA_CACHE[lowered]


def find_best_matching_asl_token(english_token_text, asl_tokens_texts, threshold=1):
    """Find the ASL token whose lemma is closest to an English token's lemma.

    Both sides are lowercased and lemmatized with the English pipeline
    (``nlp_en``), then compared by Levenshtein edit distance.

    Args:
        english_token_text: Text of a single English token.
        asl_tokens_texts: Iterable of candidate ASL token strings.
        threshold: Largest edit distance still accepted as a match.

    Returns:
        The candidate with the smallest distance (the first one on ties) if
        that distance is ``<= threshold``; otherwise ``None``. ``None`` is
        also returned when there are no candidates.
    """
    english_lemma = _cached_lemma(english_token_text)

    best_match = None
    min_distance = float('inf')
    for asl_token_text in asl_tokens_texts:
        distance = Levenshtein.distance(english_lemma, _cached_lemma(asl_token_text))
        # Strict '<' keeps the earliest candidate when distances tie
        if distance < min_distance:
            min_distance = distance
            best_match = asl_token_text

    return best_match if min_distance <= threshold else None


In [None]:
# Project the English parse onto the ASL gloss of every pair: each English
# token hands its dependency label and POS tag to the ASL token it matches.
asl_training_data = []
num_pairs = len(data)
num_processed_pairs = 0
for num_processed_pairs, pair in enumerate(data, start=1):
    english_doc = nlp_en(pair["english"])

    # Punctuation-free gloss; its whitespace-separated pieces are the ASL tokens
    asl_text = preprocess_asl_token(pair["asl"])
    asl_tokens_texts = asl_text.split()

    # One slot per ASL token; a slot stays None when no English token matches it
    asl_deps = [None for _ in asl_tokens_texts]
    asl_pos = [None for _ in asl_tokens_texts]

    for token in english_doc:
        asl_token = find_best_matching_asl_token(token.text, asl_tokens_texts)
        if not asl_token:
            continue
        # index() returns the first occurrence, so a repeated ASL token always
        # resolves to its first position; a later match overwrites an earlier one
        asl_token_index = asl_tokens_texts.index(asl_token)
        asl_deps[asl_token_index] = token.dep_
        asl_pos[asl_token_index] = token.pos_

    asl_training_data.append({"text": asl_text, "deps": asl_deps, "pos": asl_pos})

    # Report progress every 50 pairs
    if num_processed_pairs % 50 == 0:
        percent_complete = num_processed_pairs / num_pairs * 100
        print(f"Processed {num_processed_pairs} out of {num_pairs} pairs ({percent_complete:.2f}% complete)")


In [None]:
# Preview the first few projected entries rather than rendering the whole list
asl_training_data[:5]

In [None]:
# Persist the projected annotations as JSON so the training stage can reload them
with open("asl_training_data.json", "w") as json_out:
    json.dump(asl_training_data, json_out)

## Train parser


In [None]:
import spacy
import random
from spacy.util import minibatch, compounding
import json

In [None]:
# Read the saved ASL annotations back from disk
with open("asl_training_data.json", "r") as json_in:
    dataset = json.load(json_in)

In [None]:
# Load the English pipeline under the name `nlp`; the next cell runs it over
# each ASL sentence to obtain the Doc that the Example objects are built from.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Build one spaCy Example per dataset entry, pairing the processed ASL text
# with its projected dependency / POS labels.
examples = []
for entry in dataset:
    doc = nlp(entry['text'])
    # Tokens without a label were stored as None; swap in the '-' placeholder.
    # The entry is updated in place, so `dataset` carries the placeholders
    # after this cell has run.
    entry['pos'] = [p if p is not None else '-' for p in entry['pos']]
    entry['deps'] = [d if d is not None else '-' for d in entry['deps']]
    try:
        examples.append(Example.from_dict(doc, {'DEP': entry['deps'], 'POS': entry['pos']}))
    except Exception as err:
        # Catch Exception rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still stop the cell, and report why the entry was rejected
        # (presumably a label count that differs from the Doc's token count —
        # TODO confirm against the reported errors).
        print(f"Skipping entry ({type(err).__name__}: {err}): {entry}")


In [None]:
# Number of entries successfully converted to Example objects
# (entries that raised during conversion were printed and skipped)
len(examples)

In [None]:
!pip install spacy-lookups-data
!python -m spacy download en
!python -m spacy lookups download en


In [None]:
# Repeat the dataset three times to get a longer training list. List
# multiplication copies references, so each entry dict appears three times as
# the same object (mutating one occurrence mutates all three).
new_dataset = dataset*3

In [None]:
import spacy
from spacy.training import Example
import random

# Start from an empty English pipeline that contains only a dependency parser
nlp = spacy.blank("en")
config = {
    "min_action_freq": 5
}
nlp.add_pipe("parser", config=config)

# Number of passes over the training list
N_EPOCHS = 10

# Every pipe except the parser is disabled while training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]

# select_pipes / initialize are the spaCy v3 names for the deprecated
# disable_pipes / begin_training aliases used previously.
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    # NOTE(review): the annotations carry dependency labels but no "heads";
    # confirm the parser can actually learn from label-only examples.
    for epoch in range(N_EPOCHS):
        # Shuffle the training data
        random.shuffle(new_dataset)
        losses = {}
        # The loop variable is deliberately not called `data`: that name holds
        # the English/ASL pair list built earlier in the notebook.
        for entry in new_dataset:
            # Unlabelled tokens (None) get the '-' placeholder
            deps = [d if d is not None else '-' for d in entry["deps"]]
            pos = [p if p is not None else '-' for p in entry["pos"]]

            # Tokenize only (no pipeline components), then attach the annotations
            doc = nlp.make_doc(entry["text"])
            example = Example.from_dict(doc, {"deps": deps, "pos": pos})

            # Update the parser with the Example, accumulating the loss
            nlp.update([example], sgd=optimizer, losses=losses)
        # One summary line per epoch instead of printing every Example
        print(f"Epoch {epoch + 1}/{N_EPOCHS} losses: {losses}")


In [None]:
# Spot-check: display the second Example built from the saved dataset
examples[1]