<a href="https://colab.research.google.com/github/GreihMurray/NLP-3/blob/Semi_Murray/semisupervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import joblib
from sklearn.metrics import accuracy_score
import pickle
import nltk
from sklearn.model_selection import train_test_split
import json
from sklearn.semi_supervised import LabelPropagation
from numpy import concatenate

In [2]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the data readers and the result writer
# in this notebook all use paths under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Original

In [3]:
def read_file_to_sents(path="/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/train.tsv"):
    """Read the labelled training TSV and tag every character as 'B' or 'I'.

    Column 1 of each tab-separated row is split on '-' into graphemes. The
    first character of each grapheme is tagged 'B', every following character
    of the same grapheme is tagged 'I'. Column 0 is not used.

    Args:
        path: Location of the TSV file. Defaults to the notebook's original
            Google Drive path, so existing no-argument calls are unchanged.

    Returns:
        A list with one entry per non-blank row; each entry is a list of
        (character, tag) tuples.

    Raises:
        IndexError: If a non-blank row has no second column.
    """
    all_data = []
    with open(path, encoding="utf-8") as file:
        reader = csv.reader(file, delimiter="\t")
        for line in tqdm(reader, desc="Reading data..."):
            if not line:
                # csv.reader yields an empty list for a blank line; there is
                # nothing to tag, and indexing it would raise IndexError.
                continue

            cur_word = []
            for grapheme in line[1].split('-'):
                for position, char in enumerate(grapheme):
                    cur_word.append((char, 'B' if position == 0 else 'I'))

            all_data.append(cur_word)

    return all_data

In [4]:
def read_test_data(path="/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/test.txt"):
    """Read the unlabelled test words and split each one into characters.

    Only the first tab-separated column of each row is used. The number of
    words read is printed before returning.

    Args:
        path: Location of the test file. Defaults to the notebook's original
            Google Drive path, so existing no-argument calls are unchanged.

    Returns:
        A list with one entry per non-blank row; each entry is the list of
        characters of that row's first column.
    """
    all_data = []
    with open(path, encoding="utf-8") as file:
        reader = csv.reader(file, delimiter="\t")
        for line in tqdm(reader, desc="Reading data..."):
            if not line:
                # csv.reader yields an empty list for a blank line; indexing
                # it would raise IndexError, so skip it.
                continue

            all_data.append(list(line[0]))

    print(len(all_data))

    return all_data

Original

In [5]:
def split_data(data):
    """Separate tagged words into parallel character and tag lists.

    Args:
        data: Iterable of words, each a sequence of (character, tag) pairs.

    Returns:
        A tuple (split_x, split_y): split_x holds the characters of every
        word, split_y the tags, both grouped word by word.
    """
    chars_per_word = []
    tags_per_word = []

    for word in data:
        chars = []
        tags = []
        for pair in word:
            chars += [pair[0]]
            tags += [pair[1]]

        chars_per_word += [chars]
        tags_per_word += [tags]

    return chars_per_word, tags_per_word

Original

In [6]:
def combine_all_y(y):
    """Flatten a list of per-word sequences into one flat list of items.

    Args:
        y: Iterable of iterables (one inner iterable per word).

    Returns:
        A single list holding every inner item in order.
    """
    return [letter for entry in y for letter in entry]

Original

In [7]:
def split_out(x, y):
    """Regroup a flat sequence of labels into per-word lists.

    Args:
        x: Sequence of words; only the length of each word is used.
        y: Flat, indexable sequence of labels, consumed left to right.

    Returns:
        A list with one list of labels per word in x, each as long as the
        corresponding word.

    Raises:
        IndexError: If y holds fewer labels than x has characters.
    """
    regrouped = []
    offset = 0

    for word in x:
        length = len(word)
        # Indexed access (not slicing) so a too-short y still raises.
        regrouped.append([y[offset + k] for k in range(length)])
        offset += length

    return regrouped
        

Original

In [8]:
def calc_acc(preds, y_test):
    """Compute per-character accuracy as a percentage.

    Args:
        preds: List of words, each a sequence of predicted tags.
        y_test: Reference tags with the same nesting as preds.

    Returns:
        100 * (matching tags / total tags), or 0 when preds contains no tags
        at all (mirrors the zero-denominator guard in calc_rec instead of
        raising ZeroDivisionError).

    Raises:
        IndexError: If y_test is shorter than preds at any position.
    """
    total_right = 0
    total_size = 0

    for i, pred_word in enumerate(preds):
        for j, pred_tag in enumerate(pred_word):
            if pred_tag == y_test[i][j]:
                total_right += 1
            total_size += 1

    if total_size == 0:
        # Nothing to score: avoid dividing by zero.
        return 0

    return 100 * (total_right / total_size)

Original

In [9]:
def calc_rec(preds, y_test):
    """Compute recall for the 'I' tag as a percentage.

    A true positive is a position predicted 'I' whose reference is 'I'; a
    false negative is a position predicted 'B' whose reference is 'I'.

    Args:
        preds: List of words, each a sequence of predicted tags.
        y_test: Reference tags with the same nesting as preds.

    Returns:
        100 * TP / (TP + FN), or 0 when there are no such positions.
    """
    hits = 0
    misses = 0

    for i, pred_word in enumerate(preds):
        for j, tag in enumerate(pred_word):
            if tag == 'I':
                if y_test[i][j] == 'I':
                    hits += 1
            elif tag == 'B':
                if y_test[i][j] == 'I':
                    misses += 1

    relevant = hits + misses
    if relevant == 0:
        return 0

    return 100 * (hits / relevant)

Original

In [10]:
def calc_prec(preds, y_test):
    """Compute precision for the 'I' tag as a percentage.

    A true positive is a position predicted 'I' whose reference is 'I'; a
    false positive is a position predicted 'I' whose reference is 'B'.

    Args:
        preds: List of words, each a sequence of predicted tags.
        y_test: Reference tags with the same nesting as preds.

    Returns:
        100 * TP / (TP + FP), or the placeholder 0.01 when no position was
        predicted 'I' against an 'I' or 'B' reference.
    """
    hits = 0
    false_alarms = 0

    for i, pred_word in enumerate(preds):
        for j, tag in enumerate(pred_word):
            if tag != 'I':
                continue
            gold = y_test[i][j]
            if gold == 'I':
                hits += 1
            elif gold == 'B':
                false_alarms += 1

    predicted = hits + false_alarms
    if predicted == 0:
        # Non-zero placeholder kept as-is; callers divide by prec + recall.
        return 0.01

    return 100 * (hits / predicted)

Original

In [11]:
def custom_eval(preds, y_clean):
    """Print accuracy, precision, recall and F-score for predicted tags.

    The metrics come from calc_acc, calc_prec and calc_rec; the F-score is
    the harmonic mean 2 * P * R / (P + R). When precision and recall are both
    zero the F-score is reported as 0 instead of raising ZeroDivisionError.

    Args:
        preds: List of words, each a sequence of predicted tags.
        y_clean: Reference tags with the same nesting as preds.

    Returns:
        None. Results are written to standard output.
    """
    acc = calc_acc(preds, y_clean)
    print("Custom calculated Accuracy: ", acc)

    prec = calc_prec(preds, y_clean)
    print("Precision: ", prec)

    recall = calc_rec(preds, y_clean)
    print("Recall: ", recall)

    denominator = prec + recall
    # Both metrics can be 0 (e.g. every predicted 'I' is wrong); guard the division.
    fscore = (2 * (prec * recall)) / denominator if denominator else 0
    print("Fscore: ", fscore)

Based on code from https://machinelearningmastery.com/semi-supervised-learning-with-label-propagation/

In [12]:
def semi_supervised():
    """Fit a LabelPropagation model on partly-labelled characters and write
    the segmented test words to disk.

    Pipeline, as implemented below:
      1. Read the tagged training words and the unlabelled test words.
      2. Split the training words with train_test_split (test_size=0.4,
         random_state=50); only the training portion is used for fitting.
      3. Keep the tags of the first 650 training words; every character of
         the remaining training words is given the label -1.
      4. Encode characters and tags with two LabelEncoders; each character
         becomes a single integer feature (one column).
      5. Fit LabelPropagation, predict a tag for every test character,
         regroup the tags per word, turn them into hyphenated graphemes and
         write them with print_results_to_file.

    Takes no arguments and returns None.
    """
    data = read_file_to_sents()

    x, y = split_data(data)

    x_test = read_test_data()

    # The held-out characters (second return value) are discarded; only the
    # held-out tags are kept, under the name clean_y_test.
    x_train, _, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 50)
    # NOTE(review): clean_y_test comes from the train.tsv hold-out split, while
    # clean_x_test (and the predictions below) come from test.txt. Re-enabling
    # the commented-out custom_eval call would compare unrelated words.
    clean_y_test = y_test
    # Keep the per-word character lists; x_test is rebound to a flat array later.
    clean_x_test = x_test

    del _, x, y, y_test

    # First 650 training words supply labelled characters; the rest are
    # flattened without their tags and act as unlabelled samples.
    x_lab = combine_all_y(x_train[:650])
    y_lab = combine_all_y(y_train[:650])
    x_train = combine_all_y(x_train[650:])

    # le maps the tag strings to integers (and back, via inverse_transform below).
    le = LabelEncoder()

    le.fit(y_lab)
    y_hold = le.transform(y_lab)

    # fle maps each character to an integer; labelled characters come first so
    # they line up with y_hold at the start of y_train_mixed.
    x_train_mixed = concatenate((x_lab, x_train))
    fle = LabelEncoder()
    fle.fit(x_train_mixed)
    x_train_mixed = fle.transform(x_train_mixed)
    # One feature column per sample.
    x_train_mixed = x_train_mixed.reshape(-1, 1)

    del x_lab, x_train

    # One -1 per unlabelled sample (everything after the labelled prefix).
    # Presumably -1 is the "unlabelled" marker LabelPropagation expects — see
    # the scikit-learn documentation.
    nolabel = [-1 for _ in range(len(y_hold), len(x_train_mixed))]

    y_train_mixed = concatenate((y_hold, nolabel))

    del nolabel, y_hold

    model = LabelPropagation(max_iter=1000, tol=0.1, n_jobs = -1, kernel='rbf', gamma=50)

    model.fit(x_train_mixed, y_train_mixed)

    # Flatten and encode the test characters the same way as the training ones.
    # NOTE(review): fle was fitted on training characters only; a test character
    # it has never seen is expected to make transform fail — confirm and handle.
    x_test = combine_all_y(x_test)
    x_test = fle.transform(x_test)
    x_test = x_test.reshape(-1, 1)

    y_hat = model.predict(x_test)
    # y_test = combine_all_y(y_test)
    # y_test = le.transform(y_test)
    # y_test = y_test.reshape(-1, 1)

    # score = accuracy_score(y_test, y_hat)
    # print(score*100)

    # Back from integer classes to tag strings.
    clean_preds = le.inverse_transform(y_hat)

    # Regroup the flat predictions into one tag list per test word.
    clean_preds = split_out(clean_x_test, clean_preds)

    del y_hat

    # custom_eval(clean_preds, clean_y_test)
    # Join tags and characters into one string per word.
    clean_preds = make_words(clean_preds)

    clean_x_test = make_words(clean_x_test)

    # Pair each word with its tag string, then insert '-' at grapheme boundaries.
    combined = combine_data(clean_x_test, clean_preds)

    graphs = to_graphemes(combined)

    print_results_to_file(graphs)

Original

In [13]:
def make_words(data):
    """Join each word's sequence of strings into a single string.

    Args:
        data: Iterable of words, each an iterable of strings (characters or
            tags).

    Returns:
        A list with one concatenated string per word.
    """
    return [''.join(list(word)) for word in data]

Original

In [14]:
def combine_data(x, y):
    """Pair the items of x with the items of y at the same index.

    Args:
        x: Sequence that determines how many pairs are produced.
        y: Indexable sequence; must be at least as long as x.

    Returns:
        A list of (x[i], y[i]) tuples, one per item of x.

    Raises:
        IndexError: If y is shorter than x.
    """
    return [(x[idx], y[idx]) for idx in range(len(x))]

Original

In [15]:
def to_graphemes(data):
    """Insert '-' between graphemes according to each word's tag string.

    A '-' is appended after a character whenever the tag of the following
    character is not 'I'. The last character never gets a '-', and the tag of
    the first character is not consulted.

    Args:
        data: Iterable of (word, tags) pairs; tags is indexable and at least
            as long as word.

    Returns:
        A list of (word, hyphenated_word) tuples.
    """
    graph_data = []

    for entry in data:
        word, tags = entry[0], entry[1]
        last = len(word) - 1

        pieces = []
        for idx, char in enumerate(word):
            # The next tag decides whether a boundary follows this character.
            if idx != last and tags[idx + 1] != 'I':
                pieces.append(char + '-')
            else:
                pieces.append(char)

        graph_data.append((word, ''.join(pieces)))

    return graph_data

Original

In [16]:
def print_results_to_file(data, path='/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/semi.tsv'):
    """Write result rows to a tab-separated file, one row per line.

    The file is written as UTF-8, matching the encoding used when the input
    files are read, so accented characters do not depend on the platform's
    default encoding.

    Args:
        data: Iterable of rows (each an iterable of fields) to write.
        path: Destination file. Defaults to the notebook's original Google
            Drive path, so existing one-argument calls are unchanged.

    Returns:
        None. Any existing file at path is overwritten.
    """
    # newline='' lets the csv writer control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        writer.writerows(data)

Original

In [17]:
# Run the whole pipeline: fit the model and write the segmented test words to semi.tsv.
semi_supervised()

Reading data...: 12812it [00:00, 63424.87it/s]
Reading data...: 1427it [00:00, 484206.12it/s]


1427
