In [1]:
import spacy
import os

from mars.definition_extraction import DeftCorpusLoader
import pandas as pd
import numpy as np
from sklearn import metrics



In [2]:
# Move the working directory up two levels; the relative 'mars/...' paths used
# below are presumably meant to resolve from there.
# NOTE(review): the target is relative to the current directory, so running this
# cell a second time moves up two more levels.
os.chdir('../..')

In [None]:
# Location handed to DeftCorpusLoader, relative to the current working directory.
STORAGE_PATH = "mars/definition_extraction/deft_corpus/data"

# Load the train/dev frames with the loader's preprocess and clean options on.
deft_loader = DeftCorpusLoader(STORAGE_PATH)
trainframe, devframe = deft_loader.load_classification_data(preprocess=True, clean=True)
# NOTE(review): no preprocess/clean flags are passed for the test data, unlike
# the call above — confirm the test frame gets equivalent treatment.
testframe = deft_loader.load_test_data()

In [4]:
# Load the spaCy pipeline from disk; get_preds and check_acc read this global.
nlp = spacy.load('mars/definition_extraction/output/model-best')

In [5]:
def get_preds(frame):
    """Score every sentence in ``frame`` with the module-level ``nlp`` pipeline.

    Args:
        frame: DataFrame with a ``Sentence`` column.

    Returns:
        A NumPy array holding the ``'Definition'`` category score of each
        sentence, in row order.
    """
    # Positional order is all that matters, so iterate the column directly.
    sentences = frame.reset_index(drop=True)['Sentence']
    scores = [nlp(text).cats['Definition'] for text in sentences]
    return np.array(scores)

In [6]:
# One 'Definition' score per test row, produced by the loaded pipeline.
preds = get_preds(testframe)

In [7]:
# NOTE(review): redundant — get_preds already returns a NumPy array, so this
# only makes a copy.
preds = np.array(preds)

In [8]:
# Gold labels for the test rows; compared against the rounded scores below.
y_test = testframe['HasDef']

In [9]:
# Confusion matrix: rows are true labels, columns are predicted labels.
# Scores are turned into 0/1 predictions with np.round (0.5 rounds down to 0).
print(pd.DataFrame(metrics.confusion_matrix(y_test, np.round(preds))))

     0    1
0  507   73
1  128  151


In [10]:
# Per-class precision / recall / F1, using the scores rounded to 0/1.
print(metrics.classification_report(y_test, np.round(preds)))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       580
           1       0.67      0.54      0.60       279

    accuracy                           0.77       859
   macro avg       0.74      0.71      0.72       859
weighted avg       0.76      0.77      0.76       859



In [11]:
# Precision for the positive class (label 1 is scikit-learn's default pos_label).
print(metrics.precision_score(y_test, np.round(preds)))

0.6741071428571429


In [12]:
# Recall for the positive class (label 1 is scikit-learn's default pos_label).
print(metrics.recall_score(y_test, np.round(preds)))

0.5412186379928315


In [None]:
# ROC AUC is computed from the gold labels and the raw (unrounded) scores.
# metrics.auc is a general trapezoidal integrator over (x, y) curve points, so
# passing labels and rounded predictions to it does not yield the ROC AUC.
print(metrics.roc_auc_score(y_test, preds))

In [11]:
def check_acc(frame, threshold=0.5):
    """Return the fraction of rows whose thresholded score matches ``HasDef``.

    Each ``Sentence`` is run through the module-level ``nlp`` pipeline and its
    ``'Definition'`` category score is compared with ``threshold``.

    Args:
        frame: DataFrame with a ``Sentence`` column and a ``HasDef`` column.
        threshold: Scores strictly above this value count as predicting
            label 1; scores at or below it count as predicting label 0.
            The default of 0.5 matches the previously hard-coded cut-off.

    Returns:
        The number of correctly classified rows divided by ``len(frame)``.
        Rows whose ``HasDef`` is neither 0 nor 1 are never counted as correct.

    Raises:
        ZeroDivisionError: If ``frame`` has no rows.
    """
    correct = 0
    # Iterate the two columns in lockstep; only positional order matters here.
    for sentence, label in zip(frame['Sentence'], frame['HasDef']):
        score = nlp(sentence).cats['Definition']
        if score > threshold and label == 1:
            correct += 1
        elif score <= threshold and label == 0:
            correct += 1
    return correct / len(frame)

In [12]:
# Accuracy on the test frame at the 0.5 cut-off.
check_acc(testframe)

0.7566938300349243

In [None]:
# NOTE(review): leftover scratch call — `nlp` is invoked without a text argument,
# unlike every other call in this notebook. Likely safe to delete this cell.
nlp()

In [15]:
! ls 

wiki_bad.txt  wiki_good.txt


In [3]:
# Directory containing the wiki_bad.txt / wiki_good.txt files joined below.
path = 'mars/definition_extraction/wcl_datasets_v1.2/wikipedia/'

In [4]:
import os

In [5]:
# Sanity check: show the full relative path of one of the input files.
os.path.join(path, 'wiki_bad.txt')

'mars/definition_extraction/wcl_datasets_v1.2/wikipedia/wiki_bad.txt'

In [6]:
# NOTE(review): scratch value — `files` is reset to None in a later cell, and
# create_from_wiki expects a filename -> label mapping rather than a tuple.
files = ('wiki_bad.txt', 'wiki_good.txt')

In [7]:
# NOTE(review): duplicate of the identical `path` assignment in an earlier cell.
path = 'mars/definition_extraction/wcl_datasets_v1.2/wikipedia/'

In [29]:

# Reset `files`; create_from_wiki treats None as "use the default
# filename -> label mapping".
files = None

In [34]:
def create_from_wiki(path = 'mars/definition_extraction/wcl_datasets_v1.2/wikipedia/', files = None, encoding = 'utf-8'):
    """Build a list of ``(sentence, label)`` pairs from paired-line text files.

    Every file is read as alternating lines: an even-indexed sentence line
    followed by an odd-indexed target line. The text before the first ``:``
    on the target line replaces each ``TARGET`` placeholder in the sentence.

    Args:
        path: Directory that contains the input files.
        files: Mapping of file name to the label attached to every sentence
            from that file. Defaults to
            ``{'wiki_bad.txt': 0, 'wiki_good.txt': 1}``.
        encoding: Encoding used to decode the files. Undecodable bytes are
            replaced rather than raising.

    Returns:
        A single list of ``(sentence, label)`` tuples, with files
        concatenated in the iteration order of ``files``.

    Raises:
        OSError: If a file cannot be opened.
        IndexError: If a file ends with a sentence line that has no target
            line after it.
    """
    if files is None:
        files = {'wiki_bad.txt': 0, 'wiki_good.txt': 1}

    labelled = []
    for name, label in files.items():
        # Read bytes and decode explicitly so that non-ASCII characters and
        # quotes survive intact (a bytes repr would inject escape sequences).
        with open(os.path.join(path, name), 'rb') as handle:
            raw = [line.rstrip().decode(encoding, errors='replace') for line in handle]

        targets = [line.split(':')[0] for line in raw[1::2]]

        # Plain Python strings are used throughout: fixed-width NumPy string
        # arrays silently truncate when the substituted term is longer than
        # the placeholder.
        for i, sentence in enumerate(raw[::2]):
            # Drop the two-character line prefix (marker + space) and the
            # final character, matching the slicing this function has
            # always applied.
            # NOTE(review): the final character is presumably the closing
            # full stop — confirm sentences never end without one.
            text = sentence[2:-1]
            labelled.append((text.replace('TARGET', targets[i]), label))

    return labelled

In [35]:
# NOTE(review): this displays the entire returned list; consider showing only
# a slice to keep the saved notebook small.
create_from_wiki()

[('!Hawk has not been seen since Season ', 0),
 ('Other colonies in West !Marredpally include Indrapuri railway colony, Krishnapuri colony, SarvaSukhi Colony, and Samrat Colony',
  0),
 ('The premise of the program revolves around !Tammy Parker, an 18-year-old country girl who moves back and forth between her country family, who lives on a bayou houseboat, and the wealthy Brents, who own a plantation and pancake business',
  0),
 ('He also answers to the questions asked from the readers in Hi !Madhan in Anada Vikatan which is very informative',
  0),
 ('The correct procedure with this !Malfunction is to deploy the reserve from a stable belly down position',
  0),
 ('The !Kufi has no religious significance', 0),
 ('Sometimes the player must help trapped individuals escape as well, and it will take quick thinking and planning ahead to !Exit the building successfully',
  0),
 ('The !Bartians, along with the other Prussians, were conquered by the Teutonic Knights, who Christianized them, b