Sentence Classifier

In [151]:
import numpy as np
import pandas as pd
import csv
import nltk
import re
import os

In [152]:
termsFile = open("ProcessTerms.csv","r")
termsList = [x[0] for x in list(csv.reader(termsFile, delimiter=","))]
termsFile.close()

print(termsList[:5])
print(termsList[-5:])

['access', 'accession', 'accretion', 'accrual', 'acquisition']
['work', 'working files', 'workstation', 'writ', 'write']


In [153]:
textFile = open("0.txt","r")
textStr = textFile.readlines()[0]
textFile.close()

print(textStr)

the henriette d. avram marc development collection contains items originating primarily during the period from 1966 to 1976 although there are some materials from the early 1960s, late 1970s, and early 1980s. the contents of the collection are described below in the context of the major accomplishments of the program and the resulting publications. the following essay documents the provenance and organization of the collection, creation and development of the marc standard, and automation activities in the library of congress. it is reproduced from lenore maruyama's "the marc archives: a register of records in the library of congress" (may 8, 1987, 74 pp.), which is available in the manuscript reading room. it has been revised to conform to current standards for finding aids in the manuscript division. office files covering the marc pilot project and the activities of the marc development office through the early 1970s. these items had been placed in storage with the central services d

In [154]:
def removePunctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def removeNumbers(text):
    return re.sub(r'[0-9]', '', text)

def removeAll(text):
    return removePunctuation(removeNumbers(text))

def sentenceSplit(text):
    text = text.replace('pp.','pp') # So that "pp." doesn't trigger a sentence split
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return [removePunctuation(removeNumbers(sentence)) for sentence in tokenizer.tokenize(text)]

sentenceSplit(textStr)[:5]

['the henriette d avram marc development collection contains items originating primarily during the period from  to  although there are some materials from the early s late s and early s',
 'the contents of the collection are described below in the context of the major accomplishments of the program and the resulting publications',
 'the following essay documents the provenance and organization of the collection creation and development of the marc standard and automation activities in the library of congress',
 'it is reproduced from lenore maruyamas the marc archives a register of records in the library of congress may    pp which is available in the manuscript reading room',
 'it has been revised to conform to current standards for finding aids in the manuscript division']

In [155]:
sents = sentenceSplit(textStr)[:5]

def tokenize(text):
    return [x for x in text.split(' ') if x != '']

tokenize(sents[3])

['it',
 'is',
 'reproduced',
 'from',
 'lenore',
 'maruyamas',
 'the',
 'marc',
 'archives',
 'a',
 'register',
 'of',
 'records',
 'in',
 'the',
 'library',
 'of',
 'congress',
 'may',
 'pp',
 'which',
 'is',
 'available',
 'in',
 'the',
 'manuscript',
 'reading',
 'room']

In [156]:
from lbl2vec import Lbl2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

In [157]:
def txtFilepath(num):
    return os.getcwd()[:-10]+'txtfiles\\'+str(num)+'.txt'

In [164]:
docs_formatted = []
docs_numberless = []

for doc in range(919):
    file = open(txtFilepath(doc),"r")
    fileRead = file.read()
    docs_numberless.append(removeNumbers(fileRead))
    docs_formatted.append(removeAll(fileRead))
    file.close()



In [166]:
Vectorizer = TfidfVectorizer(stop_words='english')
X = Vectorizer.fit_transform(docs_formatted)

In [167]:

wordFrequencies=pd.DataFrame(X.toarray(), columns=Vectorizer.get_feature_names()).sort_values(by=0,axis=1,ascending=False)
wordFrequencies.columns[:5]
wordFrequencies.shape

(919, 48312)

In [168]:
termsRemoved = 0
for term in termsList:
    try:
        wordFrequencies = wordFrequencies.drop([term],axis=1)
        termsRemoved += 1
    except:
        pass

print(str(termsRemoved)+" terms removed")


199 terms removed


In [169]:
wordFrequencies.shape

(919, 48113)

In [170]:
def tfidfTerms(df, doc, amt):
    sorted = df.sort_values(by=doc,axis=1,ascending=False)
    return list(sorted.columns[:amt])

In [171]:
tfidfTerms(wordFrequencies,0,10)

['marc',
 'office',
 'automation',
 'development',
 'systems',
 'pilot',
 'files',
 'items',
 'project',
 'avram']

In [183]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentenceSplit(docs_numberless[0]))]
documents

[TaggedDocument(words='the henriette d avram marc development collection contains items originating primarily during the period from  to  although there are some materials from the early s late s and early s the contents of the collection are described below in the context of the major accomplishments of the program and the resulting publications', tags=[0]),
 TaggedDocument(words='the following essay documents the provenance and organization of the collection creation and development of the marc standard and automation activities in the library of congress', tags=[1]),
 TaggedDocument(words='it is reproduced from lenore maruyamas the marc archives a register of records in the library of congress may    pp which is available in the manuscript reading room', tags=[2]),
 TaggedDocument(words='it has been revised to conform to current standards for finding aids in the manuscript division', tags=[3]),
 TaggedDocument(words='office files covering the marc pilot project and the activities of

In [177]:
model = Lbl2Vec(keywords_list = [termsList, tfidfTerms(wordFrequencies,0,10)], tagged_documents=documents)
model.fit()

2023-03-17 17:47:07,550 - Lbl2Vec - INFO - Train document and word embeddings
2023-03-17 17:47:07,550 - Lbl2Vec - INFO - Train document and word embeddings
2023-03-17 17:47:07,550 - Lbl2Vec - INFO - Train document and word embeddings
2023-03-17 17:47:07,610 - Lbl2Vec - INFO - Train label embeddings
2023-03-17 17:47:07,610 - Lbl2Vec - INFO - Train label embeddings
2023-03-17 17:47:07,610 - Lbl2Vec - INFO - Train label embeddings


ValueError: cannot compute mean with no input

In [191]:
sentenceSplit(docs_numberless[0])

def termCheck(terms, sentences):
    contains = [0]*len(sentences)
    for term in terms:
        for i, sentence in enumerate(sentences):
            if term in sentence:
                contains[i]=1
    
    return pd.DataFrame(np.array([sentences,contains]).transpose(), columns = ['sentence','contains'])


In [211]:
np.array(termCheck(termsList, sentenceSplit(docs_numberless[0]))['contains']).astype(int).mean()

0.9318181818181818

# Plan:

1. Generate N per-FA keywords via TF-IDF for the paper within the whole corpus (adjust N as hyperparameter? start around 10)
2. Use pre-existing keywords as archiving practice identifiers (try with all of them, or maybe just most common N if that has trouble training)
3. Test Lbl2Vec using sentences as documents in this case