# Read in Word Doc

Grab the text from a Word document.

In [None]:
import re
from typing import Callable
from functools import reduce, partial

from docx import Document
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

Utility functions

In [None]:
def pipeline(*functions: Callable) -> Callable:
    '''
    Compose functions left-to-right into a single callable.

    pipeline(f, g, ..., n)(x) evaluates n(...(g(f(x)))). With no functions
    at all the result is the identity, so an empty pipeline returns its
    input unchanged instead of raising when it is built.
    '''
    def run(x):
        # Thread the value through each stage in the order given.
        for stage in functions:
            x = stage(x)
        return x

    return run

def flatten(x: list) -> list:
    '''
    Flatten arbitrarily nested lists into a single flat list.

    Only `list` instances are descended into; any other value (including
    tuples and strings) is treated as a leaf. A non-list argument is
    returned wrapped in a one-element list.
    '''
    if not isinstance(x, list):
        return [x]

    flat = []
    for item in x:
        if isinstance(item, list):
            # extend() grows the result in place, avoiding the repeated list
            # copies that summing lists together would cause.
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat


def clean(text: str) -> list[str]:
    '''
    Tokenize raw text and normalize it into a list of tokens.

    Steps, in order: nltk word tokenization, lowercasing, replacing
    URL-like tokens with "URL", tokens starting with "@" with "SCREEN_NAME"
    and tokens starting with "#" with "HASHTAG", then dropping every token
    of four characters or fewer.
    '''
    # Raw string literals stop Python from treating the regex escapes as
    # (invalid) string escapes; the compiled patterns are unchanged.
    url_pattern = re.compile(r'(https?:\/\/)?([\w\-])+\.{1}'
                             r'([a-zA-Z]{2,63})([\/\w-]*)*\/?\??'
                             r'([^#\n\r]*)?#?([^\n\r]*)')
    uname_pattern = re.compile(r'^@\S+')
    hashtag_pattern = re.compile(r'^#\S+')

    def mask(pattern, placeholder):
        # Build a pipeline stage: tokens that match the pattern (or are
        # empty) become the placeholder, everything else passes through.
        def replace(token):
            return token if token and pattern.match(token) is None else placeholder
        return lambda tokens: map(replace, tokens)

    def lower(tokens):
        return map(str.lower, tokens)

    def longer_than_4(tokens):
        return filter(lambda token: len(token) > 4, tokens)

    # NOTE(review): the "URL" placeholder is only three characters long, so
    # the final length filter discards it; confirm dropping URLs is intended.
    steps = pipeline(
        nltk.word_tokenize,
        lower,
        mask(url_pattern, "URL"),
        mask(uname_pattern, "SCREEN_NAME"),
        mask(hashtag_pattern, "HASHTAG"),
        longer_than_4,
    )

    return list(steps(text))
    
def lemmatize(tokens: list[str]) -> list[str]:
    '''
    Drop English stopwords and reduce the remaining tokens to base forms.

    Each surviving token is replaced by the result of WordNet's morphy(),
    falling back to the token itself when morphy() returns nothing.
    '''
    # Build the stopword set once up front: membership tests become O(1) and
    # the corpus is no longer queried again for every single token.
    stop_words = frozenset(stopwords.words('english'))

    return [wn.morphy(token) or token
            for token in tokens
            if token not in stop_words]

def get_document_words(path: str) -> list[str]:
    '''
    Read the Word document at `path` and return its lemmatized tokens.

    Every paragraph is cleaned into tokens, the per-paragraph tokens are
    gathered into one flat list, and that list is lemmatized.
    '''
    paragraphs = Document(path).paragraphs
    tokens = [token for para in paragraphs for token in clean(para.text)]
    return lemmatize(tokens)

In [None]:
# Smoke-test the extractor: show tokens 5 through 24 of the sample resume.
get_document_words("Luke Chambers FT Resume RES-2020-00386.docx")[5:25]

# Exploring the Data

The data set is labeled!

In [None]:
import pandas as pd

# Load the resume data set from a CSV file in the working directory.
df = pd.read_csv('UpdatedResumeDataSet.csv')

In [None]:
# Peek at every 50th row among the first 500 rows.
df.iloc[:500:50]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

# Count resumes per category once and reuse the result for both axes.
category_counts = df['Category'].value_counts()

plt.figure(figsize=(15,8))
# Horizontal bars: counts on x, category labels on y. The trailing semicolon
# keeps the notebook from echoing the returned Axes object.
sns.barplot(x=category_counts,
            y=category_counts.index,
            palette='icefire_r');


# Topic Modeling

In [None]:
def prep_text(text: str) -> list[str]:
    '''Clean raw text into tokens, then lemmatize them.'''
    return lemmatize(clean(text))

In [None]:
import tqdm.notebook

# Tokenize and lemmatize every resume, showing a notebook progress bar.
# A comprehension builds the list directly and keeps the loop variable out
# of the notebook's global namespace.
text_data = [prep_text(resume)
             for resume in tqdm.notebook.tqdm_notebook(df['Resume'])]

Create the dictionary and corpus from the resume text data. The notebook can be resumed from this point to save time.

In [None]:
from gensim import corpora

# Map each unique token to an integer id, then encode every resume as a
# bag-of-words using that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [None]:
import pickle

# Persist both artifacts so they can be reloaded after a kernel restart.
# The context manager guarantees the pickle file is flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

Convenience cell for reload on restart

In [None]:
#import os, pickle

#if not globals().get('dictionary'):
#    if os.path.exists('corpus.pkl') and os.path.exists('dictionary.gensim'):
#        corpus = pickle.load(open('corpus.pkl', 'rb'))
#        dictionary = corpora.Dictionary().load("dictionary.gensim")

Train the model

In [None]:
import gensim

# Number of distinct resume categories in the labeled data.
# NOTE(review): NUM_TOPICS is computed but never used in the visible cells;
# the model below is trained with a hard-coded 15 topics. Confirm which
# value is intended.
NUM_TOPICS = len(df['Category'].unique())

# Fit an LDA topic model over the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=15,
    id2word=dictionary,
    passes=25
)

In [None]:
# Print each topic with its three top words, ordered by the first element
# of each entry (the topic id).
topics = lda_model.print_topics(num_words=3)
for topic in sorted(topics, key=lambda k: k[0]):
    print(topic)

What topic is *my* resume?

In [None]:
# Run the sample resume through the same cleaning/lemmatizing steps.
new_doc = get_document_words('Luke Chambers FT Resume RES-2020-00386.docx')

# Encode it with the dictionary built from the resume data set.
new_doc_bow =dictionary.doc2bow(new_doc)

In [None]:
# Rank the topics assigned to the resume by their second element (the topic
# probability), highest first.
sorted(lda_model.get_document_topics(new_doc_bow), key=lambda k: k[1], reverse=True)

# Mentioning ATB

Just for fun.

In [None]:
from ATB.atb import ATB

# Create an empty ATB instance (project-local class; its internals are not
# visible here).
atb = ATB()

In [None]:
import re
import json

def make_vocabulary() -> set:
    '''
    Create a term vocabulary for ingest and comparisons.

    The vocabulary is the union of:
      * every line of words.txt, lowercased with all non-word characters
        removed (lines that end up empty are skipped);
      * the lowercased 'project' value of each entry under 'rows' in
        pypi.json;
      * every entry of technicalterms.json, stripped and lowercased.

    All three files are read from the current working directory.
    '''
    # Raw string so the regex escape is not parsed as a string escape.
    non_word = re.compile(r'[\W]+')

    # Context managers close each file promptly instead of leaving the
    # handles open until garbage collection.
    with open('words.txt', 'r') as words_file:
        words = {w for line in words_file
                 if (w := non_word.sub('', line.strip().lower()))}

    with open('pypi.json') as pypi_file:
        packs = {row.get('project').lower()
                 for row in json.load(pypi_file)['rows']}

    with open('technicalterms.json') as terms_file:
        packs |= {term.strip().lower() for term in json.load(terms_file)}

    # Package and technical-term names are added verbatim (punctuation kept).
    words |= packs
    return words

In [None]:
# Build the vocabulary from the word list and package/term files.
words = make_vocabulary()

In [None]:
import tqdm.notebook
# Insert every vocabulary term into the ATB instance, with a progress bar.
# NOTE(review): the meaning of the second argument (True) is defined by
# ATB.insert, which is not visible here.
for word in tqdm.notebook.tqdm_notebook(words):
    atb.insert(word, True)

In [None]:
# Try the segmenter on a run-together string; presumably it splits the input
# into known vocabulary words (verify against ATB.dijkstra_segment).
atb.dijkstra_segment("squashedtogetherwords")

# Make a Classifier 

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
label = LabelEncoder()

# Encode the category names as integer class ids.
df['cat'] = label.fit_transform(df['Category'])
# Re-join each resume's prepared tokens into one space-separated string.
df['clean'] = [" ".join(thing) for thing in text_data]

df.head()

In [None]:
# Pull the cleaned text and the encoded labels out of the frame as arrays.
text = df['clean'].values
target = df['cat'].values

In [None]:
# TF-IDF features: sublinear term-frequency scaling, English stop words
# removed, vocabulary capped at 1500 features.
# NOTE(review): the vectorizer is fit on every row before the train/test
# split in a later cell, so its vocabulary and IDF weights also see the
# held-out rows; confirm this is acceptable for the reported accuracy.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500,
)
word_vectorizer.fit(text)

In [None]:
# Vectorize every cleaned resume with the fitted TF-IDF vocabulary.
features = word_vectorizer.transform(text)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=5, test_size=0.2)

In [None]:
# One-vs-rest wrapper around a k-nearest-neighbours classifier with default
# settings, trained on the training split only.
model = OneVsRestClassifier(KNeighborsClassifier())
model.fit(x_train, y_train)

In [None]:
# Predict a category id for every resume (training and held-out rows alike).
# NOTE(review): y_pred is not used by any later visible cell.
y_pred = model.predict(features)

In [None]:
# Report mean accuracy on the training split and on the held-out split.
print(f'  Training Accuracy: {model.score(x_train, y_train):.2%}')
print(f'Validation Accuracy: {model.score(x_test, y_test):.2%}')

# ...And a GUI because why not

In [None]:
import ipywidgets as wid

In [None]:
# NOTE(review): `display` is imported but not used in the visible cells.
from IPython.display import display

# Input area for pasted resume text, the trigger button, and a bordered
# output pane that shows the classification result.
paste_area = wid.Textarea(layout={'height': '300px', 'width':'75%'})
butt = wid.Button(description = 'Classify')
out = wid.Output(layout={'height':'100px',
                         'width':'75%',
                         'margin': '50px auto',
                         'border':'1px solid darkgrey'})

@out.capture(clear_output=True)
def click(context) -> None:
    '''
    Classify the pasted resume text and show the predicted category.

    Registered as the button's click handler; `context` is whatever the
    widget passes to the callback and is not used. Anything printed here is
    captured into the `out` widget, and clear_output=True replaces the
    previous result instead of stacking a new line on every click.
    '''
    raw_text = paste_area.value
    if not raw_text.strip():
        # Nothing to classify: the model would still predict some category
        # for an empty document, which would be misleading.
        print("Paste some resume text first.")
        return

    text = ' '.join(prep_text(raw_text))
    text_features = word_vectorizer.transform([text])
    result = model.predict(text_features)
    print("Resume classification:", end=' ')
    print(label.inverse_transform(result)[0])
        

# Run the classifier whenever the button is pressed.
butt.on_click(click)
        
# Stack the prompt, text area, button and output pane in a centred column.
box = wid.VBox(children=[
    wid.Label('Paste Resume Text into this area and hit the button!'),
    paste_area,
    butt,
    out
], layout={'display': 'flex',
           'flex_flow':'column',
           'align_items':'center'})

# Last expression in the cell, so the notebook renders the widget.
box

In [None]:
# Classify the sample resume: extract and join its tokens, vectorize with
# the fitted TF-IDF model, and predict the category id.
doc = " ".join(get_document_words("Luke Chambers FT Resume RES-2020-00386.docx"))
doc_features = word_vectorizer.transform([doc])
result = model.predict(doc_features)

# Map the predicted id back to its category name.
label.inverse_transform(result)[0]