In [1]:
import spacy
import numpy as np
import json
from spellchecker import SpellChecker


In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
#accepts a string or token, returns a list of the top ten closest lexemes
def most_similar(word, topn=10):
    if type(word) != spacy.tokens.token.Token:
        word = nlp(word)[0]
    queries = [w for w in word.vocab if w.is_lower == word.is_lower and w.prob >= -15]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return by_similarity[:topn]

In [4]:
testcache = {}
with open("similar_cache") as f:
    testcache = eval(f.read())


In [5]:
testcache

{'conference': {'similar': {'activists',
   'activities',
   'addressed',
   'advisory',
   'afternoon',
   'allies',
   'ambassador',
   'announce',
   'announced',
   'announcement',
   'announcements',
   'announces',
   'announcing',
   'anticipated',
   'attend',
   'attendance',
   'attended',
   'attendees',
   'attending',
   'attends',
   'audience',
   'banquet',
   'booth',
   'booths',
   'brief',
   'briefing',
   'brought',
   'candidates',
   'caucus',
   'celebration',
   'chairman',
   'classes',
   'closed',
   'coalition',
   'colleagues',
   'commission',
   'committee',
   'committees',
   'concluded',
   'conference',
   'conferences',
   'confirmed',
   'congress',
   'congressional',
   'convention',
   'conventions',
   'conversation',
   'conversations',
   'cooperation',
   'council',
   'courses',
   'day',
   'debate',
   'delegate',
   'delegates',
   'delegation',
   'demonstration',
   'described',
   'diplomatic',
   'diplomats',
   'discourse',
   'dis

In [6]:
# with open('calnames.txt', encoding="utf8") as f:
#     calnames = f.read().split('\n')

In [12]:
#calnames = ["I&C SCI 6D  DIS B4 (36034)"]
def categorize(calnames):
    spell = SpellChecker()
    for i in calnames:
        doc = nlp(i)
        if i.lower().endswith("birthday"):
            continue
        possibilities = []
        keywords = set()
        likelihood = {j:0 for j in testcache.keys()}
        for token in doc:
            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)
            if token.pos_ != 'PROPN' or spell.correction(token.text) == token.text:
                to_check = token.lemma_  
            else:
                to_check = spell.correction(token.text)
            if len(to_check) > 2 :
                for j in testcache.keys():
                    for k in testcache[j]["similar"]:
                        if k.startswith(to_check):
                            possibilities.append((j,k))
                            keywords.add(to_check)
        if len(possibilities) == 0:
            print(i + ": none found" )
            continue

        for token in possibilities:
            max1 = 0
            max2 = 0
            nlp_token1 = nlp(token[0])[0]
            nlp_token2 = nlp(token[1])[0]
            for j in range(len(doc)):
                sim = nlp_token1.similarity(doc[j])
                if sim > max1:
                    max1 = sim
                sim = nlp_token2.similarity(doc[j])
                if sim > max2:
                    max1 = sim
            val = (max1+max2) * nlp_token1.similarity(nlp_token2)
            if likelihood[token[0]] < val:
                likelihood[token[0]] = val
        if all([token.pos_ == "PROPN" for token in doc]) and len(doc) < 4:
            if "meeting" in likelihood:
                likelihood["meeting"] += 0.4

        for j in keywords:
            for k in testcache.keys():
                if k.startswith(j):
                    likelihood[k] += 0.4

    #     weights1 = [max(nlp(token[0])[0].similarity(doc[j]) for j in range(len(doc))) for token in possibilities]
    #     weights2 = [max(nlp(token[1])[0].similarity(doc[j]) for j in range(len(doc))) for token in possibilities]
    #     print(list(zip(possibilities,weights1,weights2)))
    #     print(i + ": " + possibilities[weights1.index(max(weights1))][0])
        top_result = sorted(likelihood,key=lambda x:-likelihood[x])[0]
        print(i + ": " + top_result + ("" if likelihood[top_result] > 0.3 else " (Unconfident)"))


In [13]:
import os
for file in os.listdir('in'):
    with open("in/"+file, encoding="utf8") as f:
        calnames = json.loads(f.read())
    calnames = list(set([i["calendar"]["labeledCalendar"][j]["event"] for i in calnames.values() for j in range(len(i["calendar"]["labeledCalendar"])) ]))
    print(file)
    print()
    categorize(calnames)

jordan

Talk at DBH5011: discussion (Unconfident)
Dokko Wedding: none found
Persistent Mneeting: meeting
Mother's Day: meeting
student demo: seminar (Unconfident)
Diabetes review: studying (Unconfident)
Ping, Jordan, Ramesh: none found
Final Exam Proctor: meeting
Monday, March 19, 1:15 PM Wife Hospital: working (Unconfident)
Kimpton Solamar Hotel에서 숙박: none found
Jessica, Amir, Jordan: none found
Meeting with Ramesh, Nik: meeting
ACM MM PAPER ABSTRACT DEADLING: none found
Class Attention for Ramesh: working (Unconfident)
Meeting with Ping Gross Hall 2034: meeting
Meeting with Kate, at Cove: meeting
조세의 날: none found
Meeting with Marek: meeting
5월: none found
Tuesday, March 20, 11:45 AM Meeting Ramesh: meeting
Lab Meeting: meeting
ACM MM PAPER SUBMISSION DEADLING: none found
Sean Munson Meeting: meeting
lab meeting: meeting
Meeting for Food Logging: meeting
Meetimg Ramesh and Kate: meeting
Writing Center : none found
Health Intelligence Class: meeting
Meeting with Tech Team: meeting
장인어

Howard and Mass : working (Unconfident)
Stay at Hilton Chicago O'Hare Airport: working (Unconfident)
Ping Wang and Ramesh: working
STCSN Editorial Board Hangouts Meeting: meeting
Health Intelligence: none found
Independence Day: meeting
Nik, Sharad, Ramesh -- just BS: working (Unconfident)
IEEE MMM: none found
Calum Macrae?: lecture (Unconfident)
Intern Meeting: meeting
Jordan for architecture: discussion (Unconfident)
Patient Navigator: Meet eLogic Team: meeting
John Wood: meeting
Groundbreaking: none found
Lunch with Cyrus Shahabi: meeting (Unconfident)
Michelle Khine +: none found
MVP Stroke: none found
Mohan - pick up from Mike Carey office: meeting
Flight to Dallas (AA 369): none found
Manish Patel, Brandify: none found
Jury duty: none found
Jeremy: none found
Father's Day: meeting
Ajay Jonathan Raj -- UG: none found
Flight to Santa Ana (WN 818): none found
Dr. Jain <> Innovacccer | Demo: studying (Unconfident)
Houston to Lima on UA 854: none found
Meet in Ramesh Office, Lisa to d