In [298]:
from py2neo import Graph, Node, Relationship
import json
import Levenshtein

In [299]:
g = Graph(password="1234")

In [300]:
# get company name 
company_name_list = g.run("MATCH (c:Company) RETURN c.name as name").data()

In [301]:
# load naics related data
f = open("./2012_NAICS_Structure.csv", "r")
content = f.read().split("\n")
with open('naics.json') as f:
    naics_data = json.load(f)

In [302]:
#get naics code book as a dictionary
naics_code_book = {}
for line in content:
    temp_code = line.split(",")[0]
    temp_desc = line.split(",")[1]
    naics_code_book[temp_code] = temp_desc  

In [303]:
BELONGS_TO = Relationship.type("SubClassOf")

In [304]:


def getKey(item):
    return item[0]

for obj in naics_data:
    name = obj['name'].split(" ")[0]
    l = [(Levenshtein.distance(item['name'].split(" ")[0],name),item) for item in company_name_list]
    l = sorted(l, key=getKey)
    company_node = g.nodes.match('Company', name=l[0][1]['name']).first()
    
    prev = company_node
    cur_code = obj['code']
    while len(cur_code) > 0:
        if cur_code in naics_code_book:
            info = naics_code_book[cur_code]
            info = info.replace("\"","").strip()
            if info[-1] == "T":
                info = info[0:-1]
            temp = Node("NAICS_CODE", code=cur_code, description=info)
            g.merge(temp, "NAICS_CODE", "code")
            temp_r = BELONGS_TO(prev, temp)
            g.create(temp_r)
            prev = temp
        cur_code = cur_code[0:-1]

In [305]:
with open('person-birth.json') as f:
    person_data = json.load(f)

In [306]:
USBORN = Relationship.type("USAborn")

In [307]:
person_data


[{'name': 'Mark Zuckerberg', 'location': 'New York', 'birth-year': '1984'},
 {'name': 'Eduardo Saverin', 'location': '', 'birth-year': '1982'},
 {'name': 'Andrew McCollum', 'location': 'California', 'birth-year': '1983'},
 {'name': 'Dustin Moskovitz', 'location': 'Florida', 'birth-year': '1984'},
 {'name': 'Chris Hughes', 'location': 'North Carolina', 'birth-year': '1983'},
 {'name': 'Bill Gates', 'location': 'Washington', 'birth-year': '1955'},
 {'name': 'Paul Allen', 'location': 'Washington', 'birth-year': '1953'},
 {'name': 'Satya Nadella', 'location': '', 'birth-year': '1967'},
 {'name': 'Larry Page', 'location': 'Massachusetts', 'birth-year': '1973'},
 {'name': 'Sergey Brin', 'location': '', 'birth-year': '1973'},
 {'name': 'Sundar Pichai', 'location': '', 'birth-year': '1972'},
 {'name': 'Steve Jobs', 'location': 'California', 'birth-year': '1955'},
 {'name': 'Steve Wozniak', 'location': 'California', 'birth-year': '1950'},
 {'name': 'Ronald Wayne', 'location': 'Ohio', 'birth-yea

In [308]:
for p in person_data:
        print(p["name"])
        person_node = g.nodes.match('Person', name=p["name"]).first()
        person_node['DOB'] = int( p['birth-year'])
        #g.push(person_node)

        if len(p["location"]) > 0:
            temp = Node("State", name=p["location"])
            g.merge(temp, "State", "name")
        else:
            continue
        temp_r = USBORN(person_node, temp)
        g.create(temp_r)

        print("error", p)

Mark Zuckerberg
error {'name': 'Mark Zuckerberg', 'location': 'New York', 'birth-year': '1984'}
Eduardo Saverin
Andrew McCollum
error {'name': 'Andrew McCollum', 'location': 'California', 'birth-year': '1983'}
Dustin Moskovitz
error {'name': 'Dustin Moskovitz', 'location': 'Florida', 'birth-year': '1984'}
Chris Hughes
error {'name': 'Chris Hughes', 'location': 'North Carolina', 'birth-year': '1983'}
Bill Gates
error {'name': 'Bill Gates', 'location': 'Washington', 'birth-year': '1955'}
Paul Allen
error {'name': 'Paul Allen', 'location': 'Washington', 'birth-year': '1953'}
Satya Nadella
Larry Page
error {'name': 'Larry Page', 'location': 'Massachusetts', 'birth-year': '1973'}
Sergey Brin
Sundar Pichai
Steve Jobs
error {'name': 'Steve Jobs', 'location': 'California', 'birth-year': '1955'}
Steve Wozniak
error {'name': 'Steve Wozniak', 'location': 'California', 'birth-year': '1950'}
Ronald Wayne
error {'name': 'Ronald Wayne', 'location': 'Ohio', 'birth-year': '1934'}
Tim Cook
error {'name'

In [309]:
with open('goole-pro-text.json') as f:
    gp_data = json.load(f)

In [310]:
PRO = Relationship.type("Produce")

In [311]:
company_node = g.nodes.match('Company', name="Google").first()
for p in gp_data:
    sentence = p["text"].split(". ")[0] + "."
    temp = Node("Product", name=p["name"], desc=sentence)
    g.merge(temp, "Product", "name")
    temp_r = PRO(company_node, temp)
    g.create(temp_r)

In [312]:
with open('facebook-pro-text.json') as f:
    gp_data = json.load(f)
company_node = g.nodes.match('Company', name="Facebook").first()
for p in gp_data:
    sentence = p["text"].split(". ")[0] + "."
    temp = Node("Product", name=p["name"], desc=sentence)
    g.merge(temp, "Product", "name")
    temp_r = PRO(company_node, temp)
    g.create(temp_r)

In [313]:
with open('microsoft-pro-text.json') as f:
    gp_data = json.load(f)
company_node = g.nodes.match('Company', name="Microsoft").first()
for p in gp_data:
    sentence = p["text"].split(". ")[0] + "."
    if "may refer to" in sentence:
        continue
    temp = Node("Product", name=p["name"], desc=sentence)
    g.merge(temp, "Product", "name")
    temp_r = PRO(company_node, temp)
    g.create(temp_r)

In [314]:
with open('apple-pro-text.json') as f:
    gp_data = json.load(f)
company_node = g.nodes.match('Company', name="Apple Inc.").first()
for p in gp_data:
    sentence = p["text"].split(". ")[0] + "."
    if "may refer to" in sentence:
        continue
    temp = Node("Product", name=p["name"], desc=sentence)
    g.merge(temp, "Product", "name")
    temp_r = PRO(company_node, temp)
    g.create(temp_r)

In [315]:
with open('products.json', encoding='utf-8-sig') as f:
    pro = json.load(f)

In [316]:
out = []
for p in pro:
    out.append(p['p']['properties'])
with open('out.json', 'w', encoding='utf-8') as f:
    json.dump(out, f, ensure_ascii=False, indent=4)

In [317]:
# Basic set-ups 
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


In [318]:
data = pd.read_json("out.json")

from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df=1, stop_words="english") 
tfidf = vect.fit_transform(data["desc"])  
pairwise_similarity = tfidf * tfidf.T 
sorted(pairwise_similarity.toarray()[0])[0]


0.0