## Imports

In [1]:
import json
import re
from matplotlib import rcParams
rcParams['figure.figsize'] = 11.7,8.27
import nltk
from string import punctuation
import numpy as np
import networkx as nx
import spacy
from math import exp
import matplotlib.pyplot as py
%matplotlib inline


## Loading Data

In [2]:
def load_data_from_json(filename):
    """Read a JSON file and return the decoded Python object.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's contents.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
# Load the whole dataset; the path is relative to the notebook's working directory.
master_data = load_data_from_json('dataset/master.json')

In [4]:
# master_data

In [5]:
def is_post(s):
    """Tell whether `s` contains a '$' directly followed by letters or underscores.

    Returns True when at least one such '$name' occurrence exists, else False.
    """
    return re.search(r'\$([a-zA-Z_]+)', s) is not None

In [6]:
# Separate the sentences into posts (those matching is_post) and headlines.
post_sentence = []
headlines_sentence = []
for record in master_data.values():
    text = record['sentence']
    bucket = post_sentence if is_post(text) else headlines_sentence
    bucket.append(text)

In [7]:
# Number of sentences classified as posts and as headlines, respectively.
len(post_sentence),len(headlines_sentence)

(675, 435)

In [8]:
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


# turn a sentence into clean tokens
def clean_sentence(sentence, stop_words=None):
    """Normalize a raw sentence and return its cleaned tokens joined by spaces.

    Steps, in order: collapse runs of 3+ identical non-word characters to one,
    collapse runs of 3+ identical word characters to two, drop http(s) links,
    drop @usernames, strip '#', split on whitespace, strip punctuation
    (keeping '$' so stock symbols such as $ZSL survive), drop stop words and
    drop tokens of length <= 1.

    Parameters
    ----------
    sentence : str
        Raw text to clean.
    stop_words : collection of str, optional
        Words to remove (matched case-sensitively). Defaults to NLTK's English
        stop-word list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # remove repeated non-alphanumeric characters: !!!!!!!!! --> !
    sentence = re.sub(r'(\W)\1{2,}', r'\1', sentence)
    # shorten word characters repeated more than twice: aaaa --> aa
    sentence = re.sub(r'(\w)\1{2,}', r'\1\1', sentence)
    # remove links
    sentence = re.sub(r'(?P<url>https?://[^\s]+)', r'', sentence)
    # remove @usernames
    sentence = re.sub(r"\@(\w+)", "", sentence)
    # remove '#' from #tags (the tag word itself is kept)
    sentence = sentence.replace('#', '')

    if stop_words is None:
        # Loading the corpus on every call is expensive; reuse the cached set.
        stop_words = _english_stop_words()

    # strip punctuation from each whitespace-separated token, except '$'
    table = str.maketrans('', '', punctuation.replace('$', ''))
    tokens = [w.translate(table) for w in sentence.split()]
    # Non-alphabetic tokens are deliberately kept so stock names ($ZSL) remain.
    # Filter out stop words and short tokens.
    tokens = [w for w in tokens if w not in stop_words and len(w) > 1]
    return ' '.join(tokens)

In [9]:
# Rebind both lists to their cleaned versions (one cleaned string per sentence).
post_sentence = list(map(clean_sentence, post_sentence))
headlines_sentence = list(map(clean_sentence, headlines_sentence))

In [10]:
def prepare_sentence(s):
    """Run the NLTK pipeline on `s`: sentence split, word tokenize, POS tag, NE chunk.

    Returns the result of ``nltk.ne_chunk_sents(..., binary=True)`` applied to
    the POS-tagged sentences of `s`.
    """
    tagged = [
        nltk.pos_tag(nltk.word_tokenize(piece))
        for piece in nltk.sent_tokenize(s)
    ]
    return nltk.ne_chunk_sents(tagged, binary=True)

In [11]:
def extract_entity_names(t):
    """Collect the text of every subtree labelled 'NE' under `t`.

    `t` is treated as a tree node when it exposes a truthy ``label`` attribute;
    anything else (e.g. a plain (word, tag) leaf) contributes nothing. For an
    'NE' node the first item of each child is joined with spaces; other nodes
    are searched recursively, child by child.
    """
    label_of = getattr(t, 'label', None)
    if not label_of:
        return []

    if label_of() == 'NE':
        return [' '.join(leaf[0] for leaf in t)]

    found = []
    for subtree in t:
        found += extract_entity_names(subtree)
    return found

In [12]:
def get_ners(s):
    """Return the first word of each distinct named entity found in `s`.

    Entities come from extract_entity_names over every tree produced by
    prepare_sentence. Duplicates are removed via a set, so the order of the
    returned list is not defined.
    """
    entity_names = [
        name
        for tree in prepare_sentence(s)
        for name in extract_entity_names(tree)
    ]
    return [name.split()[0] for name in set(entity_names)]

In [19]:
def get_index_of_targets(s,is_post_flag = True):
    """Return the positions of target tokens within the whitespace split of `s`.

    When `is_post_flag` is true, the targets are the '$name' symbols matched by
    the regex; otherwise they are the names returned by get_ners(s). A token
    counts only if it equals a target exactly.
    """
    if is_post_flag:
        targets = ['$' + name for name in re.findall(r'\$([a-zA-Z_]+)', s)]
    else:
        targets = get_ners(s)
    return [position for position, token in enumerate(s.split()) if token in targets]

In [14]:
def _load_spacy_model(model_name):
    """Load a spaCy pipeline once per model name and reuse it on later calls."""
    cache = _load_spacy_model.__dict__.setdefault('_cache', {})
    if model_name not in cache:
        cache[model_name] = spacy.load(model_name)
    return cache[model_name]


def get_sentence_dependency_tree(s, model_name='en'):
    """Parse `s` with spaCy and return its dependency graph and depth.

    Parameters
    ----------
    s : str
        Text to parse.
    model_name : str, optional
        Name passed to ``spacy.load``. Defaults to 'en', the value this
        function always used; the pipeline is loaded once and cached.

    Returns
    -------
    (networkx.DiGraph, int)
        A directed graph with one head -> child edge per dependency, whose
        nodes are named '<lowercased token>-<token index>', and the largest
        shortest-path length from the first node with in-degree 0 to any node
        reachable from it (0 when the graph has no edges).
    """
    document = _load_spacy_model(model_name)(s)

    # Load spacy's dependency tree into a networkx graph
    edges = [
        ('{0}-{1}'.format(token.lower_, token.i),
         '{0}-{1}'.format(child.lower_, child.i))
        for token in document
        for child in token.children
    ]
    graph = nx.DiGraph(edges)

    # The first node without an incoming edge is used as the root.
    root = next((node for node, degree in graph.in_degree() if degree == 0), None)
    if root is None:
        # No edges at all (e.g. a single-token text): nothing to measure.
        return graph, 0

    # One BFS from the root gives the distance to every reachable node; nodes
    # not reachable from this root are simply absent from the result.
    # dict() tolerates versions that return an iterator of (node, length).
    lengths = dict(nx.single_source_shortest_path_length(graph, root))
    return graph, max(lengths.values())


In [15]:
def get_distance_between_two_words(graph,node1,node1_index,node2,node2_index,depth):
    """Return the undirected shortest-path length between two word nodes.

    Node names are built as '<lowercased word without $>-<index>'.

    Parameters
    ----------
    graph : networkx graph
        Graph to search; a directed graph is converted to undirected first.
    node1, node2 : str
        Words identifying the two nodes.
    node1_index, node2_index : int
        Indices appended to the words to form the node names.
    depth : int
        Used for the fallback value.

    Returns
    -------
    int
        The shortest-path length, or ``10 * depth`` when either node is missing
        from the graph or no path connects them.
    """
    source = '{0}-{1}'.format(node1.lower().replace('$', ''), node1_index)
    target = '{0}-{1}'.format(node2.lower().replace('$', ''), node2_index)
    # Skip the copy when the caller already passes an undirected graph.
    undirected = graph.to_undirected() if graph.is_directed() else graph
    try:
        return nx.shortest_path_length(undirected, source=source, target=target)
    except (nx.NetworkXException, KeyError):
        # Missing node or no path: report a distance well beyond the tree depth.
        return 10 * depth

In [22]:
def get_sentence_tokens_prob(s):
    """Build one weight vector per target token of `s`.

    Targets are located with get_index_of_targets (post mode when is_post(s)
    is true, named-entity mode otherwise). For each target i, with
    p = 1 / number_of_targets, the vector holds ``1 + p`` at position i and
    ``p * exp(-d**2 / (2 * depth))`` elsewhere, where d is the dependency-graph
    distance between token i and the other token.

    Parameters
    ----------
    s : str
        Sentence whose whitespace-separated tokens are weighted.

    Returns
    -------
    list of numpy.ndarray
        One array of length ``len(s.split())`` per target, in order of target
        position. If there are no targets, a single all-zero array.
    """
    tokens = s.split()
    target_index = get_index_of_targets(s, is_post_flag=is_post(s))

    if len(target_index) == 0:
        return [np.zeros(len(tokens))]

    prob_each_target = 1.0 / len(target_index)
    graph, depth = get_sentence_dependency_tree(s.replace('$', ''))
    # A graph without edges reports depth 0, which would divide by zero below.
    effective_depth = max(depth, 1)
    # Convert once here instead of once per token pair.
    undirected = graph.to_undirected()

    # NOTE(review): positions come from str.split(), node names from spaCy's
    # token indices; if the two tokenizations differ the lookup falls back to
    # the "not found" distance -- confirm this is acceptable.
    s_prob_vectors = []
    for i in target_index:
        sentence_prob = np.zeros(len(tokens))
        for j in range(len(tokens)):
            if j == i:
                sentence_prob[j] = 1 + prob_each_target
            else:
                distance = get_distance_between_two_words(
                    undirected, tokens[i], i, tokens[j], j, effective_depth)
                sentence_prob[j] = prob_each_target * exp(
                    -((distance ** 2) / (2.0 * effective_depth)))
        s_prob_vectors.append(sentence_prob)
    return s_prob_vectors

In [24]:
# count = 1
# for x in headlines_sentence:
#     print(count)
#     print(get_sentence_tokens_prob(x))
#     count+=1


In [25]:
# sentence_relation_vector

In [37]:
def renormalize_series(series):
    """Standardize `series` (subtract the mean, divide by the std) and add 1.

    Returns a list with one value per element. When the standard deviation is
    exactly 0, every element maps to 1.
    """
    center = np.mean(series)
    spread = np.std(series)
    if spread == 0:
        return [1] * len(series)
    return [(value - center) / spread + 1 for value in series]

In [38]:
# sentence_relation_vector = [renormalize_series(x) for x in sentence_relation_vector]
# sentence_relation_vector

In [41]:
def get_normalized_sentence_relation_vector(s):
    """Return the per-target vectors of `s`, each passed through renormalize_series."""
    return list(map(renormalize_series, get_sentence_tokens_prob(s)))

In [45]:
# Example: normalized relation vectors (one per detected target) for the first cleaned headline.
get_normalized_sentence_relation_vector(headlines_sentence[0])

[[3.346922282374696,
  1.1551420014842846,
  0.8694954916603032,
  0.38177165509664257,
  0.5777296546377978,
  0.38177165509664257,
  0.28716725964963474],
 [0.22610852201268328,
  0.43401599639637267,
  0.7435736486934525,
  3.372073369215156,
  1.0466388185925104,
  0.7435736486934525,
  0.43401599639637267]]

In [202]:
# post_sentence

In [203]:
# re.findall(r'\$([a-zA-Z]+)','Slowly adding some $FIO here $googl but gotta be $12 careful. This will be one of biggest winners in 2012')

In [204]:
# post_sentence = []
# post_target = []
# extracted_re = []
# for key in posts_data.keys():
#     post_sentence.append(posts_data[key]['sentence'])
#     temp = []
#     for x in posts_data[key]['info']:
#         temp.append(x['target'])
#     post_target.append(temp)
#     extracted_re.append(list(set(re.findall(r'\$([a-zA-Z_]+)',posts_data[key]['sentence']))))    

In [205]:
# len(post_sentence),len(post_target),len(extracted_re)

In [206]:
# for i in enumerate(zip(target,extracted_re)):
#     print(i)

In [207]:
# for i,a in enumerate(extracted_re):
#     if len(a)>1:
#         print(sentence[i])
#         print(target[i],a)

In [208]:
# headlines_data

In [209]:
# NNP_tokens=[]
# headlines=[]
# targets_headline = []
# for key in headlines_data.keys():
#     text = nltk.word_tokenize(headlines_data[key]['sentence'])
#     print("-------------------------------------------------------")
#     print(headlines_data[key]['sentence'])
#     headlines.append(headlines_data[key]['sentence'])
#     temp = []
#     for x in headlines_data[key]['info']:
#         temp.append(x['target'])
#     print(temp)
#     targets_headline.append(temp)
#     pos_tags = nltk.pos_tag(text)
#     for x in pos_tags:
#         if x[1] == 'NNP':
#             NNP_tokens.append(x[0])
#     print(pos_tags)

In [58]:
#generating word freq  
# nnp_tokens_freq = nltk.probability.FreqDist(NNP_tokens)
# nnp_tokens_freq.pprint(len(NNP_tokens))

In [33]:
# experimenting with NER

In [59]:
# headlines = [headlines_data[key]['sentence'] for key in headlines_data.keys()]
# targets = 

In [210]:
# ners_headlines = [get_ners(s) for s in headlines]
# ners_headlines = [(nltk.pos_tag(n)) for n in ners_headlines]
# ners_headlines

In [211]:
# for i in enumerate(zip(targets_headline,ners_headlines)):
#     print(i)

In [212]:
# headlines[284]

In [213]:
# posts_data

In [214]:
# post_target

In [215]:
# ners_post = [get_ners(s) for s in post_sentence]

In [216]:
# for i in enumerate(zip(post_target,ners_post)):
#     print(i)