In [4]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import pickle
import time
import re
import nltk
import sys
import html
import xml.sax.saxutils as saxutils
from html.parser import HTMLParser
from io import StringIO
import random
import operator

from transformers import BertTokenizer, BertModel
from transformers import DistilBertModel,DistilBertTokenizer

from scipy.spatial.distance import cosine

from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
Stem=stemmer.stem

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemm=wordnet_lemmatizer.lemmatize

import spacy
nlp = spacy.load("en_core_web_sm")

from nltk import ngrams

import dataframe_image as dfi
import math

from nltk.tokenize import TweetTokenizer    
tknzr = TweetTokenizer()

In [5]:
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [6]:
def cleaning (text):
    
    #order of lines is important
    
    text=strip_tags(text)
    #text=html.unescape(text)   # stripping or converting html entities 
    #text=saxutils.unescape(text)
    
    #convertings words that their lower and uper cases are different
    text=re.sub(" US | U\.S\. ", ' USA ', text) # before lower 
    
    #converting
    text = re.sub("“|”", '', text)  #before next lines
    text = re.sub("’|′|‘|`", "'", text)  #before next lines
    
    #removing tabs and lines
    text=re.sub('\t|\n', ' ', text)
    
    #converting lower_case
    text = text.lower() 
    
    #converting
    #text=re.sub('\$|£|€|¥|dollar|dollars|yen|yens|euros', ' money ', text)   # not euro 
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    
    #removing emoji
    text = emoji_pattern.sub(r' ', text) 

    #removing emojis and non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+|,Ä¶',' ', text)  
    
    #removeing http and https (URL)
    text = re.sub(r'(http://|https://)\S+', '', text)
    
    #removing www (URL)
    text=re.sub(r'www\.\S+', '', text)
    
    #removing targets
    text=re.sub('( |^)@\S+', '', text) 

    #removing common expressions
    '''
    text=re.sub("looking forward to|look forward to|make sure|kidding me|\
                |in my opinion|by the way,|as soon as possible|shaking my head|i don't know|I do not know|\
                |in real life|quote of the day|as far as i know|shake my head|\
                |to be honest|in other words|let me know|just kidding|hope that helps|hat tip|\
                |just like that|happy birthday|never mind|well-done|\
                |in my humble opinion|happy new year|you're welcome|you are welcome| \
                |it doesn't matter|it does not matter|i think|i wonder|do you think", ' ', text) 
    '''
    
    '''
    #convertings
    text=re.sub("can't", 'cannot', text) # before other n't 
    text=re.sub("can not ", 'cannot ', text)  
    text=re.sub("'ve",' have', text)
    text=re.sub("n't",' not', text)
    text=re.sub("'ll",' will', text)
    #text=re.sub("'d",' would', text)
    text=re.sub("'re",' are', text)
    text=re.sub("i'm",'i am', text)
    text=re.sub("&",' and ', text)
    text=re.sub(" w/ ",' with ', text)
    text=re.sub(" w/i | w/in ",' within ', text)
    text=re.sub(" w/o ",' without ', text)
    text=re.sub(" c/o ",' care of ', text)
    text=re.sub(" h/t ",' hat tip ', text)
    text=re.sub(" b/c ",' because ', text)
    text=re.sub("=",' equals to ', text)
    text=re.sub("=",' = ', text)
    text=re.sub("\+",' + ', text)
    text=re.sub("\+",' plus ', text)
    text=re.sub("united states",'usa', text)
    text=re.sub("united kingdom",'uk', text)
    text=re.sub(" the us ",' usa ', text)
    text=re.sub("start-up|start_up",'startup', text)
    text=re.sub("u\.s\.a", 'usa', text)  #try text=re.sub("u.s.a", 'usa', text) with text=substantially 
    text=re.sub("aka", 'also known as', text)     
    text=re.sub("'"," ' ", text)     
    
    text= re.sub("(\?)+", '? ',text)     
    text= re.sub("(!)+", '! ',text)     
    text= re.sub("(\.\.)+", ' ',text) 
    '''

#    text = "".join(lemmatize_sentence(text))
    
    #removing some special charachter  
#    text= re.sub("[\"\+\|\*\?\(\)\/\\\^\[\]``<>\.{}`′’‘';•«»,@:~!\=%&]+", ' ',text)  #except _ -
#    text= re.sub("[\"\“\”\+\-\|\*\?\(\)\/\\\^\[\]\.{}_`′’‘';•«,@:~!\=%&]+", ' ',text) 
    
    #removing hashtag
#    text=re.sub('#', ' ', text) 
    
    #removing numbers not attached to alphabets
    '''
    text=re.sub("(^)(\d+)?(\.)?(\d+)? ",' ',text)   #removing numer at the beginning
    text=re.sub("(\s)[0-9]?(\.)?(\d+) ",' ',text) #py6 and py9
    text= re.sub(" (\.)(\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) (\d+) (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) ", ' ',text)
    text= re.sub(" (\d+)$", ' ',text)
    '''
    #text=re.sub("\S+(\d+) ",' ',text) # alphabet+digit (attached)
    #text=re.sub(" (\d+)\S+",' ',text) # digit+alphabet (attached)
    #text=re.sub(" \S+(\d+)\S+ ",' ',text) # alphabet+digit+alphabet (attached)
    #text=re.sub("(\d+)",' ',text)  #removing any number anywhere but keeps \. for decimal numbers

    #removing space
    text=re.sub('\s+',' ',text)     
    
#    text=re.sub('(^)rt ','',text)    # if we do not want to remove stopwords

#    text= nltk.word_tokenize(text)
    #text= text.split() #sometimes

    #removing_stopwords 
    #text_without_sw = [word.lower() for word in text if word.lower() not in stopwords.words()] #very slow
#    text = [word for word in text if word not in cachedStopWords]

    #lemmatization
    #text= [ lemm(word, pos="v") for word in text]
    #text= [ lemm(word, pos="n") for word in text]
    #text= [ lemm(word, pos="a") for word in text]
    
    #stemming 
    #text = [Stem(word) for word in text]
    
#    text=' '.join(text)
#    text=re.sub("''",'''"''', text)    #since nltk.tokenize converts second " to ''"
#    text=re.sub("``",'''"''', text)   # since nltk.tokenize converts first " to " ``
    
    return text

In [7]:
def cleaningA (text):
    
    #order of lines is important
    
    text=strip_tags(text)
    #text=html.unescape(text)   # stripping or converting html entities 
    #text=saxutils.unescape(text)
    
    #convertings words that their lower and uper cases are different
    text=re.sub(" US | U\.S\. ", ' USA ', text) # before lower 
    
    #converting
    text = re.sub("“|”", '', text)  #before next lines
    text = re.sub("’|′|‘|`", "'", text)  #before next lines
    
    #removing tabs and lines
    text=re.sub('\t|\n', ' ', text)
    
    #converting lower_case
    #text = text.lower() 
    
    #converting
    #text=re.sub('\$|£|€|¥|dollar|dollars|yen|yens|euros', ' money ', text)   # not euro 
    #text=re.sub('Dollar|Dollars|Yen|Yens|Euros', ' money ', text)   # not euro 

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    
    #removing emoji
    text = emoji_pattern.sub(r' ', text) 

    #removing emojis and non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+|,Ä¶',' ', text)  
    
    #removeing http and https (URL)
    text = re.sub(r'(http://|https://)\S+', '', text)
    text = re.sub(r'(HTTP://|HTTPS://)\S+', '', text)

    #removing www (URL)
    text=re.sub(r'www\.\S+', '', text)
    text=re.sub(r'WWW\.\S+', '', text)

    #removing targets
    text=re.sub('( |^)@\S+', '', text) 

    #removing common expressions
    '''
    text=re.sub("looking forward to|look forward to|make sure|kidding me|\
                |in my opinion|by the way,|as soon as possible|shaking my head|i don't know|I do not know|\
                |in real life|quote of the day|as far as i know|shake my head|\
                |to be honest|in other words|let me know|just kidding|hope that helps|hat tip|\
                |just like that|happy birthday|never mind|well-done|\
                |in my humble opinion|happy new year|you're welcome|you are welcome| \
                |it doesn't matter|it does not matter|i think|i wonder|do you think", ' ', text)  
    '''
    '''
    #convertings
    text=re.sub("can't", 'cannot', text) # before other n't 
    text=re.sub("can not ", 'cannot ', text)  
    text=re.sub("'ve",' have', text)
    text=re.sub("n't",' not', text)
    text=re.sub("'ll",' will', text)
    #text=re.sub("'d",' would', text)
    text=re.sub("'re",' are', text)
    text=re.sub("i'm",'i am', text)
    #text=re.sub("&",' and ', text)
    text=re.sub(" w/ ",' with ', text)
    text=re.sub(" w/i | w/in ",' within ', text)
    text=re.sub(" w/o ",' without ', text)
    text=re.sub(" c/o ",' care of ', text)
    text=re.sub(" h/t ",' hat tip ', text)
    text=re.sub(" b/c ",' because ', text)
    #text=re.sub("=",' equals to ', text)
    #text=re.sub("=",' = ', text)
    #text=re.sub("\+",' + ', text)
    #text=re.sub("\+",' plus ', text)
    text=re.sub("United States",'USA', text)
    text=re.sub("United Kingdom",'UK', text)
    text=re.sub(" the US ",' USA ', text)
    text=re.sub("start-up|start_up",'startup', text)
    text=re.sub("U\.S\.A", 'USA', text)  #try text=re.sub("u.s.a", 'usa', text) with text=substantially 
    text=re.sub("aka", 'also known as', text)     
    text=re.sub("'"," ' ", text)     
    '''
    text= re.sub("(\?)+", '? ',text)     
    text= re.sub("(!)+", '! ',text)     
    text= re.sub("(\.\.)+", ' ',text)   

#    text = "".join(lemmatize_sentence(text))
    
    #removing some special charachter  
#    text= re.sub("[\"\+\|\*\?\(\)\/\\\^\[\]``<>\.{}`′’‘';•«»,@:~!\=%&]+", ' ',text)  #except _ -
#    text= re.sub("[\"\“\”\+\-\|\*\?\(\)\/\\\^\[\]\.{}_`′’‘';•«,@:~!\=%&]+", ' ',text) 
    
    #removing hashtag
#    text=re.sub('#', ' ', text)  
    
    #removing numbers not attached to alphabets
    '''
    text=re.sub("(^)(\d+)?(\.)?(\d+)? ",' ',text)   #removing numer at the beginning
    text=re.sub("(\s)[0-9]?(\.)?(\d+) ",' ',text) #py6 and py9
    text= re.sub(" (\.)(\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) (\d+) (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) (\d+) ", ' ',text)
    text= re.sub(" (\d+) ", ' ',text)
    text= re.sub(" (\d+)$", ' ',text)
    '''
    #text=re.sub("\S+(\d+) ",' ',text) # alphabet+digit (attached)
    #text=re.sub(" (\d+)\S+",' ',text) # digit+alphabet (attached)
    #text=re.sub(" \S+(\d+)\S+ ",' ',text) # alphabet+digit+alphabet (attached)
    #text=re.sub("(\d+)",' ',text)  #removing any number anywhere but keeps \. for decimal numbers

    #removing space
    text=re.sub('\s+',' ',text)     
    
    #text=re.sub('(^)RT | RT ','',text)    # if we do not want to remove stopwords

#    text= nltk.word_tokenize(text)
    #text= text.split() #sometimes

    #removing_stopwords 
    #text_without_sw = [word.lower() for word in text if word.lower() not in stopwords.words()] #very slow
#    text = [word for word in text if word not in cachedStopWords]

    #lemmatization
    #text= [ lemm(word, pos="v") for word in text]
    #text= [ lemm(word, pos="n") for word in text]
    #text= [ lemm(word, pos="a") for word in text]
    
    #stemming 
    #text = [Stem(word) for word in text]
    
#    text=' '.join(text)
#    text=re.sub("''",'''"''', text)    #since nltk.tokenize converts second " to ''"
#    text=re.sub("``",'''"''', text)   # since nltk.tokenize converts first " to " ``
    
    return text

In [8]:
#df_ent = pd.read_csv('/archives1/Datasets/TweetsWorld/ent_tweets_world.csv', delimiter='\t', na_values=".",error_bad_lines=False)#,warn_bad_lines=False)
df_ent0 = pd.read_csv('ent_tweets_world.csv', delimiter='\t', na_values=".",error_bad_lines=False)#,warn_bad_lines=False)
print(df_ent0.shape)
print(df_ent0.columns)
#print(df_ent0.head()) # Preview the first 5 lines of the loaded data 

rows_ent=list(df_ent0[['user_id', 'tweet','tweet_created_at']].itertuples(index=False, name=None)) #rows_ent0
#rows_ent= list(zip(df_ent0.user_id, df_ent.tweet))
#rows_ent=df_ent0[['user_id','tweet']].apply(tuple, axis=1) 
#df_ent = df_ent0[['user_id', 'tweet','tweet_created_at']]   

del df_ent0

print("Number of tweets in ent:",len(rows_ent))  #rows_ent0
print('Memory size of ent:',sys.getsizeof(rows_ent)) #rows_ent0

(47604376, 4)
Index(['user_id', 'tweet', 'tweet_created_at', 'location_profile'], dtype='object')
Number of tweets in ent: 47604376
Memory size of ent: 402267520


In [9]:
df_mng0 = pd.read_csv('mng_tweets_world.csv', delimiter='\t', na_values=".",error_bad_lines=False,warn_bad_lines=False)
#df_mng0 = pd.read_csv('/archives1/Datasets/TweetsWorld/mng_tweets_world.csv', delimiter='\t', na_values=".",error_bad_lines=False,warn_bad_lines=False)
print(df_mng0.shape)
print(df_mng0.columns)
#print(df_mng0.head()) # Preview the first 5 lines of the loaded data 

rows_mng=list(df_mng0[['user_id', 'tweet','tweet_created_at']].itertuples(index=False, name=None)) #rows_mng0
#rows_mng= list(zip(df_mng0.user_id, df_mng.tweet))
#rows_mng=df_mng0[['user_id','tweet']].apply(tuple, axis=1) 
#df_mng = df_mng0[['user_id', 'tweet','tweet_created_at']]

del df_mng0

print(len(rows_mng)) #rows_mng0
print('memry size of mng:', sys.getsizeof(rows_mng)) #rows_mng0

(46502302, 4)
Index(['user_id', 'tweet', 'tweet_created_at', 'location_profile'], dtype='object')
46502302
memry size of mng: 402267520


In [10]:
df_public = pd.read_csv('public_tweets_world.csv', delimiter='\t', na_values=".",error_bad_lines=False)#,warn_bad_lines=False)
#df_public = pd.read_csv('/archives1/Datasets/TweetsWorld/public_tweets_world.csv', delimiter='\t', na_values=".",error_bad_lines=False)#,warn_bad_lines=False)

print(df_public.shape)
print(df_public.columns)
#print(df_public.head()) # Preview the first 5 lines of the loaded data 

rows_public00=list(df_public[['user_id', 'tweet', 'tweet_created_at']].itertuples(index=False, name=None))
#rows_public= list(zip(df_public.user_id, df_public.tweet))
#rows_public=df_public[['user_id','tweet']].apply(tuple, axis=1) 
del df_public
print(len(rows_public00))
print('memory size of public:', sys.getsizeof(rows_public00))

(72182875, 4)
Index(['user_id', 'tweet', 'tweet_created_at', 'location_profile'], dtype='object')
72182875
memory size of public: 644355008


In [11]:
rows_public=[]
for i in rows_public00:
    if ('2021' not in i[2] ) and ('2020-12' not in i[2]) and ('2020-11' not in i[2]) and ('2020-10' not in i[2]):
        rows_public.append(i)

        
print(len(rows_public))
del rows_public00


print('memory size of public:',sys.getsizeof(rows_public))

53069041
memory size of public: 477085840


In [12]:
#tweet=' i lime the book and, book. It is good. #book '
#sentences=nltk.sent_tokenize(tweet)

#sentences

#ddd=tknzr.tokenize(sentences[1])

In [13]:
'''
N=3
for n in range(N-1):
    output = list(ngrams(ddd, n+2))
    print('------')
    print(output)
    for xx in output:
        res=' '.join(xx)
        print(res)
'''
pass

In [14]:
#rows_ent[0], cleaning(rows_ent[0][1]),cleaning(rows_ent[1][1]),cleaning(rows_ent[2][1])

In [15]:
with open("ent_filtering.txt", "rb") as fp:   
    ent_filter = pickle.load(fp)

In [16]:
N=10 # number of grams
year='2014'
ontology_ent=dict()

t0=time.time()
n_e=0
for i, tweet_inf in enumerate(rows_ent):
    if year in tweet_inf[2]:
        if str(tweet_inf[0]) not in ent_filter:
        
            text=tweet_inf[1]
            tweet=cleaning(text)
            #print(tweet)
            #print('----')
            if (tweet !='') and  (tweet!=' ') and (tweet !=[]):
                n_e +=1
            
            all_sentences=nltk.sent_tokenize(tweet)
            for sentence in all_sentences:
                sentence= re.sub("[\"\“\”\.\+\-\|\*\?\(\)\/\\\^\[\]{}`′’‘;•«,@:~!\=%&]+", ' ',sentence) 
                #print(sentence)
                #print('+++++++++')
                words=tknzr.tokenize(sentence)
                #words=sentence.split()
                for word in words:
                    #ontology_words.setdefault(word, []).append(i)
                
                    if word in ontology_ent:
                        ontology_ent[word].add(i)
                    else:
                        ontology_ent[word]={i}
                
                for n in range(N-1):
                    outputs = list(ngrams(words, n+2))
                
                    for out in outputs:
                        #print(out)
                        result=' '.join(out)
                        #print(result)
                        #print('-----------')
                        #ontology_ngram.setdefault(result, []).append(i)
                    
                        if result in ontology_ent:
                            ontology_ent[result].add(i)
                        else:
                            ontology_ent[result]={i}
                    
print(time.time()-t0)                                

621.1797671318054


In [17]:
n_e

867005

In [18]:
len(ontology_ent)

36894844

In [19]:
'''
ontology_ent_new=ontology_ent

ent_1=[]
for key ,value in ontology_ent_new.items():
    if len(value)==1:
        ent_1.append(key) 

for key in ent_1:
    del ontology_ent_new[key]

len_ent=len(ontology_ent_new)
len_ent
'''
pass

In [20]:
ent_density=dict(sorted(ontology_ent.items(), key=lambda x: len(x[1]), reverse=True))
ent_density=dict((k, len(v)/n_e) for k,v in ent_density.items())

In [21]:
with open("mng_filtering.txt", "rb") as fp:   
    mng_filter = pickle.load(fp)

In [22]:
N=10 # number of grams
year='2014'
ontology_mng=dict()

t0=time.time()
n_m=0
for i, tweet_inf in enumerate(rows_mng):
    #print(tweet_inf[0])
    if year in tweet_inf[2]:
        if str(tweet_inf[0]) not in mng_filter:
        
            text=tweet_inf[1]
            tweet=cleaning(text)
      
            if (tweet !='') and  (tweet!=' ') and (tweet !=[]):
                n_m +=1
            
            all_sentences=nltk.sent_tokenize(tweet)
            for sentence in all_sentences:
                sentence= re.sub("[\"\“\”\.\+\-\|\*\?\(\)\/\\\^\[\]{}`′’‘;•«,@:~!\=%&]+", ' ',sentence) 
                words=tknzr.tokenize(sentence)
                #words=sentence.split()
                for word in words:
                    #ontology_words.setdefault(word, []).append(i)
                
                    if word in ontology_mng:
                        ontology_mng[word].add(i)
                    else:
                        ontology_mng[word]={i}
                
                for n in range(N-1):
                    outputs = list(ngrams(words, n+2))
                
                    for out in outputs:
                        result=' '.join(out)
                        #ontology_ngram.setdefault(result, []).append(i)
                    
                        if result in ontology_mng:
                            ontology_mng[result].add(i)
                        else:
                            ontology_mng[result]={i}
                    
print(time.time()-t0)                                

980.6644105911255


In [23]:
n_m

806873

In [24]:
len(ontology_mng)

36534335

In [25]:
'''
ontology_mng_new=ontology_mng

mng_1=[]
for key ,value in ontology_mng_new.items():
    if len(value)==1:
        mng_1.append(key) 

for key in mng_1:
    del ontology_mng_new[key]

len_mng=len(ontology_mng_new)
len_mng
'''
pass

In [26]:
mng_density=dict(sorted(ontology_mng.items(), key=lambda x: len(x[1]), reverse=True))
mng_density=dict((k, len(v)/n_m) for k,v in mng_density.items())

In [27]:
with open("public_filtering.txt", "rb") as fp:   
    public_filter = pickle.load(fp)

In [28]:
N=10 # number of grams
year='2014'
ontology_public=dict()

t0=time.time()
n_p=0
for i, tweet_inf in enumerate(rows_public):
    if year in tweet_inf[2]:
        if str(tweet_inf[0]) not in public_filter:

            text=tweet_inf[1]
            tweet=cleaning(text)
        
            if (tweet !='') and  (tweet!=' ') and (tweet !=[]):
                n_p +=1
        
            all_sentences=nltk.sent_tokenize(tweet)
            for sentence in all_sentences:
                sentence= re.sub("[\"\“\”\.\+\-\|\*\?\(\)\/\\\^\[\]{}`′’‘;•«,@:~!\=%&]+", ' ',sentence) 
                words=tknzr.tokenize(sentence)
                #words=sentence.split()
                for word in words:
                    #ontology_words.setdefault(word, []).append(i)
                
                    if word in ontology_public:
                        ontology_public[word].add(i)
                    else:
                        ontology_public[word]={i}
                
                for n in range(N-1):
                    outputs = list(ngrams(words, n+2))
                
                    for out in outputs:
                        result=' '.join(out)
                        #ontology_ngram.setdefault(result, []).append(i)
                    
                        if result in ontology_public:
                            ontology_public[result].add(i)
                        else:
                            ontology_public[result]={i}
                    
print(time.time()-t0)                                

996.3038105964661


In [29]:
n_p

363278

In [30]:
len(ontology_public)

13455921

In [31]:
'''
ontology_public_new=ontology_public

public_1=[]
for key ,value in ontology_public_new.items():
    if len(value)==1:
        public_1.append(key) 

for key in public_1:
    del ontology_public_new[key]

len_public=len(ontology_public_new)
len_public
'''
pass

In [32]:
public_density=dict(sorted(ontology_public.items(), key=lambda x: len(x[1]), reverse=True))
public_density=dict((k, len(v)/n_p) for k,v in public_density.items())

In [33]:
epsilon_p = 0.5/n_p
ent_public_ratio=dict()
for key, value in ent_density.items():
    if key in public_density:
        ent_public_ratio[key]= math.log10(value/public_density[key])
    else:
        ent_public_ratio[key]= math.log10(value/epsilon_p)

In [34]:
ent_public_ratio=dict(sorted(ent_public_ratio.items(), key=lambda x: x[1], reverse=True))
ent_public_ratio_list=sorted(ent_public_ratio.items(), key=lambda x: x[1], reverse=True)

In [35]:
ent_public_ratio_list[0:20]

[('for gemini', 3.579345693549572),
 ('more for gemini', 3.579345693549572),
 ('for pisces', 3.5001644475019473),
 ('more for pisces', 3.5001644475019473),
 ('for scorpio', 3.2609063825628826),
 ('more for scorpio', 3.2609063825628826),
 ('#wearethepeople', 3.2031424715483783),
 ('#tlt', 3.107938922354339),
 ('#kyadillikyalahore', 2.7769457033129146),
 ('#x0024', 2.770202816556564),
 ('inceptive', 2.752551264367765),
 ('inceptive solutions', 2.7506167645905655),
 ('bubblews', 2.7334800095318244),
 ('#irishbizparty', 2.732807206172008),
 ('bubblews via', 2.726704607185154),
 ('added this', 2.7099989136823015),
 ('added this to', 2.7099989136823015),
 ('closet on', 2.7092887017792946),
 ('just added this', 2.7092887017792946),
 ('just added this to', 2.7092887017792946)]

In [36]:
epsilon_p = 0.5/n_p
mng_public_ratio=dict()
for key, value in mng_density.items():
    if key in public_density:
        mng_public_ratio[key]= math.log10(value/public_density[key])
    else:
        mng_public_ratio[key]= math.log10(value/epsilon_p)

In [37]:
mng_public_ratio=dict(sorted(mng_public_ratio.items(), key=lambda x: x[1], reverse=True))
mng_public_ratio_list=sorted(mng_public_ratio.items(), key=lambda x: x[1], reverse=True)

In [38]:
mng_public_ratio_list[0:20]

[('for scorpio', 3.405020919881844),
 ('more for scorpio', 3.404866996618881),
 ('chick on', 3.0640424573679015),
 ('her username is', 3.0633670381308282),
 ('kik wants', 3.0630289341963493),
 ('chick on kik', 3.0630289341963493),
 ('on kik wants', 3.0630289341963493),
 ('kik wants to', 3.0630289341963493),
 ('wants to chat', 3.0630289341963493),
 ('you her username', 3.0630289341963493),
 ('chick on kik wants', 3.0630289341963493),
 ('on kik wants to', 3.0630289341963493),
 ('kik wants to chat', 3.0630289341963493),
 ('wants to chat with', 3.0630289341963493),
 ('chat with you her', 3.0630289341963493),
 ('with you her username', 3.0630289341963493),
 ('you her username is', 3.0630289341963493),
 ('chick on kik wants to', 3.0630289341963493),
 ('on kik wants to chat', 3.0630289341963493),
 ('kik wants to chat with', 3.0630289341963493)]

In [39]:
epsilon_m = 0.5/n_p
ent_mng_ratio=dict()
for key, value in ent_density.items():
    if key in mng_density:
        ent_mng_ratio[key]= math.log10(value/mng_density[key])
    else:
        ent_mng_ratio[key]= math.log10(value/epsilon_p)

In [40]:
ent_mng_ratio=dict(sorted(ent_mng_ratio.items(), key=lambda x: x[1], reverse=True))
ent_mng_ratio_list=sorted(ent_mng_ratio.items(), key=lambda x: x[1], reverse=True)

In [41]:
ent_mng_ratio_list[0:20]

[('#wearethepeople', 3.2031424715483783),
 ('#tlt', 3.107938922354339),
 ('#irishbizparty', 2.7783432957084933),
 ('#kyadillikyalahore', 2.7769457033129146),
 ('#x0024', 2.770202816556564),
 ('closet on', 2.7548247913157797),
 ('inceptive', 2.752551264367765),
 ('inceptive solutions', 2.7506167645905655),
 ('poshmark', 2.7085773265475073),
 ('to my closet', 2.7085773265475073),
 ('my closet on', 2.7085773265475073),
 ('on poshmark', 2.7078647841696157),
 ('closet on poshmark', 2.7078647841696157),
 ('i just added this', 2.7078647841696157),
 ('this to my closet', 2.7078647841696157),
 ('to my closet on', 2.7078647841696157),
 ('my closet on poshmark', 2.7078647841696157),
 ('i just added this to', 2.7078647841696157),
 ('added this to my closet', 2.7078647841696157),
 ('this to my closet on', 2.7078647841696157)]

In [42]:
epsilon_e = 0.5/n_p
mng_ent_ratio=dict()
for key, value in mng_density.items():
    if key in ent_density:
        mng_ent_ratio[key]= math.log10(value/ent_density[key])
    else:
        mng_ent_ratio[key]= math.log10(value/epsilon_p)

In [43]:
mng_ent_ratio=dict(sorted(mng_ent_ratio.items(), key=lambda x: x[1], reverse=True))
mng_ent_ratio_list=sorted(mng_ent_ratio.items(), key=lambda x: x[1], reverse=True)

In [44]:
mng_ent_ratio_list[0:20]

[('her username', 3.140119546594088),
 ('her username is', 3.140119546594088),
 ('kik wants', 3.139781442659609),
 ('chick on kik', 3.139781442659609),
 ('on kik wants', 3.139781442659609),
 ('kik wants to', 3.139781442659609),
 ('with you her', 3.139781442659609),
 ('you her username', 3.139781442659609),
 ('chick on kik wants', 3.139781442659609),
 ('on kik wants to', 3.139781442659609),
 ('kik wants to chat', 3.139781442659609),
 ('chat with you her', 3.139781442659609),
 ('with you her username', 3.139781442659609),
 ('you her username is', 3.139781442659609),
 ('chick on kik wants to', 3.139781442659609),
 ('on kik wants to chat', 3.139781442659609),
 ('kik wants to chat with', 3.139781442659609),
 ('wants to chat with you', 3.139781442659609),
 ('to chat with you her', 3.139781442659609),
 ('chat with you her username', 3.139781442659609)]