# Confirm generic conceptual models using Social media/Academic data

In [None]:
"""Necessary import libraries"""

import os
import time
import csv
import json
import re
import pandas as pd
from itertools import groupby
import networkx as nx
from networkx.algorithms import approximation as approx
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora, models
import gensim
from google.cloud import language 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
"""Global variables"""

# When True, the later cells dump their intermediate dictionaries to text files in the working directory
debug=False #use this flag to debug as per requirement (if debug==True: print('DESIRED DEBUG VALUE'))
# number of topics requested from the LDA model for every node
n_topics = 50
# number of top words read from each LDA topic
n_words = 50
# upper bound applied to the IDF score of a theme word in the TF-IDF cell (only words scoring below it are kept)
# NOTE(review): despite its name this value is compared against IDF scores, it is not passed to CountVectorizer
max_df = 9
#input data files
# JSON file that is loaded into `tweets`
tweetDataset= './data/lemma_pdf_text.json'
# CSV file whose first two columns are read as the (source, target) of each usermap edge
mapFile='./maps/maps5.csv'

In [None]:
"""To read tweets/sentences from JSON file"""

# start from an empty collection so `tweets` is defined before the file is read
tweets = []

# 'utf-8-sig' also accepts files that start with a byte-order mark
with open(tweetDataset, mode='r', encoding='utf-8-sig') as dataset_file:
    tweets = json.loads(dataset_file.read())

# Input graph

In [None]:
"""Create a Directed usermap from a file"""

# Directed graph of the user's concept map: one edge per CSV row (column 0 -> column 1).
usermap = nx.DiGraph()
# The file is opened in a `with` block so the handle is closed once the rows are read;
# newline='' is the mode the csv module asks for when reading.
with open(mapFile, "r", newline="") as map_handle:
    for row in csv.reader(map_handle):
        # blank lines or rows with a single column cannot describe an edge, skip them
        if len(row) < 2:
            continue
        # any further column (e.g. a weight in row[2]) is ignored
        usermap.add_edge(row[0], row[1])

In [None]:
"""To view or save the Directed usermap created"""
# Figure size can be increased here when a bigger map has to be displayed
plt.rcParams['figure.figsize'] = [10, 7]  # [width, height] of plot

# Draw the graph with a Kamada-Kawai layout and store the picture on disk
usermap_draw_options = dict(
    with_labels=True,
    font_size=15,
    node_color='lightblue',
    node_size=1000,
)
nx.draw_kamada_kawai(usermap, **usermap_draw_options)
plt.savefig("./usermap.png")

![title](./img/pic1.png)

# Derivationally related forms

In [None]:
"""To fetch derivationally related forms of the nodes of usermap"""

# node label -> set holding the label, its tokens and their derivationally related forms
der_rel_form = {}
for concept in usermap.nodes():
    # the set always contains the node label itself
    related_forms = {concept}
    # a label may consist of several words, so every token is looked up on its own
    for token in word_tokenize(concept):
        related_forms.add(token)
        # WordNet defines derivationally related forms on lemmas only,
        # hence the walk synset -> lemma -> related lemma
        related_forms.update(
            related.name().lower()
            for synset in wn.synsets(token)
            for lemma in synset.lemmas()
            for related in lemma.derivationally_related_forms()
        )
    der_rel_form[concept] = related_forms

if debug == True:
    # optional dump of the node -> forms mapping
    with open('./der_rel_form.txt', 'w', encoding='utf8') as outfile:
        for concept, related_forms in der_rel_form.items():
            outfile.write(concept + ':' + str(related_forms))

![title](./img/pic2.png)

# Retrieve the non-empty sets of relevant tweets/sentences and find out which nodes were removed for being empty

In [None]:
"""To retrieve relevant tweets/sentences using derivationally related forms of the nodes of usermap"""

# Each tweet is padded with a space on both sides once, so that a keyword only
# matches when it is delimited by spaces (or sits at the start/end of the tweet).
padded_tweets = [(tweet, ' ' + tweet + ' ') for tweet in tweets]

# node -> set of tweets that contain at least one of the node's related forms
relevantTweets = {}
for node, forms in der_rel_form.items():
    relevantTweets[node] = {
        tweet
        for tweet, padded in padded_tweets
        if any((' ' + form + ' ') in padded for form in forms)
    }

# keep only the nodes for which at least one tweet was found
nonEmtyRelevantTweets = {node: found for node, found in relevantTweets.items() if found}

if debug == True:
    # optional dump of the node -> relevant tweets mapping
    with open('./nonEmtyRelevantTweets.txt', 'w', encoding='utf8') as outfile:
        for node, found in nonEmtyRelevantTweets.items():
            outfile.write(node + ':' + str(found))

# To view the empty nodes removed, if any

In [None]:
# nodes for which not a single tweet/sentence matched
nodesremoved = {node for node, matches in relevantTweets.items() if not matches}

print("nodes removed : " + str(nodesremoved))

# Retrieve total number of relevant tweets/sentences for given user data

In [None]:
"""To retrieve number of relevant tweets/sentences"""

# union of the per-node sets, so a tweet matched by several nodes is counted once
unique = set()
for matches in nonEmtyRelevantTweets.values():
    unique.update(matches)

print(len(unique))

# Retrieve number of relevant tweets corresponding to each node of user map

In [None]:
"""to retrieve the number of relevant tweets/sentences collected per node of map"""

# node -> one-element list holding the number of relevant tweets found for that node
key_to_value_lengths = {}
for node, found in nonEmtyRelevantTweets.items():
    key_to_value_lengths[node] = [len(found)]

if debug == True:
    # optional dump of the per-node counts
    with open('./key_to_value_lengths.txt', 'w', encoding='utf8') as outfile:
        for node, lengths in key_to_value_lengths.items():
            outfile.write(node + ':' + str(lengths))

![title](./img/pic3.png)

# Run LDA multicore model to fetch themes

In [None]:
"""to fetch relevant themes""" 
# node -> set of words appearing among the top `n_words` of any of the `n_topics` LDA topics
themes = {}
for node, sentences in nonEmtyRelevantTweets.items():
    # whitespace tokenisation of every relevant sentence of this node
    texts = [sentence.split() for sentence in sentences]
    # the dictionary gives every distinct token an integer id
    dictionary = corpora.Dictionary(texts)
    # bag-of-words corpus: one list of (token id, frequency) tuples per sentence
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    # NOTE(review): no random_state is passed, so the topics presumably differ from run to run - confirm
    lda = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_topics,
        chunksize=10000,
        passes=20,
        eval_every=None,
        workers=1,
        dtype=np.float64,
        iterations=400,
    )
    # collect the top words of every topic, the probabilities are not kept
    topic_words = {
        format(term)
        for topic_id in range(0, n_topics)
        for term, _probability in lda.show_topic(topic_id, n_words)
    }
    # a node only gets an entry when at least one word was produced
    if topic_words:
        themes[node] = topic_words

if debug == True:
    # optional dump of the node -> themes mapping
    with open('./ldathemes.txt', 'w', encoding='utf8') as outfile:
        for node, topic_words in themes.items():
            outfile.write(node + ':' + str(topic_words))

# Clean themes already present in der_rel_form of the nodes

In [None]:
"""to remove themes that are already present in the synonyms of key node

eg: synonyms: Obesity = corpulency, fleshiness, obesity
      themes: Obesity = food, health, heart, fleshiness, eat, obesity
   newthemes: Obesity = food, health, heart, eat
   """ 
# node -> themes of the node minus the node's own derivationally related forms
clean_themes = {}
for node, node_themes in themes.items():
    if node in der_rel_form:
        # direct lookup instead of scanning every key of der_rel_form for an equal one
        clean_themes[node] = node_themes.difference(der_rel_form[node])
    else:
        # a node without related forms keeps the empty set, as before
        clean_themes[node] = set()

if debug:
    # optional dump of the node -> cleaned themes mapping
    with open('./clean_themes.txt', 'w', encoding='utf8') as outfile:
        for node, remaining in clean_themes.items():
            outfile.write(node + ':' + str(remaining))

![title](./img/pic4.png)

# Remove non-entities from the cleaned themes

In [None]:
"""to get rid of non entities from the themes fetched""" 

# Point the Google client library at the credentials file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable (used for authentication when making the API call)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "./apikey.json"
client = language.LanguageServiceClient()
# NOTE(review): `language.types.Document` and the string values for `type`/`encoding_type`
# look like an older google-cloud-language client API - confirm against the installed version
# Entity recognition: every cleaned theme word is sent to the API as its own one-word document
# `new` maps node -> set of entity names returned for the node's theme words
# NOTE(review): none of the later cells visible in this file read `new`; they keep using `clean_themes` - confirm this is intended
new = {}
for k1 in clean_themes.keys():
    old = set()                 # entity names collected for the current node
    for keywords in clean_themes[k1]:
        document = language.types.Document(
        content=keywords,
        language='en',
        type='PLAIN_TEXT'
        )
        #To make the API call 
        response = client.analyze_entities(document=document, encoding_type='UTF32')    
        for entity in response.entities:
            #entity.name fetches the recognized entities
            old.add(entity.name)
    new[k1] = old
    # pause for 100 seconds after every node - presumably to stay below an API rate limit, TODO confirm
    time.sleep(100)

# optional dump of the node -> entities mapping
if debug==True:
    with open('./entity.txt', 'w', encoding='utf8') as outfile:
        for key in new:
            outfile.write(key+':'+str(new[key]))

# Absolute frequency of themes related keywords for each node of user map

In [None]:
"""to fetch ABSOLUTE FREQUENCY of theme related keywords from their corresponding relevant tweets set""" 

# node -> {theme keyword: number of the node's relevant tweets that contain the keyword}
# Keywords that occur in no tweet get no entry.
freq = {}
for node, keywords in clean_themes.items():
    # Look the node up directly. The previous nested key scan stored a dictionary that
    # could still belong to the previous node and only worked because both dictionaries
    # happen to share their key order.
    padded_tweets = [' ' + tweet + ' ' for tweet in nonEmtyRelevantTweets.get(node, ())]
    node_counts = {}
    for keyword in keywords:
        # the surrounding spaces make sure only space-delimited occurrences match
        needle = ' ' + keyword + ' '
        count = sum(1 for padded in padded_tweets if needle in padded)
        if count:
            node_counts[keyword] = count
    freq[node] = node_counts

if debug:
    # optional dump of the node -> keyword counts mapping
    with open('./absolutefreq.txt', 'w', encoding='utf8') as outfile:
        for node, node_counts in freq.items():
            outfile.write(node + ':' + str(node_counts))

# Most relevant themes related keywords using TF-IDF weight values

In [None]:
"""to fetch TF-IDF of most occuring keywords from relevant Tweets""" 

# node -> {theme keyword: IDF score (formatted as a string)} for the keywords scoring below `max_df`
tweet_tfidf = {}
for node, keywords in clean_themes.items():
    # A fresh dictionary per node; the previous nested key scan could store the
    # dictionary of the previous node when no equal key was found.
    tweet_weights = {}
    sentences = nonEmtyRelevantTweets.get(node)
    if sentences:
        cv = CountVectorizer()
        # convert text data into a term-frequency matrix
        data = cv.fit_transform(sentences)
        # only the fitted IDF weights are needed, not the transformed matrix
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(data)
        # get_feature_names() is gone in recent scikit-learn releases,
        # get_feature_names_out() is missing in old ones: use whichever exists
        if hasattr(cv, 'get_feature_names_out'):
            feature_names = cv.get_feature_names_out()
        else:
            feature_names = cv.get_feature_names()
        word2tfidf = zip(feature_names, tfidf_transformer.idf_)
        # ascending IDF, so the words found in most sentences come first
        for term, score in sorted(word2tfidf, key=lambda pair: pair[1]):
            if term in keywords and score < max_df:
                tweet_weights[term] = format(score)
    tweet_tfidf[node] = tweet_weights

if debug:
    # optional dump of the node -> keyword scores mapping
    with open('./tfidf.txt', 'w', encoding='utf8') as outfile:
        for node, node_weights in tweet_tfidf.items():
            outfile.write(node + ':' + str(node_weights))

# Confirm edges

In [None]:
"""to retrive final edges from comparison between TF-IDF and der_rel_form keywords"""

# node -> list of nodes whose related forms contain one of the node's TF-IDF themes
# (a target is listed once per matching theme, so it can appear several times)
final_edges = {}
for node, weights in tweet_tfidf.items():
    final_edges[node] = [
        target
        for theme in weights                          # themes kept by the TF-IDF step
        for target, forms in der_rel_form.items()     # candidate target nodes and their forms
        if theme in forms
    ]

In [None]:
"""associated map"""

# directed graph with one edge node -> target for every entry of final_edges
associated_map = nx.DiGraph()
for source, targets in final_edges.items():
    for target in targets:
        associated_map.add_edge(str(source), str(target))

In [None]:
"""confirmed map"""
# Figure size can be increased here when a bigger map has to be displayed
plt.rcParams['figure.figsize'] = [10, 7] # [width, height] of plot

csv_filename = './confirmed_map.csv'

# every edge of the user map starts out as not confirmed (red)
for source, target in usermap.edges():
    usermap[source][target]['color'] = 'red'

# number of user map edges that are also present in the associated map
count = 0

with open(csv_filename, 'w') as f:
    wtr = csv.writer(f, delimiter=',', lineterminator='\n')
    for source, target in usermap.edges():
        # direct membership test instead of comparing against every associated edge
        if associated_map.has_edge(source, target):
            count += 1
            wtr.writerow([source, target])
            usermap[source][target]['color'] = 'darkgreen'

# Built once, after all colours are final. It used to be assigned only inside the
# match branch, so drawing failed with a NameError when no edge was confirmed.
edge_color_list = [usermap[source][target]['color'] for source, target in usermap.edges()]

print(count)
nx.draw_kamada_kawai(usermap, with_labels=True, font_size=15, node_color='lightblue',
                     edge_color = edge_color_list, node_size=1000)
plt.savefig("./confirmedmap.png")

# Adjacency matrix of confirmed map

In [None]:
# Edge list for the adjacency matrix: every user map edge is listed once, and a second
# time when the associated map contains it too (the cross-tabulation then counts 1 or 2).
edge = []
for source, target in usermap.edges():
    edge.append([source, target])
    if associated_map.has_edge(source, target):
        edge.append([source, target])

In [None]:
# column 0 holds the source node and column 1 the target node of every listed edge
edge_frame = pd.DataFrame(edge)
# count how often each (source, target) pair was listed
df = pd.crosstab(edge_frame[0], edge_frame[1])
# use the union of row and column labels on both axes, so nodes that only occur
# as a source or only as a target still get a row and a column (filled with 0)
idx = df.columns.union(df.index)
df = df.reindex(index=idx, columns=idx, fill_value=0)

In [None]:
"""save the adjacency matrix in a csv

   0 = no relation, 1 = not confirmed edges, 2 = confirmed edges
   From = rows, To = columns
"""

# newline='' is what pandas asks for when it is handed an already opened text file
# (it prevents doubled line endings on some platforms). The `with` block closes the
# file, so no explicit close() is needed afterwards.
with open('./adjacencymatrix.csv', 'w', newline='') as fi:
    df.to_csv(fi, header=True)

# Extend edges

In [None]:
# Find the association rules (with their lift values) using the FP-growth implementation of PySpark 2.4.0 MLlib
# For the code, refer to: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.fpm

In [None]:
import arules1  #the rules converted into a dictionary of antecedent and consequents, kept in a .py file so that it can be imported here
arules = arules1.arules #dictionary arules = {'antecedent1':{'consequent1','consequent2'},'antecedent2':{'consequent3'}}

In [None]:
def mapToRoot(word, forms=None):
    """Map a word back to the node (root form) whose variations contain it.

    e.g. 'obese' is mapped to 'obesity' when 'obese' is one of the variations of 'obesity'.

    word  : the word to look up
    forms : optional dictionary {root form: collection of variations};
            the global `der_rel_form` is used when it is not given

    Returns the first root form (in dictionary order) whose variations contain
    `word`, or None when no root form matches.
    """
    if forms is None:
        forms = der_rel_form
    for root, variations in forms.items():   # for each root form
        if word in variations:               # the word is one of its variations
            return root
    return None
    
#Given association rules such as arules = {'obese':{'eat','strain','bias'},'dieting':{'slump'},'corona':{'feed','fleshy'},'depress':{'eater'}}
#We INDEPENDENTLY MAP each of the words back to their root form
def mapAssociationRules():
    """Translate the global `arules` into root forms.

    Returns a dictionary {root of antecedent: list of roots of its consequents}.
    Words without a root form are mapped to None (both as key and inside the
    lists); filtering them out is left to the caller.
    """
    resultMap = {}
    for antecedent, consequents in arules.items():
        # setdefault keeps the consequents already collected when several
        # antecedents share one root form (a plain assignment dropped them)
        mapped = resultMap.setdefault(mapToRoot(antecedent), [])
        for consequent in consequents:
            mapped.append(mapToRoot(consequent))
    return resultMap

In [None]:
"""arules confirmed/extended graph"""

# Drop the rules whose antecedent has no root form (None key) and, inside the
# remaining rules, the consequents that have no root form (None values)
arules_edges = {
    root: [target for target in targets if target is not None]
    for root, targets in mapAssociationRules().items()
    if root is not None
}

# directed graph with one edge antecedent -> consequent per remaining rule entry
arules_map = nx.DiGraph()
for root, targets in arules_edges.items():
    for target in targets:
        arules_map.add_edge(str(root), str(target))
print(len(arules_map.edges()),('confirmed/extended edges using arules'))

In [None]:
"""extended map"""

# number of association-rule edges that already exist in the user map
c = sum(1 for source, target in arules_map.edges() if usermap.has_edge(source, target))
print((len(arules_map.edges())-c),'extended edges using association rules')

# extended edges = edges of arules_map that the user map does not have
arules_map_edges = set(arules_map.edges())
usermap_edges = set(usermap.edges())
# sorted, so that the printed list and the CSV come out in the same order on every run
extended_edges = sorted(arules_map_edges - usermap_edges)
print(extended_edges)

# create graph using extended_edges
# (no loop variable named `edge` here: that name holds the confirmed edge list
# used by the adjacency matrix cells)
extended_map = nx.DiGraph()
extended_map.add_edges_from(extended_edges)

# store the edges in a csv file; the `with` block closes the file
csv_filename = './extended_map.csv'
with open(csv_filename, 'w') as f:
    wtr = csv.writer(f, delimiter=',', lineterminator='\n')
    for source, target in extended_map.edges():
        wtr.writerow([source, target])