Chris Tanasescu (Margento): Networks of Texts (II)

In [None]:
import os
#The OS module in Python provides a way of using operating system dependent functionality. 
#The functions that the OS module provides allows you to interface with the underlying operating system 
#that Python is running on – be that Windows, Mac or Linux.

from os import listdir
from os.path import isfile, join
import nltk
import logging
#Module that records events related to the application’s operation. 
#The log record, which is created with every logging event, contains readily available diagnostic information such as 
#the file name, full path, function, and line number of the logging event.

from collections import Counter
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
#TfidfVectorizer converts a collection of raw documents into a matrix of TF-IDF features; it is part of 
#scikit-learn, a free software machine learning library for Python (tools for data mining and data analysis)

import networkx as nx
import matplotlib.pyplot as plt  
import numpy as np
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
#nltk.download('popular', halt_on_error=False)

import re
import codecs
import string
from string import punctuation

In [None]:
pwd

In [None]:
# Base list: the custom poetry stop-word file, read through NLTK's stopwords
# corpus reader (presumably the file was copied into NLTK's stopwords data
# folder -- confirm on a fresh install).
stopwords = nltk.corpus.stopwords.words('stop_words_poetry.txt')

# Corpus-specific additions: stray punctuation, digits, archaic forms and
# high-frequency function words. '...' used to be appended twice; once is enough.
# Candidates deliberately left out for now: another, give, many, without,
# first, well, often, great, even, they, more, there, your, them.
stopwords.extend([
    '...', "'d", "&",
    "upon", "also", "hath", "must", "therefore", "doth",
    "could", "would", "much", "like", "since", "though",
    "either", "shall", "what", "their",
    "’", "“", "2", "3", "”",
])

In [None]:
def _pre_clean(list_of_text):
        '''
        preliminary cleaning of the text
        - remove new line character i.e. \n or \r
        - remove tabs i.e. \t
        - remove extra spaces
        '''
        cleaned_list = []
        for text in list_of_text:
            # print("original:", text)
            text = text.replace('\\n', ' ')
            text = text.replace('\\r', ' ')
            text = text.replace('\\t', ' ')
            pattern = re.compile(r'\s+')
            text = re.sub(pattern, ' ', text)
            text = text.strip()
            # check for empty strings
            if text != '' and text is not None:
                cleaned_list.append(text)

        return cleaned_list

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
def tokenize(text):
    """Split *text* into word tokens and keep only the informative ones.

    The text is tokenized with NLTK's ``word_tokenize``, each token is passed
    through ``_pre_clean``, and tokens that are at most two characters long
    or that appear in the module-level ``stopwords`` list are discarded.
    The comparison with ``stopwords`` is case-sensitive.
    """
    cleaned = _pre_clean(word_tokenize(text))
    return [
        tok for tok in cleaned
        if len(tok) > 2 and tok not in stopwords
    ]

In [None]:
# Corpus location: the anthology folder is expected under the directory the
# notebook was started from.
HOME = os.getcwd()
TEXTS_DIR = HOME + "/US_Poets_Anthology2/"

#TEXTS_DIR = HOME

# filelabels maps a document index to its file name; texts_data holds the
# token list of each document in the same order.
filelabels = {}
texts_data = []

# Only regular files count as documents (sub-directories are ignored).
files = [
    entry for entry in os.listdir(TEXTS_DIR)
    if os.path.isfile(os.path.join(TEXTS_DIR, entry))
]

import string
from string import punctuation

# Translation table mapping every ASCII punctuation code point to None,
# i.e. str.translate() deletes those characters.
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

# Every token of the corpus, in reading order.
tokens_total = []

# Files are opened by bare name below, so the corpus folder must be current.
os.chdir(TEXTS_DIR)

count = -1
for count, f in enumerate(files):
    with open(f, "r", encoding='utf-8', errors = 'ignore') as openf:
        filelabels[count] = os.path.basename(openf.name)
        tokens = []
        for line in openf:
            for sentence in nltk.sent_tokenize(line):
                # Strip punctuation inside each token, drop tokens that
                # became empty, then lower-case what is left.
                stripped = [item.translate(remove_punct_map)
                            for item in tokenize(sentence)]
                tokens.extend(piece.lower() for piece in stripped if piece != "")
    tokens_total.extend(tokens)
    texts_data.append(tokens)

print(filelabels)

In [None]:
count = Counter(tokens_total)

In [None]:
len(list(count))

In [None]:
#print(count) It would be a lot to print

import pyperclip as clip

In [None]:
clip.copy(f"{count}")

In [None]:
# Command+V into a page/word/txt file [or clip.paste() to print it here, but in this case it is too large a list to print]

In [None]:
stopwords.extend(['a', 'like', 'you', 'they', 'he', 'be', 'it', 'your', 'her', 'of', 'more', 'there', 'no', 'not', '’', 'what', 'my', 'his', 'she', 'to', 'our', 'me', 'we', 'in', 'can', 'us', 'an', 'if', 'do', 'this', '”', 'because', 'who', 'hand', 'but', 'him'])

In [None]:
# Re-filter the corpus-wide token list against the stop-word list as it stands now.
tokens_total = list(filter(lambda tok: tok not in stopwords, tokens_total))

In [None]:
count = Counter(tokens_total)

In [None]:
len(list(count))

In [None]:
print(count)

In [None]:

for i in range(len(filelabels)):
    print(len(texts_data[i]))

In [None]:
# Apply the current stop-word list to each document's token list as well.
for i in range(len(filelabels)):
    kept = list(filter(lambda tok: tok not in stopwords, texts_data[i]))
    texts_data[i] = kept

In [None]:
for i in range(len(filelabels)):
    print(len(texts_data[i]))

In [None]:
def get_documents(path):
    """Read every regular file in *path* and return its cleaned lines.

    Each file is decoded as UTF-8 (undecodable bytes are ignored), split into
    lines and passed through ``_pre_clean``.

    Side effects:
        - the module-level ``filelabels`` dict is updated so that
          ``filelabels[i]`` is the base name of the file whose lines are at
          position ``i`` of the returned list;
        - the working directory is changed to *path* (kept so that existing
          callers relying on it are not affected).

    Args:
        path: directory holding the text files; relative or absolute.

    Returns:
        A list with one entry per file, each entry being the list of that
        file's cleaned, non-empty lines.
    """
    # Resolve before chdir: a relative path would otherwise be re-interpreted
    # against the new working directory by listdir()/isfile() and fail.
    path = os.path.abspath(path)
    os.chdir(path)
    names = [f for f in listdir(path) if isfile(join(path, f))]
    texts = []
    for count, f in enumerate(names):
        # Open through the full path so reading does not depend on the cwd.
        with codecs.open(join(path, f), "r", encoding='utf-8', errors = 'ignore') as openf:
            filelabels[count] = os.path.basename(openf.name)
            texts.append(_pre_clean(openf.read().splitlines()))
    return texts

In [None]:
documents = get_documents(TEXTS_DIR)

In [None]:
exclude = set(punctuation)

In [None]:
# Turn each document (a list of lines) into a single lower-cased string with
# ASCII punctuation removed; lines are separated by single spaces.
exclude = set(string.punctuation)
new_documents = []

for document in documents:
    # The leading empty piece reproduces the space that precedes the first
    # line of every non-empty document.
    pieces = [""]
    for string_ in document:
        unpunctuated = ''.join(ch for ch in string_ if ch not in exclude)
        pieces.append(unpunctuated.lower())
    new_documents.append(" ".join(pieces))
     

In [None]:
print(documents[36])

In [None]:
print(new_documents[36])

In [None]:
print(documents[33])

In [None]:
print(new_documents[33])

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

In [None]:

# The corpus is handed to fit_transform(), not to the constructor: the
# constructor's first parameter is `input` ('filename', 'file' or 'content'),
# so passing the document list there is rejected by recent scikit-learn.
vectorizer = CountVectorizer(stop_words = stopwords)

In [None]:
X = vectorizer.fit_transform(new_documents)

In [None]:
X.shape

In [None]:
# Dense document-term count matrix (one row per document).
Y = X.toarray()

from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF weighting of the same documents, with the same stop-word list.
tf_idf_vect = TfidfVectorizer(stop_words = stopwords)

tfidf = tf_idf_vect.fit_transform(new_documents)

#print(type(tfidf))

# Dense TF-IDF matrix (one row per document).
W = tfidf.toarray()

#print(type(W))

# Document-by-document similarity: dot products of the TF-IDF rows.
# `@` is matrix multiplication for every SciPy sparse type.
similarity_matrix = (tfidf @ tfidf.T).toarray()

import networkx as nx

# Build the graph explicitly instead of through nx.from_numpy_matrix, which
# was removed in NetworkX 3.0. One node per document; one edge per non-zero
# similarity, stored under the 'correlation' attribute. Self-loops (i == j)
# are still created here, as before, and removed in the next cell.
n_docs = similarity_matrix.shape[0]
G = nx.Graph()
G.add_nodes_from(range(n_docs))
G.add_weighted_edges_from(
    (
        (i, j, float(similarity_matrix[i, j]))
        for i in range(n_docs)
        for j in range(i, n_docs)
        if similarity_matrix[i, j] != 0
    ),
    weight='correlation',
)

# Edge weights in the order G.edges() yields them.
weights = [G[u][v]['correlation'] for u, v in G.edges()]


In [None]:
# Remove each node's edge to itself so that only links between different
# documents remain.
e = list(zip(G.nodes(), G.nodes()))
G.remove_edges_from(e)


In [None]:
def draw_graph(G):
    """Draw *G* with a spring layout, edge widths proportional to 'correlation'.

    Every edge of *G* must carry a numeric ``'correlation'`` attribute. Each
    edge width is ``400 * weight / sum(all weights)``; if the weights sum to
    zero, every edge is drawn with width 1.0 instead of dividing by zero.

    Args:
        G: a NetworkX graph whose edges have a 'correlation' attribute.

    Returns:
        None. The figure is created on the current matplotlib backend.
    """
    # Freeze the edge order once so widths and edges stay aligned.
    edge_list = list(G.edges())
    weights = [G[u][v]['correlation'] for u, v in edge_list]
    total = sum(weights)
    if total:
        normalized_weights = [400 * weight / total for weight in weights]
    else:
        normalized_weights = [1.0] * len(weights)

    fig, ax = plt.subplots(figsize=(25, 16))
    pos = nx.spring_layout(G)
    nx.draw_networkx(
        G,
        pos,
        ax=ax,                 # draw on the figure created above
        edgelist=edge_list,    # `edges=` is not a draw_networkx keyword
        width=normalized_weights,
        with_labels=True,
        node_size=70,
        node_color='r',
        alpha=1,
        font_size=22
    )
    #plt.show()
    return

In [None]:
draw_graph(G)

In [None]:
# Vocabulary of the count vectorizer: Z[i] is the token behind column i of X / Y.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old call on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    Z = vectorizer.get_feature_names_out().tolist()
else:
    Z = vectorizer.get_feature_names()

In [None]:
print(Z)


Count will print the tokens and their number of occurrences throughout the corpus

In [None]:
print(count)

In [None]:
# NOTE(review): `words` is not assigned in any of the cells shown, so this
# raises NameError as written; presumably the vocabulary list `Z` was meant -- confirm.
len(words)

In [None]:
Y.shape

In [None]:
len(list(count))  #tokens counter

In [None]:
X.shape


So the dimensions of X and Y are the same. What is the difference between them?

Les dimensions de X et Y sont donc les mêmes. Quelle est la différence entre elles ?


Also, why is there a difference between the dimension of Y and that of the list of tokens?  [doc vs new_doc?]

Par ailleurs, pourquoi y a-t-il une différence entre la dimension de Y et celle de la liste de tokens ? [doc vs new_doc ?]


How can we find out which word corresponds to a certain index in X or Y?

Comment savoir quel mot correspond à un certain indice dans X ou Y ?

In [None]:
len(Z)

In [None]:
Z[0]

In [None]:
Z[7736]

In [None]:
Z[8001]


How can we find out which indices are not zero for a certain document 
(and thus which of the whole huge set of tokens occur in that document)?

Comment pouvons-nous savoir quels indices ne sont pas nuls pour un certain document 
(et donc lesquels de l'énorme ensemble de tokens apparaissent dans ce document) ?

In [None]:

# Column indices with a non-zero count in row 0 of Y (document / node 0).
# The number of columns is read from Y itself instead of the hard-coded 22961,
# so the cell keeps working when the corpus or the stop-word list changes.
ind0 = [i for i in range(Y.shape[1]) if Y[0][i] != 0]
j = len(ind0)
print(j) #how many occurrence values are not zero in node number 0
print(ind0) #indices of tokens whose occurrence value is different from 0


Can we track down which is which (which word/token corresponds to which index) in the list above? Let us first do one more example. 

Pouvons-nous déterminer qui est qui (quel mot/token correspond à quel index) dans la liste ci-dessus ? Prenons d'abord un autre exemple. 

In [None]:
Degrees = G.degree()
Sorted_degrees = sorted(Degrees, key = lambda t: t[1], reverse = True)
Sorted_degrees[0]

In [None]:
# Column indices with a non-zero count in row 4 of Y (document / node 4).
# The number of columns comes from Y rather than the hard-coded 22961.
ind4 = [i for i in range(Y.shape[1]) if Y[4][i] != 0]
j = len(ind4)
print(j) #how many occurrence values are not zero in node number 4
print(ind4) #indices of tokens whose occurrence value is different from 0


We carried out the computation for the document (node) with the highest degree and for another, arbitrarily chosen one.

How about the node with the lowest degree?

Nous avons effectué le calcul pour le document (nœud) ayant le plus haut degré et pour un autre nœud aléatoire.

Qu'en est-il du nœud avec le plus petit degré ?

In [None]:
# Sorted_degrees is in descending order, so its last entry is the
# lowest-degree node whatever the number of documents (no hard-coded 52).
Sorted_degrees[-1]

In [None]:
# Column indices with a non-zero count in row 33 of Y (document / node 33).
# The number of columns comes from Y rather than the hard-coded 22961.
ind33 = [i for i in range(Y.shape[1]) if Y[33][i] != 0]
j = len(ind33)
print(j) #how many occurrence values are not zero in node 33
print(ind33)


We can see here how having a certain position in the graph also speaks 
to the specific NLP-relevant anatomy of that respective document.

Nous pouvons voir ici comment le fait d'avoir une certaine position dans le graphe parle aussi 
de l'anatomie TAL spécifique du document en question.

#### Let us now see what are the indices appearing in both lists (ind4 and ind33). 
We need to compute therefore the intersection of the two sets representing the two lists.

#### Voyons maintenant quels sont les indices apparaissant dans les deux listes (ind4 et ind33). 
Nous devons donc calculer l'intersection des deux ensembles représentant les deux listes.

In [None]:

list(set(ind4).intersection(ind33))


What does this list (of one element) represent? What does the index in it stand for?

Que représente cette liste (d'un élément) ? Que représente l'indice qu'elle contient ?

In [None]:
for i in list(set(ind4).intersection(ind33)):
    print(Z[i])


Is this word (and any other word we look into) of equal importance to the documents it occurs in and to the corpus as a whole?

Ce mot (et tout autre mot que nous étudions) a-t-il la même importance pour les documents dans lesquels il apparaît et pour le corpus dans son ensemble ?


W is the tfidf array for the whole corpus.
W est le tableau tfidf pour l'ensemble du corpus.

In [None]:
# Column indices with a non-zero TF-IDF value in row 33 of W (document / node 33).
# The number of columns comes from W rather than the hard-coded 22961.
ind33 = [i for i in range(W.shape[1]) if W[33][i] != 0]
j = len(ind33)
print(j) #how many TFIDF values are not zero in node 33
print(ind33)


Oops, it turns out they are the same as in Y. Is that really weird?

Oups, il s'avère qu'ils sont les mêmes qu'en Y. C'est vraiment bizarre ?


Then what can we gain from examining W closely as compared to sticking only with Y?

Alors, que pouvons-nous gagner en examinant de près W plutôt que de nous en tenir uniquement à Y ?

In [None]:
# Same scan of row 33 of W, this time printing each (index, TF-IDF value) pair.
# The number of columns comes from W rather than the hard-coded 22961.
ind33 = []
for i in range(W.shape[1]):
    if W[33][i] != 0:
        print((i, W[33][i]))
        ind33.append(i)
j = len(ind33)
print(j) #how many TFIDF values are not zero in node 33

In [None]:
# (index, TF-IDF value, token) for every non-zero entry in row 33 of W.
# Z supplies the token behind each column index; the number of columns comes
# from W rather than the hard-coded 22961.
TFIDF33 = [(i, W[33][i], Z[i]) for i in range(W.shape[1]) if W[33][i] != 0]
j = len(TFIDF33)
print(TFIDF33)
print(j) #how many TFIDF values are not zero in node 33


CENTRALITIES


Closeness centrality

In a connected graph, the closeness centrality (or closeness) of a node is a measure of centrality in a network, calculated as the reciprocal of the sum of the lengths of the shortest paths between the node and all other nodes in the graph. Thus the more central a node is, the closer it is to all other nodes. 

Dans un graphe connexe, la centralité de proximité (ou proximité) d'un nœud est une mesure de la centralité dans un réseau, calculée comme l'inverse de la somme des longueurs des chemins les plus courts entre le nœud et tous les autres nœuds du graphe. Ainsi, plus un nœud est central, plus il est proche de tous les autres nœuds.

In [None]:

clo_cen = nx.closeness_centrality(G)

In [None]:

from collections import OrderedDict


Let us sort the closeness centralities.
Faisons le tri des centralités de proximité.

In [None]:

c = OrderedDict(sorted(clo_cen.items(), key=lambda t: t[1], reverse=True))

In [None]:

print("Closeness centrality for G:", c)
#print(c)

In [None]:
#Another way of doing the ordering:
import operator 
c = sorted(clo_cen.items(), key=operator.itemgetter(1), reverse=True)
print("Closeness centrality for G:", c)

In [None]:
nx.closeness_centrality(G, 4)

In [None]:
nx.closeness_centrality(G, 33)


So we have 15 nodes of maximum (closeness) centrality. Previously we found out what word occurs both in node 4 and node 33, we can now do the same kind of investigation for these 15 nodes (or for others having identical, close, or strikingly disparate centralities), thus fusing a network-analysis-based feature with an NLP/vector-informed one. What other possible investigation(s)--prompted by the vector space and network related data--could be pursued?

Nous avons donc 15 nœuds de centralité (de proximité) maximale. Auparavant, nous avons découvert quel mot se trouve à la fois dans le nœud 4 et dans le nœud 33, nous pouvons maintenant faire le même genre de recherche pour ces 15 nœuds (ou pour d'autres ayant des centralités identiques, proches ou étonnamment disparates), fusionnant ainsi une caractéristique basée sur l'analyse de réseau avec une caractéristique informée par l'espace vectoriel TAL. Quelle(s) autre(s) investigation(s) possible(s) - poussée(s) par l'espace vectoriel et les données relatives au réseau - pourrait-on mener ?


BETWEENNESS CENTRALITY

In [None]:

# NOTE(review): the name `weight` is not bound in any of the cells shown, so this
# call raises NameError as written; the text that follows treats the failure as
# a deliberate step before the corrected call with weight="correlation".
bet_cen = nx.betweenness_centrality(G, weight = weight)


Ooops, it turns out we have not defined the weights yet. Did that affect our closeness centrality output?

Oups, il s'avère que nous n'avons pas encore défini les poids. Cela a-t-il affecté notre résultat de centralité de proximité ?


Let us define weight as an attribute.
Définissons le poids comme un attribut.

In [None]:

weights = [(G[tpl[0]][tpl[1]]['correlation']) for tpl in G.edges()]

In [None]:
print(weights)

In [None]:
G[4][33]['correlation']


In graph theory, betweenness centrality is a measure of centrality in a graph based on shortest paths.
Betweenness centrality measures the extent to which a vertex lies on paths between other vertices. Vertices with high betweenness may have considerable influence within a network by virtue of their control over information passing between others. They are also the ones whose removal from the network will most disrupt communications between other vertices because they lie on the largest number of paths taken by messages.

En théorie des graphes, la centralité d'intermédiarité est une mesure de la centralité dans un graphe basée sur les chemins les plus courts.
La centralité d'intermédiarité évalue dans quelle mesure un sommet se trouve sur des chemins entre d'autres sommets. Les sommets ayant une forte centralité d'intermédiarité peuvent avoir une influence considérable au sein d'un réseau en raison du contrôle qu'ils exercent sur les informations passant entre les autres. Ils sont également ceux dont la suppression du réseau perturbera le plus les communications entre les autres sommets, car ils se trouvent sur le plus grand nombre de chemins empruntés par les messages.

In [None]:

# NOTE(review): NetworkX reads the `weight` attribute as a distance when computing
# weighted shortest paths. 'correlation' holds the TF-IDF similarity between two
# documents, so more similar documents are treated as farther apart -- confirm
# this is intended (or convert similarity to a distance first).
bet_cen = nx.betweenness_centrality(G, weight = "correlation")

In [None]:

b = OrderedDict(sorted(bet_cen.items(), key=lambda t: t[1], reverse=True)) 

In [None]:

print("Betweenness centrality for G:", b)


In [None]:
b1= list(b)
len(b1)

print(b1[1])

In [None]:
# Position of node 9 in the betweenness ordering (b1 lists node ids only).
for position, node in enumerate(b1):
    if node == 9:
        print(position)

In [None]:

bet = sorted(bet_cen.items(), key=operator.itemgetter(1), reverse=True)
print("Betweenness centrality for G:", bet)

In [None]:
bet[1][0]

In [None]:
# Position and betweenness score of node 9 in the sorted (node, score) list.
for position, (node, score) in enumerate(bet):
    if node == 9:
        print(position, score)

In [None]:
c_top = c[0:21]
print(c_top)

In [None]:
bet_top = bet[0:21]
print(bet_top)

In [None]:
c_bottom = []

n = len(c)


In [None]:
# Append the last 21 (node, score) pairs of the sorted closeness list.
c_bottom.extend(c[i] for i in range(n - 21, n))

In [None]:
print(c_bottom)

In [None]:
# Keep only the node ids from the (node, score) pairs. Slicing instead of
# indexing with range(21) still caps the result at 21 entries but no longer
# raises IndexError when either list holds fewer than 21 pairs.
bet_top_nodes = [node for node, _score in bet_top[:21]]
c_bottom_nodes = [node for node, _score in c_bottom[:21]]

In [None]:

c_bottom_set = set(c_bottom_nodes)
bet_top_set = set(bet_top_nodes)

In [None]:

intersect = c_bottom_set.intersection(bet_top_set)
print(intersect)

In [None]:
s1 = set()

In [None]:
# For every node in the overlap: its rank and score in the betweenness ordering.
for item in intersect:
    for rank, (node, score) in enumerate(bet):
        if node == item:
            print(item, rank, score)
    
        

In [None]:

# For every node in the overlap: its rank and score in the closeness ordering.
for item in intersect:
    for rank, (node, score) in enumerate(c):
        if node == item:
            print(item, rank, score)


EIGENVECTOR CENTRALITY

In graph theory, eigenvector centrality (also called eigencentrality) is a measure of the influence of a node in a network. Relative scores are assigned to all nodes in the network based on the concept that connections to high-scoring nodes contribute more to the score of the node in question than equal connections to low-scoring nodes. A high eigenvector score means that a node is connected to many nodes who themselves have high scores.

Google's PageRank and the Katz centrality are variants of the eigenvector centrality.

See the math formula here https://bit.ly/2rcP3ie.

En théorie des graphes, la centralité des vecteurs propres (également appelée centralité propre) est une mesure de l'influence d'un nœud dans un réseau. Des scores relatifs sont attribués à tous les nœuds du réseau sur la base du concept selon lequel les connexions aux nœuds à score élevé contribuent davantage au score du nœud en question que des connexions égales aux nœuds à faible score. Un score de vecteur propre élevé signifie qu'un nœud est connecté à de nombreux nœuds qui ont eux-mêmes des scores élevés.

Le PageRank de Google et la centralité de Katz sont des variantes de la centralité du vecteur propre.

Voir la formule mathématique ici https://bit.ly/2rcP3ie.

In [None]:
# Eigenvector centrality of every node, listed from highest to lowest score.
eig_cen = nx.eigenvector_centrality(G)

ranked_pairs = sorted(eig_cen.items(), key=lambda pair: pair[1], reverse=True)
e = OrderedDict(ranked_pairs)

print("Eigenvector centrality for G:", e)
