# Subject Correlations & Year Counts

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string, json
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from scipy import stats
import matplotlib.pyplot as plt

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter


# Make the project-specific parser module importable.
# The directory part of a bare filename is the empty string, so the folder
# added to the search path is simply '../Scripts', resolved against the
# current working directory.
lib_path = os.path.abspath('../Scripts')
sys.path.append(lib_path)

from JQA_XML_parser import *


# Silence every warning for the rest of the session (not only deprecations).
warnings.filterwarnings(action = 'ignore')

ModuleNotFoundError: No module named 'JQA_XML_parser'

## Gather XML Files

In [None]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/Data"
files = glob.glob(abs_dir + "/PSC/JQA/*/*.xml")

len(files)

## Build Dataframe

In [None]:
%%time

# Build dataframe from XML files.
# build_dataframe() is not defined in this notebook; it is presumably provided
# by the wildcard import of JQA_XML_parser in the first cell -- TODO confirm.
df = build_dataframe(files)

# Preview the first three rows.
df.head(3)

## Count Subject Headings by Year

In [None]:
%%time

# Extract month, year from date.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'ignore')
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by month, year and count subjects.
subjects = df.groupby(['year', 'subject'], as_index = False)['subject'] \
    .size() \
    .reset_index()

subjects.columns = ['year', 'subject', 'count']

subjects.to_csv(abs_dir + 'GitHub/dsg-mhs/lab_space/data/subjects/subject-year-count.csv',
                sep = ',', index = False)

## Create Adjacency Matrix of Subjects

In [None]:
%%time

# Create adjacency matrix.
adj = pd.crosstab(df['entry'], df['subject'])

# Convert entry-person matrix into an adjacency matrix of persons.
adj = adj.T.dot(adj)

# Change same-same connections to zero.
np.fill_diagonal(adj.values, 0)

# Simple correlation matrix from dataframe.
adj = adj.corr()

adj

In [None]:
%time

sns.clustermap(adj, z_score = 1)

plt.figure(figsize=(16, 12))

## Convert Subject Adjacency Matrix to Edge List

In [None]:
%%time

adj['source'] = adj.index

df = pd.melt(adj, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .query('(source != target) & (weight > 0.65)')

df

## Create Graph Object

In [None]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
nodes = pd.DataFrame(nodes, columns = ['source'])
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')


# Find communities.
communities = community.greedy_modularity_communities(G)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

## Write Graph Object

In [None]:
%%time


# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)
    
data_json = json.dumps(data)

with open(abs_dir + "Data/Output/Graphs/JQA_Network_correlation/jqa-subjects-network.json", "w") as f:
    f.write(data_json)