In [44]:
# Imports
import json
import networkx as nx
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import matplotlib.patches as mpatches
import community as community_louvain
from sklearn.metrics import confusion_matrix
from joblib import Parallel, delayed
from netwulf import visualize
from numpy.dtypes import StringDType
import nltk
#nltk.download(book)
import re
import ast
from nltk.corpus import stopwords
from IPython.display import Image, display

These exercises are taken from Week 8
> __Exercise 1: TF-IDF and the Computational Social Science communities.__ The goal for this exercise is to find the words characterizing each of the communities of Computational Social Scientists.
> What you need for this exercise: 
>    * The assignment of each author to their network community, and the degree of each author (Week 6, Exercise 4). This can be stored in a dataframe or in two dictionaries, as you prefer.  
>    * the tokenized _abstract_ dataframe (Week 7, Exercise 2)
>
> 1. First, check out [the wikipedia page for TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). Explain in your own words the point of TF-IDF. 
>   * What does TF stand for? 
>   * What does IDF stand for?
> 2. Now, we want to find out which words are important for each *community*, so we're going to create several ***large documents, one for each community***. Each document includes all the tokens of abstracts written by members of a given community. 
>   * Consider a community _c_
>   * Find all the abstracts of papers written by a member of community _c_.
>   * Create a long array that stores all the abstract tokens 
>   * Repeat for all the communities. 
> __Note:__ Here, to ensure your code is efficient, you shall exploit ``pandas`` builtin functions, such as [``groupby.apply``](https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.GroupBy.apply.html) or [``explode``](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html).
> 3. Now, we're ready to calculate the TF for each word. Use the method of your choice to find the top 5 terms within the __top 5 communities__ (by number of authors). 
>   * Describe similarities and differences between the communities.
>   * Why are the TFs not necessarily a good description of the communities?
>   * Next, we calculate IDF for every word. 
>   * What base logarithm did you use? Is that important?
> 4. We're ready to calculate TF-IDF. Do that for the __top 9 communities__ (by number of authors). Then for each community: 
>   * List the 10 top TF words 
>   * List the 10 top TF-IDF words
>   * List the top 3 authors (by degree)
>   * Are these 10 words more descriptive of the community? If yes, what is it about IDF that makes the words more informative?



### 3.1.1:

#### Author degree and community dataframe

In [None]:
# Import the graph
# NOTE(review): `read_json_graph` is neither defined nor imported in the cells shown
# here — confirm it is available on a fresh kernel before this cell runs.
G = read_json_graph("coauthorship_network_no_nan.json")

# Print total number of authors (one graph node is counted as one author)
print(f"Total number of authors: {G.number_of_nodes()}")

In [None]:
# Compute the best partition using the Louvain algorithm.
# Louvain processes nodes in a random order, so without a fixed seed the community ids
# (and possibly the number of communities) change on every run, which would make the
# "top communities" discussed further down irreproducible.
LOUVAIN_SEED = 42
partition = community_louvain.best_partition(G, random_state=LOUVAIN_SEED)

print(f"Number of communities found by the Louvain algorithm: {len(set(partition.values()))}")

In [None]:
# Presumably reports the modularity of the Louvain partition of G.
# NOTE(review): `partition_modularity` is not defined in the cells shown here — confirm
# it is defined (and what it returns) before this cell runs on a fresh kernel.
partition_modularity(G, partition)

In a dataframe, we now store the relevant data for each author entry, that is `degree` and `community` for each `author`:

In [None]:
# Node -> degree and node -> community id lookups.
G_degree = dict(G.degree())
G_community = partition

# Build one row per author. Degree and community are looked up by author key instead of
# zipping `.values()` of two independent dicts, so the columns cannot silently get
# misaligned if the partition's key order ever differs from the graph's node order.
# A node missing from the partition raises a KeyError instead of producing wrong rows.
df_author_deg_com = pd.DataFrame({
    "author": list(G_degree.keys()),
    "degree": [G_degree[author] for author in G_degree],
    "community": [G_community[author] for author in G_degree],
})

df_author_deg_com

#### Tokenized abstract dataframe

Load dataframes:

In [49]:
# Load abstracts dataframe from the JSON file in the working directory.
# Later cells read its `id` and `abstract_inverted_index` columns.
abstracts_df = pd.read_json("abstracts.json")

In [50]:
df_articles = pd.read_csv("df_articles.csv")

# The CSV stores every author list as the text of a Python list literal,
# e.g. "['A5014647140', 'A5082953212']". Parse it with `ast.literal_eval` instead of
# slicing and splitting on commas: the manual split left the quote characters and the
# leading spaces inside each id (so ids could not match the graph's node names) and
# turned an empty list "[]" into [''].
df_articles["author_ids"] = df_articles["author_ids"].apply(ast.literal_eval)

To clean up the data and keep things tidy, we first make sure that every abstract has a matching entry in the articles dataframe. To avoid having to work with multiple dataframes, we then store all relevant info in a single dataframe. Finally, we remove any rows whose abstract is undefined (`NaN` or `None`):

In [51]:
# Step 1: Keep only entries in abstracts_df where 'id' is also in df_articles
abstracts_df = abstracts_df[abstracts_df["id"].isin(df_articles["id"])]

# Step 2: Merge df_articles into abstracts_df to add 'author_ids'.
# Any 'author_ids' column left over from an earlier run of this cell is dropped first;
# otherwise the merge would emit 'author_ids_x'/'author_ids_y' and later cells would no
# longer find 'author_ids'. On a first run the drop is a no-op.
abstracts_df = abstracts_df.drop(columns=["author_ids"], errors="ignore").merge(
    df_articles[["id", "author_ids"]], on="id", how="left"
)

# Step 3: Remove entries where 'abstract_inverted_index' is NaN or None
abstracts_df = abstracts_df[abstracts_df["abstract_inverted_index"].notna()]

# Re-number the rows 0..n-1 after filtering
abstracts_df = abstracts_df.reset_index(drop=True)

To know the length of each abstract, we take the largest position found in its inverted index and pre-allocate a sequence of empty strings of that length; every word is then written to its positions. We use the `map` method to apply this reconstruction to each row of the dataframe:

In [52]:
def construct_abstract(abs_inv_idx):
    """Rebuild the plain-text abstract from an inverted index.

    Parameters
    ----------
    abs_inv_idx : dict or None
        Maps each word to the list of integer positions at which it occurs,
        e.g. ``{"The": [0, 3], "cat": [1]}``.

    Returns
    -------
    str
        The words placed at their positions and joined by single spaces.
        Positions that no word claims stay empty strings, so a gap shows up as
        consecutive spaces. An empty/``None`` index, or an index whose position
        lists are all empty, yields ``""``.
    """
    if not abs_inv_idx:
        return ""

    # Largest position over all words; `default=-1` covers the case where every
    # position list is empty (the nested max() used to raise ValueError there).
    max_idx = max(
        (pos for positions in abs_inv_idx.values() for pos in positions),
        default=-1,
    )

    # A plain pre-allocated list is sufficient: the work is one assignment per
    # (word, position) pair, and it avoids requiring NumPy >= 2.0 for StringDType.
    word_list = [""] * (max_idx + 1)
    for word, positions in abs_inv_idx.items():
        for pos in positions:
            word_list[pos] = word

    return " ".join(word_list)

# Reconstruct the readable abstract of every row from its inverted index
# (the function is called once per row) and keep it in a new "text" column.
abstracts_df["text"] = abstracts_df["abstract_inverted_index"].apply(construct_abstract)

abstracts_df

Unnamed: 0,id,title,abstract_inverted_index,author_ids,text
0,https://openalex.org/W3103362336,Power-Law Distributions in Empirical Data,"{'Power-law': [0], 'distributions': [1], 'occu...","['A5014647140', 'A5082953212', 'A5067142016']",Power-law distributions occur in many situatio...
1,https://openalex.org/W2047940964,Finding community structure in very large netw...,"{'The': [0, 147], 'discovery': [1], 'and': [2,...","['A5014647140', 'A5067142016', 'A5008033989']",The discovery and analysis of community struct...
2,https://openalex.org/W2018045523,Hierarchical Organization of Modularity in Met...,"{'Spatially': [0], 'or': [1], 'chemically': [2...","['A5007285525', 'A5067021466', 'A5029755266'...",Spatially or chemically isolated functional mo...
3,https://openalex.org/W2119298903,Evaluating Online Labor Markets for Experiment...,"{'We': [0, 16, 32, 57, 69], 'examine': [1], 't...","['A5054913386', 'A5065660380', 'A5065503150']",We examine the trade-offs associated with usin...
4,https://openalex.org/W1987228002,Limits of Predictability in Human Mobility,"{'Predictable': [0], 'Travel': [1], 'Routines'...","['A5100744117', 'A5080830598', 'A5022334515'...",Predictable Travel Routines While people rarel...
...,...,...,...,...,...
10247,https://openalex.org/W2963738505,Connecting Continuum of Care point-in-time hom...,"{'In': [0], '2007,': [1], 'the': [2, 14, 17, 2...","['A5026370090', 'A5022446382', 'A5101133514']","In 2007, the Department of Housing and Urban D..."
10248,https://openalex.org/W3082044992,The selection problem for some first-order sta...,"{'Here,': [0], 'we': [1, 48, 78, 85, 103], 'st...","['A5087528940', 'A5113876236', 'A5078683541']","Here, we study the existence and the convergen..."
10249,https://openalex.org/W1972032782,Structural Characterization and Thermal Proper...,"{'1-amino-1-ethylamino-2,2-dinitroethylene': [...","['A5100440745', 'A5103142430', 'A5083702049'...","1-amino-1-ethylamino-2,2-dinitroethylene (AEFO..."
10250,https://openalex.org/W124819492,Duality Principles for Fully Nonlinear Ellipti...,"{'In': [0], 'this': [1], 'paper': [2], 'we': [...",['A5087528940'],In this paper we use duality theory to associa...


Then we tokenize the abstracts dataframe, again using a dedicated function. We also utilize a combined pattern with the OR operator to be able to make only a single method/function call for the text substitution:

In [53]:
porter = nltk.PorterStemmer() # Porter stemmer, used to reduce every kept token to its stem
stopwords_set = set(stopwords.words("english")) # a set gives constant-time membership tests

# Regex patterns (everything they match is deleted from the text, not replaced by a space)
regex_punctuation = r"[^\w\s]|_" # any character that is not a word character or whitespace, plus the underscore
regex_url = r"https?://\S+|www\.\S+" # URLs starting with http://, https:// or www.
regex_math = r"[+\-*/=<>×÷∑∞∂πθ√∫≈≠]" # math symbols and operators
# Stricter alternative (stand-alone numbers only), kept for reference:
#regex_numbers = r"\b\d+(\.\d+)?\b" # for numbers
regex_numbers = r"\d+(\.\d+)?" # digit runs (optionally with a decimal part), including digits inside alphanumeric tokens

# Combine the patterns with the OR operator "|" so one substitution call handles all of them
pattern = re.compile(r"|".join([regex_punctuation, regex_url, regex_math, regex_numbers]))

def tokenization(text):
    """Turn a raw abstract into a list of stemmed, stopword-free tokens.

    The combined regex ``pattern`` first deletes punctuation, URLs, math symbols
    and digits; the remainder is lower-cased, split with ``nltk.word_tokenize``,
    filtered against ``stopwords_set`` and stemmed with the Porter stemmer.
    Matches are deleted rather than replaced by a space, so hyphenated words
    are fused (e.g. "Power-law" becomes "powerlaw").
    """
    cleaned = pattern.sub("", text).lower()

    stems = []
    for token in nltk.word_tokenize(cleaned):
        if token in stopwords_set:
            continue  # stopwords are dropped before stemming
        stems.append(porter.stem(token))
    return stems

In [None]:
# Tokenize every reconstructed abstract and keep the token lists in a new "tokens" column
abstracts_df["tokens"] = abstracts_df["text"].map(tokenization)

abstracts_df

Unnamed: 0,id,title,abstract_inverted_index,author_ids,text,tokens
0,https://openalex.org/W3103362336,Power-Law Distributions in Empirical Data,"{'Power-law': [0], 'distributions': [1], 'occu...","['A5014647140', 'A5082953212', 'A5067142016']",Power-law distributions occur in many situatio...,"[powerlaw, distribut, occur, mani, situat, sci..."
1,https://openalex.org/W2047940964,Finding community structure in very large netw...,"{'The': [0, 147], 'discovery': [1], 'and': [2,...","['A5014647140', 'A5067142016', 'A5008033989']",The discovery and analysis of community struct...,"[discoveri, analysi, commun, structur, network..."
2,https://openalex.org/W2018045523,Hierarchical Organization of Modularity in Met...,"{'Spatially': [0], 'or': [1], 'chemically': [2...","['A5007285525', 'A5067021466', 'A5029755266'...",Spatially or chemically isolated functional mo...,"[spatial, chemic, isol, function, modul, compo..."
3,https://openalex.org/W2119298903,Evaluating Online Labor Markets for Experiment...,"{'We': [0, 16, 32, 57, 69], 'examine': [1], 't...","['A5054913386', 'A5065660380', 'A5065503150']",We examine the trade-offs associated with usin...,"[examin, tradeoff, associ, use, amazoncom, mec..."
4,https://openalex.org/W1987228002,Limits of Predictability in Human Mobility,"{'Predictable': [0], 'Travel': [1], 'Routines'...","['A5100744117', 'A5080830598', 'A5022334515'...",Predictable Travel Routines While people rarel...,"[predict, travel, routin, peopl, rare, perceiv..."
...,...,...,...,...,...,...
10247,https://openalex.org/W2963738505,Connecting Continuum of Care point-in-time hom...,"{'In': [0], '2007,': [1], 'the': [2, 14, 17, 2...","['A5026370090', 'A5022446382', 'A5101133514']","In 2007, the Department of Housing and Urban D...","[depart, hous, urban, develop, initi, pointint..."
10248,https://openalex.org/W3082044992,The selection problem for some first-order sta...,"{'Here,': [0], 'we': [1, 48, 78, 85, 103], 'st...","['A5087528940', 'A5113876236', 'A5078683541']","Here, we study the existence and the convergen...","[studi, exist, converg, solut, vanish, discoun..."
10249,https://openalex.org/W1972032782,Structural Characterization and Thermal Proper...,"{'1-amino-1-ethylamino-2,2-dinitroethylene': [...","['A5100440745', 'A5103142430', 'A5083702049'...","1-amino-1-ethylamino-2,2-dinitroethylene (AEFO...","[aminoethylaminodinitroethylen, aefox, synthes..."
10250,https://openalex.org/W124819492,Duality Principles for Fully Nonlinear Ellipti...,"{'In': [0], 'this': [1], 'paper': [2], 'we': [...",['A5087528940'],In this paper we use duality theory to associa...,"[paper, use, dualiti, theori, associ, certain,..."


#### TF-IDF

- **TF:** Term frequency, calculates the relative frequency of some term $t$ within a document $d$: $$\mathrm{tf}(t, d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}} = \frac{\text{count of term } t \text{ in document } d}{\text{total number of terms in document } d}$$
- **IDF:** Inverse document frequency, a measure of how rare a term $t$ is across all documents in a corpus $D$: $$\mathrm{idf}(t, D) = \log\left(\frac{N}{| \{ d : d \in D \text{ and } t \in d \} |}\right) = \log\left(\frac{\text{total number of documents in the corpus}}{\text{number of documents where the term appears}}\right)$$
- **TF-IDF:** The term frequency–inverse document frequency is then calculated as: $$\text{tf-idf}(t, d, D) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t, D)$$

All in all, what we get is an NLP statistic that quantifies the importance of a word in a document relative to the corpus of documents.

We note that there are other ways to compute this statistic, but the formulas were included here for an intuitive explanation.