In [1]:
import os
import requests
import time
import zipfile
import shutil
import xmltodict
import json
from tqdm import tqdm  # Optional: for progress tracking

In [2]:
data_url = "https://www.nsf.gov/awardsearch/download?DownloadFileName=2020&All=true"
file_name = "2020.zip"

# Download the zip file, unzip, and create the Json file

In [4]:
# Step 1: Download the file
def download_file(url, download_path):
    response = requests.get(url, stream=True)
    with open(download_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    print(f"File downloaded to {download_path}")

In [5]:
# Step 2: Move the file
def move_file(src_path, dest_folder):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    dest_path = os.path.join(dest_folder, os.path.basename(src_path))
    shutil.move(src_path, dest_path)
    print(f"File moved to {dest_path}")
    return dest_path

In [6]:
# Step 3: Unzip the file
def unzip_and_delete_downloaded_file(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"File {zip_path} unzipped to {extract_to}")
    os.remove(zip_path)
    print(f"File {zip_path} deleted")

In [7]:
# Step 4: Unify the data into a json file
def unify_file(data_path, dest_folder):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    # List to store JSON data from all XML files
    all_data = []
    
    # Loop through each file in the folder
    for xml_file_name in tqdm(os.listdir(data_path), desc="Processing XML files"):
        if xml_file_name.endswith('.xml'):
            xml_file_path = os.path.join(data_path, xml_file_name)
            
            # Read and parse the XML file
            with open(xml_file_path, 'r', encoding='utf-8') as xml_file:
                xml_content = xml_file.read()
                try:
                    # Convert XML to a dictionary
                    xml_dict = xmltodict.parse(xml_content)
                    
                    # Append the parsed XML data (as a dictionary) to the list
                    all_data.append(xml_dict)
                except Exception as e:
                    print(f"Error processing file {xml_file_name}: {e}")

    # Write the accumulated data to a single JSON file
    dest_path = os.path.join(dest_folder, 'data.json')
    with open(dest_path, 'w', encoding='utf-8') as json_file:
        json.dump(all_data, json_file, indent=4)

    print(f"File {dest_path} created")
    

In [8]:
# Automating the entire process
def automate_download_and_unzip(url, file_name):
    # Define paths
    download_folder = os.path.expanduser("~/Downloads")  # Adjust this if needed
    download_file_name = "nsf_download.zip"
    download_path = os.path.join(download_folder, file_name)
    
    data_folder = "data"  # Change this to your desired folder path
    extract_to = os.path.join(data_folder, "unzipped_files")  # Destination folder for unzipped content
    
    # Step 1: Download the file
    download_file(url, download_path)
    time.sleep(10)  # Adjust time if needed
       
    # Step 2: Move the file
    moved_path = move_file(download_path, data_folder)
    
    # Step 3: Unzip the file
    unzip_and_delete_downloaded_file(moved_path, extract_to)

    # Step 4: Unify the downloads into a Json file
    unify_file(extract_to, data_folder)

# Read the json file and create the unique abstract list

In [10]:
import json
from pydantic import BaseModel, Field, AliasPath
from typing import Optional

In [11]:
class DataModel(BaseModel):
    award_id: Optional[int] = Field(validation_alias=AliasPath('rootTag', 'Award', 'AwardID'))
    award_title: str = Field(validation_alias=AliasPath('rootTag', 'Award', 'AwardTitle'))
    abstract: Optional[str] = Field(validation_alias=AliasPath('rootTag', 'Award', 'AbstractNarration'))

In [12]:
def create_unique_abstract_list(json_file = 'data/data.json'):
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    model = [DataModel(**item).dict(exclude_none=True) for item in data]
    
    abstracts_set = {entry.get('abstract') for entry in model if entry.get('abstract') and entry.get('abstract').strip()}
    unique_abstract_list = list(abstracts_set)
    return unique_abstract_list

# Create the stopword list

In [14]:
import nltk
from nltk.stem.snowball import SnowballStemmer

In [15]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
stemmed_stopwords = [stemmer.stem(w) for w in stopwords]
stemmed_stopwords_set = {entry for entry in stemmed_stopwords}
unique_stemmed_stopwords = list(stemmed_stopwords_set)

# Preprocessing the abstracts

In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [18]:
def preprocess_text(text, stop_words):
    lemmatizer = WordNetLemmatizer()
    
    # Use regex to remove specific substrings
    text = re.sub(r'<br\s*/?>', ' ', text)  # Remove <br/> or <br>
    text = re.sub(r'&lt;br/&gt;', ' ', text)  # Remove &lt;br/&gt;
    text = re.sub(r"This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria", ' ', text)
    
    # Tokenize and lowercase
    word_tokens = word_tokenize(text.lower())
    
    # Filter out stop words before lemmatization
    filtered_before_lemmatization = [w for w in word_tokens if w.isalpha() and w not in stop_words]
    
    # Lemmatize tokens
    lemmatized_text = [lemmatizer.lemmatize(token) for token in filtered_before_lemmatization]
    
    # Filter out stop words after lemmatization
    filtered_after_lemmatization = [w for w in lemmatized_text if w not in stop_words]
    
    # Join processed words into final text
    processed_text = ' '.join(filtered_after_lemmatization)
    return processed_text

# Vectorize

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
def get_tfidf_matrix(n_features, text):
    vectorizer = TfidfVectorizer(max_features=1000)
    matrix  = vectorizer.fit_transform(preprocessed_abstracts)
    return matrix

# Clustering

In [23]:
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE

In [24]:
def get_clusters(tfidf_matrix, n_components, num_clusters):
    svd = TruncatedSVD(n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    
    X = lsa.fit_transform(tfidf_matrix)
    
    # Using TSNE to reduce dimensionality to 2 for visualization
    X = TSNE(n_components=2).fit_transform(X)

    clusterer = KMeans(n_clusters=num_clusters)
    cluster_labels = clusterer.fit_predict(X)
    
    clustered_data = {}
    for i, label in enumerate(cluster_labels):
        if label not in clustered_data:
            clustered_data[label] = []
        clustered_data[label].append(preprocessed_abstracts[i])
    
    return clustered_data

# Get top words for each cluster

In [26]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
def get_top_words(cluster_texts, n_top_words):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(cluster_texts)
    word_counts = np.asarray(X.sum(axis=0)).flatten()  # Get word counts across all texts
    vocab = vectorizer.get_feature_names_out()  # List of words (vocabulary)
    
    # Get top words
    word_freq = [(word, word_counts[idx]) for idx, word in enumerate(vocab)]
    top_words = sorted(word_freq, key=lambda x: x[1], reverse=True)[:n_top_words]
    
    return top_words

# Main

In [29]:
import numpy as np

In [30]:
json_file = 'data/data.json'

custom_stop_words = ['project', 'data', 'research', 'student', 'award', 'program', 'using', 'impact', 'new', 'support', 'nsf', 'foundation', 'study', 'science', 'use', 'develop', 'development']
stop_words = set(stopwords.words('english')).union(custom_stop_words)

n_features = 1000

n_components = 100
num_clusters = 5

cluster_keywords = 20

In [31]:
# Download the zip file, unzip, and create the Json file
if not os.path.isfile(json_file):
    automate_download_and_unzip(data_url, file_name)   

# Read the json file and create the unique abstract list
unique_abstracts = create_unique_abstract_list(json_file)
preprocessed_abstracts = [preprocess_text(a, stop_words) for a in tqdm(unique_abstracts, desc="Preprocessing the abstracts")]
tfidf_matrix = get_tfidf_matrix(n_features, preprocessed_abstracts)
clustered_data = get_clusters(tfidf_matrix, n_components, num_clusters)

for cluster_id, texts in clustered_data.items():
    print(f"\nTop {cluster_keywords} words in Cluster {cluster_id}:")
    top_words = get_top_words(texts, cluster_keywords)
    for word, freq in top_words:
        print(f"{word}: {freq}")

File downloaded to C:\Users\Eduardo Rodriguez/Downloads\2020.zip
File moved to data\2020.zip
File data\2020.zip unzipped to data\unzipped_files
File data\2020.zip deleted


Processing XML files: 100%|██████████| 13300/13300 [01:55<00:00, 115.12it/s]


File data\data.json created


Preprocessing the abstracts: 100%|██████████| 11507/11507 [00:41<00:00, 279.46it/s]



Top 20 words in Cluster 3:
model: 3303
learning: 2949
application: 2563
problem: 2416
method: 2145
algorithm: 2030
theory: 1941
design: 1857
technology: 1675
technique: 1626
tool: 1540
analysis: 1507
approach: 1437
machine: 1408
computing: 1396
work: 1393
network: 1385
information: 1354
goal: 1230
provide: 1227

Top 20 words in Cluster 1:
material: 5967
quantum: 3419
energy: 2735
property: 2403
technology: 2241
process: 2226
model: 2169
structure: 2010
understanding: 1990
application: 1897
device: 1864
high: 1857
design: 1800
field: 1775
used: 1765
chemical: 1679
undergraduate: 1574
method: 1552
fundamental: 1532
physic: 1519

Top 20 words in Cluster 2:
cell: 3215
model: 2715
water: 2586
change: 2407
understanding: 2223
process: 2101
plant: 2077
specie: 1906
protein: 1881
community: 1877
ocean: 1781
provide: 1735
climate: 1603
used: 1515
undergraduate: 1487
result: 1403
work: 1361
method: 1267
gene: 1243
training: 1232

Top 20 words in Cluster 0:
stem: 5738
learning: 3838
education: 3