# Jupyter Notebook for scraping then analyzing NPS thesis abstracts using ChatGPT 3.5

This code retrieves thesis abstracts from the Naval Postgraduate School (NPS) site and classifies each thesis into a specific category based on the content of its abstract. We first had the browser-based ChatGPT 4o build 10 separate and distinct categories to split the data into, and define them strictly enough to discourage our prompt from misclassifying. This was done iteratively, and the choice of 10 categories was arbitrary. This method is hopefully scalable to other categorizations, such as critical technology areas as defined by OUSD (R&E) or Joint Warfighting Functions. It will hopefully be a reusable method to ensure abstracts can be represented in the aggregate to outside stakeholders by their topic. Currently, NPS shares research topics only anecdotally; displaying topic areas cumulatively would give stakeholders a better sense of the defense-focused education and of the impact the theses have on advancing research in relevant areas.

## Step 1: Build scrape method 

In [71]:
import requests

def scrape(int_page, int_size, int_year, timeout=30):
    """Fetch one page of the NPS Calhoun "browse by date issued" item listing.

    Args:
        int_page: Value sent as the ``page`` query parameter.
        int_size: Value sent as the ``size`` query parameter (items per page).
        int_year: Value sent as the ``startsWith`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        For any other status the code is printed and an empty dict is
        returned, so callers that index into the result fall through to their
        "missing key" handling instead of this function crashing on an
        unassigned local.
    """
    # Define the URL and query parameters
    url = "https://calhoun.nps.edu/server/api/discover/browses/dateissued/items"
    params = {
        "scope": "",
        "sort": "default,ASC",
        "page": int_page,
        "size": int_size,
        "startsWith": int_year,
        "embed": "thumbnail"
    }

    # Browser-like request headers.
    # NOTE(review): the trailing "enq=0.09" in Accept-Language looks like it is
    # missing a ';' - left untouched here, confirm against a real browser request.
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en;q=1,en-US;q=0.1,enq=0.09",
        "Content-Type": "application/json; charset=utf-8",
        "Cookie": "DSpace-XSRF-COOKIE=; other-cookies-here",
        "Referer": "https://calhoun.nps.edu/browse/dateissued?scope=",
        "Sec-Ch-Ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "X-Xsrftoken": ""
    }

    # Make the API call
    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    # Anything other than 200 is reported and yields an empty payload
    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return {}

    # Parse the JSON data
    return response.json()


## Step 2: Build JSON parsing method

In [72]:
# Metadata fields extracted for every item; the order also becomes the
# column order of the CSV files written from the result.
list_data_headers = ['dc.description.abstract',
                     'dc.contributor.author',
                     'dc.contributor.department',
                     'dc.date.issued',
                     'dc.description.service',
                     'dc.identifier.curriculumcode',
                     'dc.title',
                     'dc.type',
                     'etd.thesisdegree.level',
                     'etd.thesisdegree.discipline',
                     'etd.thesisdegree.name',
                     ]


def get_data(json_data, list_num_titles):
    """Extract the first value of each wanted metadata field for selected items.

    Args:
        json_data: Decoded response whose items are read from
            ``json_data['_embedded']['items']``.
        list_num_titles: Iterable of item positions to extract.

    Returns:
        A dict mapping each requested position to a dict keyed by every entry
        of ``list_data_headers``. A field is set to the string ``"unknown"``
        whenever the item, the field, or its first value cannot be read
        (missing key, position out of range, or a null where a container
        was expected).
    """
    lookup_errors = (KeyError, IndexError, TypeError)

    # Resolve the item list once instead of re-walking the payload per field.
    try:
        items = json_data['_embedded']['items']
    except lookup_errors:
        items = []

    dict_results = {}
    for num in list_num_titles:
        try:
            metadata = items[num]['metadata']
        except lookup_errors:
            metadata = {}

        row = {}
        for header in list_data_headers:
            try:
                row[header] = metadata[header][0]['value']
            except lookup_errors:
                row[header] = "unknown"
        dict_results[num] = row
        # Alternative response layout kept from the original for reference
        # (presumably a search-style endpoint - unverified):
        #dict_results[num][header] = json_data['_embedded']['searchResult']['_embedded']['objects'][num]['_embedded']['indexableObject']['metadata'][header][0]['value']
    return dict_results

## Step 3: Call for all abstracts for 2019-2024, collect relevant metadata, and save to CSVs

In [73]:
import csv

# Scrape each year from 2019 through 2024 and dump the extracted metadata
# into one CSV per year.
# NOTE(review): only page 0 with size 1000 is requested per year; if the server
# caps the page size or a year holds more records, rows are missed - confirm.
for year in range(2019, 2025):
    file_name = f'thesis_data_{year}.csv'

    data = scrape(0, 1000, year)
    results = get_data(data, list(range(1000)))
    headers = results[0].keys()

    # 'utf-8-sig' puts a BOM at the start of the file; newline='' leaves
    # line-ending handling to the csv module.
    with open(file_name, 'w', newline='', encoding='utf-8-sig') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=headers)
        csvwriter.writeheader()
        # One CSV row per extracted item
        csvwriter.writerows(results.values())

    print(f"Data has been written to {file_name}")

Data has been written to thesis_data_2019.csv
Data has been written to thesis_data_2020.csv
Data has been written to thesis_data_2021.csv
Data has been written to thesis_data_2022.csv
Data has been written to thesis_data_2023.csv
Data has been written to thesis_data_2024.csv


## Step 4: After observing CSVs, do some initial cleaning and consolidate into 1 CSV

In [85]:
import os
import pandas as pd

# Get the current directory
directory = os.getcwd()

# Output file
output_file = 'thesis_data_2019-2024.csv'


def is_yearly_thesis_csv(filename):
    """Return True only for per-year scrape files named ``thesis_data_<YYYY>.csv``.

    A plain prefix/suffix check would also match this cell's own output
    ('thesis_data_2019-2024.csv') and the later '..._fixed.csv' file, so
    re-running the cell would feed earlier results back in and duplicate rows.
    """
    prefix, suffix = 'thesis_data_', '.csv'
    if not (filename.startswith(prefix) and filename.endswith(suffix)):
        return False
    year_part = filename[len(prefix):-len(suffix)]
    return len(year_part) == 4 and all(ch in '0123456789' for ch in year_part)


# Collect the filtered per-year frames; sorted() makes the row order of the
# consolidated file deterministic (os.listdir order is arbitrary).
frames = []
for filename in sorted(os.listdir(directory)):
    if not is_yearly_thesis_csv(filename):
        continue

    # The per-year files are written with a BOM, so read them as utf-8-sig
    df = pd.read_csv(os.path.join(directory, filename), encoding='utf-8-sig')

    # Filter out rows where the 'abstract' column is 'unknown'
    df_filtered = df[df['dc.description.abstract'] != 'unknown']
    frames.append(df_filtered)

# Concatenate once at the end; with no input files an empty frame is written,
# matching the previous behaviour for an empty directory.
consolidated_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Write the consolidated DataFrame to a CSV file
consolidated_df.to_csv(output_file, index=False)


## Step 5: Further cleaning of newly consolidated data

In [92]:
input_file = 'thesis_data_2019-2024.csv'
output_file = 'thesis_data_2019-2024_fixed.csv'

# Per-character mapping applied to every field:
# a line feed becomes a single space, a carriage return is dropped.
line_break_map = str.maketrans({'\n': ' ', '\r': None})

# Load the whole consolidated CSV, flattening line breaks inside each field
with open(input_file, 'r', newline='', encoding='utf-8-sig') as infile:
    cleaned_data = [
        [field.translate(line_break_map) for field in row]
        for row in csv.reader(infile)
    ]

# Write the flattened rows out to the "_fixed" file
with open(output_file, 'w', newline='', encoding='utf-8-sig') as outfile:
    csv.writer(outfile).writerows(cleaned_data)


## Step 6: Build method and supporting prompt to have ChatGPT 3.5 categorize all abstracts

In [2]:
import pandas as pd
import openai

import os

# Take the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into (and saved with) the notebook. Falls back to the
# previous empty placeholder when the variable is not set.
client = openai.OpenAI(api_key=os.environ.get('OPENAI_API_KEY', ''))

# Load the data (the "_fixed" CSV is written with a BOM, hence utf-8-sig)
abstracts_df = pd.read_csv('thesis_data_2019-2024_fixed.csv', encoding='utf-8-sig')

# Candidate labels: critical technology areas.
# Extend this list with further areas as needed.
critical_technology_areas = [
    "Artificial Intelligence (AI)", "Autonomy", "Quantum Science",
    "Hypersonics", "Directed Energy", "Microelectronics",
    "Cybersecurity", "Biotechnology", "Space Technology",
    "Advanced Materials",
    "Integrated Network Systems-of-Systems",
    "Renewable Energy Generation and Storage",
    # Catch-all label for abstracts that fit none of the above
    "None/Not Clear",
]
# Candidate labels: innovation areas.
# Every entry needs its own trailing comma - adjacent string literals are
# silently concatenated by Python, which previously fused
# "Modeling, Simulation, & Visualization" and "C-C5ISRT" into one label.
innovation_areas = [
    "Naval Engineering",
    "Combat Systems",
    "Cyber and Information Systems",
    "Data Science and Decisions",
    "Global Security & Strategic Competition",
    "Defense Systems Management",
    "Space Technology and Operations",
    "Maritime Battlespace Environments",
    "Modeling, Simulation, & Visualization",
    "C-C5ISRT",
    "Long Range Fires",
    "Terminal Defense",
    "Contested Logistics",
    "Maritime Domain Awareness",
    "Artificial Intelligence",
    "Intelligent Autonomous Systems",
    "Naval Operational Architecture",
    "Modeling & Simulation GEMS/LVC",
    "Energy & Climate Security",
    # Labels currently disabled:
    #"Public Policy Analysis",
    #"Emergency Management",
    #"Recruiting and Retention",
    #"Civilian Agency Analysis",
    "Definitely None",
    "Not Clear",
]

# Candidate labels: the ten research categories passed to the classifier
# when the 'Classifications' column is filled in.
research_areas = [
    "Cybersecurity and Information Warfare",
    "Human Capital and Leadership",
    "Operational Strategies and Tactics",
    "Defense Technology and Innovation",
    "Supply Chain and Logistics",
    "National Security and Defense Policy",
    "Environmental and Climate Impact",
    "Military Health and Safety",
    "Intelligence and Surveillance",
    "Policy and Governance",
]

# Text file whose contents become the system message of every request.
# Other prompt files, commented out in favour of the one selected below:
#   'system_prompt.txt', 'prompt_ccr.txt', 'prompt_innovation.txt'
file_path = 'prompt_my_own.txt'

# Read the entire prompt file into memory
with open(file_path, mode='r') as prompt_file:
    system_prompt = prompt_file.read()

# Ask gpt-3.5-turbo to pick a single category for one abstract
def classify_abstract(abstract, areas):
    """Return the model's category choice for a single abstract.

    Args:
        abstract: Abstract text interpolated into the user message.
        areas: Sequence of category names (strings) offered to the model,
            joined with ", " inside the prompt.

    Returns:
        The content of the first completion choice, exactly as returned by
        the API (no post-processing).

    Relies on the module-level ``client`` and ``system_prompt``.
    """
    category_list = ', '.join(areas)

    # An earlier, commented-out wording of the first sentence asked for
    # "one or more of these critical technology areas" instead.
    user_message = (
        "Classify the following abstract into only one of these specific defense-related research categories: "
        + category_list
        + ". "
        + "Provide a classification that best matches the content of the abstract. Provide no other explanation, just the area you think fit the abstract best.\n\n"
        + f"Abstract: {abstract}\n\nClassification:"
    )

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        max_tokens=50,   # caps the length of the reply
        temperature=0.3,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Classify every abstract against research_areas, one API call per row;
# the extra positional argument is forwarded to classify_abstract via args=.
abstracts_df['Classifications'] = abstracts_df['dc.description.abstract'].apply(
    classify_abstract,
    args=(research_areas,),
)

# Persist the labelled table (written with a BOM via utf-8-sig)
abstracts_df.to_csv(
    'processed_abstracts_3Jun24.csv',
    index=False,
    encoding='utf-8-sig',
)
