In [2]:
import os
import shutil
import json
from itertools import islice

# Directories holding the raw files, one per year label from 2018 to 2023.
paths = [str(year_number) for year_number in range(2018, 2024)]

# Root directory that receives the copies carrying a .json extension.
destination_base = "JsonFile"

In [None]:
# Ensure the destination base directory exists.
# exist_ok=True makes re-running this cell harmless when it is already there.
os.makedirs(destination_base, exist_ok=True)

def process_chunk(files):
    """Validate each raw file as JSON and copy it under a ``.json`` name.

    The source file is copied, not moved, so the raw directory is left intact.

    Args:
        files: Iterable of ``(file_path, destination_path)`` pairs.
            ``destination_path`` is the directory the copy is written into.

    Returns:
        list[str]: One status line per input pair, in input order. Each line is
        ``Processed: <src> -> <dst>``, ``Skipped (invalid JSON): <src>`` or
        ``Skipped (not a regular file): <src>``.

    Raises:
        OSError: If a regular file cannot be read or the copy fails.
    """
    results = []
    for file_path, destination_path in files:
        # Directory listings can contain sub-directories; opening one would
        # raise and abort the whole batch, so report and move on instead.
        if not os.path.isfile(file_path):
            results.append(f"Skipped (not a regular file): {file_path}")
            continue

        try:
            # Binary mode lets the json module detect UTF-8/16/32 itself
            # rather than decoding with the platform's default text encoding.
            with open(file_path, 'rb') as f:
                json.load(f)  # Validate JSON format; the parsed value is discarded
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Undecodable bytes are as much "not valid JSON" as a syntax error.
            results.append(f"Skipped (invalid JSON): {file_path}")
            continue

        # Add the .json extension and copy the file into the destination.
        new_file_name = f"{os.path.basename(file_path)}.json"
        new_file_path = os.path.join(destination_path, new_file_name)
        shutil.copy(file_path, new_file_path)
        results.append(f"Processed: {file_path} -> {new_file_path}")
    return results

def chunked_iterable(iterable, chunk_size):
    """Split ``iterable`` into consecutive lists of at most ``chunk_size`` items.

    Chunks are produced lazily. The final chunk may be shorter than
    ``chunk_size``, and nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = list(islice(source, chunk_size))
        if not batch:
            # Source exhausted: an empty slice ends the generator.
            return
        yield batch

# Collect one (source file, destination directory) pair per entry of every
# year folder.
tasks = []
for path in paths:
    destination_path = os.path.join(destination_base, path)
    # Make sure the per-year output folder exists before anything is copied.
    os.makedirs(destination_path, exist_ok=True)

    tasks.extend(
        (os.path.join(path, file), destination_path)
        for file in os.listdir(path)
    )

# Work through the pairs in fixed-size batches and report every outcome.
chunk_size = 100  # Batch size; adjust based on memory and performance
for chunk in chunked_iterable(tasks, chunk_size):
    results = process_chunk(chunk)
    # Each batch is non-empty, so this prints exactly one line per file.
    print("\n".join(results))

Processed: 2021\202100000 -> JsonFile\2021\202100000.json
Processed: 2021\202100001 -> JsonFile\2021\202100001.json
Processed: 2021\202100002 -> JsonFile\2021\202100002.json
Processed: 2021\202100003 -> JsonFile\2021\202100003.json
Processed: 2021\202100004 -> JsonFile\2021\202100004.json
Processed: 2021\202100005 -> JsonFile\2021\202100005.json
Processed: 2021\202100006 -> JsonFile\2021\202100006.json
Processed: 2021\202100007 -> JsonFile\2021\202100007.json
Processed: 2021\202100008 -> JsonFile\2021\202100008.json
Processed: 2021\202100009 -> JsonFile\2021\202100009.json
Processed: 2021\202100010 -> JsonFile\2021\202100010.json
Processed: 2021\202100011 -> JsonFile\2021\202100011.json
Processed: 2021\202100012 -> JsonFile\2021\202100012.json
Processed: 2021\202100013 -> JsonFile\2021\202100013.json
Processed: 2021\202100014 -> JsonFile\2021\202100014.json
Processed: 2021\202100015 -> JsonFile\2021\202100015.json
Processed: 2021\202100016 -> JsonFile\2021\202100016.json
Processed: 202

In [5]:
import dask.bag as db
import dask.dataframe as dd
import pandas as pd
import json
from pandas import json_normalize

# Notebook display options: None lifts the pandas limit, so every column and
# every row of a displayed frame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Year folders available for processing.
year_paths = ["2018", "2019", "2020", "2021", "2022", "2023"]

# Year used by the following cells to build the input folder and CSV name;
# index 5 selects "2023".
year = year_paths[5]

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [53]:
def flatten_json(file_path):
    """Flatten selected sections of one JSON record into a single-row frame.

    The record is read from ``data['abstracts-retrieval-response']``. Each
    section listed in ``fields_to_flatten`` is flattened with
    ``json_normalize`` and its columns are prefixed with ``<field>_``.

    * dict section: flattened as is.
    * list section: only the first element is flattened; an empty list
      contributes nothing.
    * any other non-null value: stored in one column named ``<field>_<field>``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        pandas.DataFrame: Exactly one row. It has no columns when none of the
        sections is present.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or cannot be decoded
            (``json.JSONDecodeError`` / ``UnicodeDecodeError``).
    """
    # Binary mode lets the json module detect UTF-8/16/32 itself instead of
    # decoding with the platform's default text encoding.
    with open(file_path, 'rb') as f:
        data = json.load(f)

    # Sections of the response that are flattened into columns.
    fields_to_flatten = [
        'item', 'affiliation', 'coredata', 'idxterms',
        'language', 'authkeywords', 'subject-areas', 'authors'
    ]

    flattened_data = {}

    # `or {}` also covers a response that is present but null.
    response = data.get('abstracts-retrieval-response') or {}

    for field in fields_to_flatten:
        if field not in response or response[field] is None:
            continue

        value = response[field]
        try:
            if isinstance(value, list):
                if not value:
                    # Nothing to flatten. Skipping here matters: falling
                    # through would re-use the frame left over from the
                    # previous field and emit it again under this prefix.
                    continue
                flattened_field = json_normalize(value[0])
            elif isinstance(value, dict):
                flattened_field = json_normalize(value)
            else:
                # Scalar value: a single-column, single-row frame.
                flattened_field = pd.DataFrame({field: [value]})

            # Prefix every column with the section it came from.
            flattened_field.columns = [f"{field}_{col}" for col in flattened_field.columns]

            if not flattened_field.empty:
                flattened_data.update(flattened_field.iloc[0].to_dict())

        except Exception as e:
            # Best effort: one malformed section must not lose the whole record.
            print(f"Error flattening {field}: {e}")

    return pd.DataFrame([flattened_data])

In [29]:
def process_json_files(input_dir, output_file):
    """Flatten every ``.json`` file in ``input_dir`` into one CSV.

    Each file is turned into a single-row frame by ``flatten_json``. The rows
    are stacked in file-name order and written to ``output_file`` without the
    index column.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.json`` files.
        output_file: Path of the CSV to write.

    Raises:
        ValueError: If ``input_dir`` contains no ``.json`` file.
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    # Sorted so the CSV row order does not depend on the arbitrary order in
    # which the operating system lists the directory.
    files = sorted(
        os.path.join(input_dir, file)
        for file in os.listdir(input_dir)
        if file.endswith(".json")
    )
    if not files:
        # pd.concat would fail on an empty list with a message that does not
        # mention the directory; fail early and say what is missing.
        raise ValueError(f"No .json files found in {input_dir!r}")

    dataframes = [flatten_json(file) for file in files]

    combined_df = pd.concat(dataframes, ignore_index=True)

    combined_df.to_csv(output_file, index=False)

In [78]:
# Folder holding the .json copies for the selected year, and the CSV to
# produce from them (written to the current working directory).
input_dir = f"{destination_base}/{year}"
output_file = f"{year}.csv"

# Process the JSON files
process_json_files(input_dir, output_file)

In [None]:
# Read back the CSV named by output_file and report its (rows, columns).
df = pd.read_csv(output_file)
print(df.shape)

(3082, 305)


  df = pd.read_csv("2019.csv")


In [None]:
# Drop the rows that have no DOI, then report (rows, columns) of what is left.
filtered_df = df.dropna(subset=['coredata_prism:doi'])
print(filtered_df.shape)

(2799, 305)


In [81]:
# Case-insensitive split on the document subtype. Rows whose subtype is
# missing compare unequal, so they end up in the "not conference paper" frame.
is_conference_paper = filtered_df['coredata_subtypeDescription'].str.lower() == 'conference paper'
conference_paper_df = filtered_df[is_conference_paper]
not_conference_paper_df = filtered_df[~is_conference_paper]
print(conference_paper_df.shape)
print(not_conference_paper_df.shape)

(138, 292)
(2678, 292)


In [82]:
def _split_on_proceeding(frame):
    """Return ``(proceeding_rows, other_rows)`` of ``frame``.

    A row counts as a proceeding when its aggregation type equals
    'conference proceeding', ignoring case. Rows with a missing aggregation
    type fall into ``other_rows``.
    """
    in_proceeding = frame['coredata_prism:aggregationType'].str.lower() == 'conference proceeding'
    return frame[in_proceeding], frame[~in_proceeding]


conference_paper_proceeding_df, conference_paper_not_proceeding_df = _split_on_proceeding(conference_paper_df)
not_conference_paper_proceeding_df, not_conference_paper_not_proceeding_df = _split_on_proceeding(not_conference_paper_df)

# Report (rows, columns) of the four groups, in the order they were built.
for subset in (
    conference_paper_proceeding_df,
    conference_paper_not_proceeding_df,
    not_conference_paper_proceeding_df,
    not_conference_paper_not_proceeding_df,
):
    print(subset.shape)

(107, 292)
(31, 292)
(4, 292)
(2674, 292)


In [93]:
# For every year: keep the records that carry a DOI, then write the DOIs to
# <year>_ExistDoi.txt (one per line) and the kept rows to <year>_filtered.csv.
# Note that the loop rebinds year, df and filtered_df at notebook scope.
for year in year_paths:
    df = pd.read_csv(f"{year}.csv")
    filtered_df = df[df['coredata_prism:doi'].notna()]

    doi_list = filtered_df['coredata_prism:doi'].tolist()
    with open(f"{year}_ExistDoi.txt", 'w') as file:
        file.writelines(f"{doi}\n" for doi in doi_list)

    filtered_df.to_csv(f"{year}_filtered.csv", index=False)

  df = pd.read_csv(f"{year}.csv")
  df = pd.read_csv(f"{year}.csv")
  df = pd.read_csv(f"{year}.csv")
  df = pd.read_csv(f"{year}.csv")
  df = pd.read_csv(f"{year}.csv")
  df = pd.read_csv(f"{year}.csv")


# Extract unique AUIDs (author IDs)

In [1]:
import pandas as pd
import ast
import json

In [2]:
# Load the six per-year filtered CSVs (2018 through 2023), in year order,
# into df, df1, ..., df5.
df, df1, df2, df3, df4, df5 = [
    pd.read_csv(f"{year_label}_filtered.csv") for year_label in range(2018, 2024)
]

# Stack them under a fresh 0..n-1 index; df now names the combined frame.
df = pd.concat([df, df1, df2, df3, df4, df5], ignore_index=True)

  df = pd.read_csv("2018_filtered.csv")
  df1 = pd.read_csv("2019_filtered.csv")
  df2 = pd.read_csv("2020_filtered.csv")
  df3 = pd.read_csv("2021_filtered.csv")
  df4 = pd.read_csv("2022_filtered.csv")
  df5 = pd.read_csv("2023_filtered.csv")


In [8]:
# Show every column name of df as a plain Python list.
df.columns.to_list()

['item_ait:process-info.ait:status.@state',
 'item_ait:process-info.ait:status.@type',
 'item_ait:process-info.ait:status.@stage',
 'item_ait:process-info.ait:date-delivered.@day',
 'item_ait:process-info.ait:date-delivered.@timestamp',
 'item_ait:process-info.ait:date-delivered.@year',
 'item_ait:process-info.ait:date-delivered.@month',
 'item_ait:process-info.ait:date-sort.@day',
 'item_ait:process-info.ait:date-sort.@year',
 'item_ait:process-info.ait:date-sort.@month',
 'item_bibrecord.head.author-group',
 'item_bibrecord.head.citation-title',
 'item_bibrecord.head.abstracts',
 'item_bibrecord.head.correspondence.affiliation.country',
 'item_bibrecord.head.correspondence.affiliation.@country',
 'item_bibrecord.head.correspondence.affiliation.city',
 'item_bibrecord.head.correspondence.affiliation.organization',
 'item_bibrecord.head.correspondence.person.ce:given-name',
 'item_bibrecord.head.correspondence.person.ce:initials',
 'item_bibrecord.head.correspondence.person.ce:degrees'

In [15]:
# Not the last expression of the cell, so this preview is evaluated but its
# result is never displayed.
df['authors_author'].head()

# Take the first row's author cell and parse it with ast.literal_eval;
# presumably the CSV stores each author list as a Python-literal string.
authors_str = df['authors_author'][0]
authors_list = ast.literal_eval(authors_str)

# Print the @auid of every author on that first record.
for author in authors_list:
    print(author['@auid'])

14720203700
36729660500


In [16]:
# Gather every distinct @auid across all rows; each cell is parsed from its
# string form on the fly and the column itself is left untouched.
unique_auids = set()

for authors_str in df['authors_author']:
    for author in ast.literal_eval(authors_str):
        unique_auids.add(author['@auid'])

print(unique_auids)

{'7004647768', '57426800000', '55203914300', '35995743800', '57110088700', '57213864104', '56448183900', '55307591400', '57190949354', '35226917700', '57202752714', '56233925800', '25622931500', '35278294700', '57202775965', '17135681200', '35227389000', '58455172200', '53663248100', '35724515700', '57196369696', '15519120100', '57196087907', '55699406100', '35512093800', '36491917200', '57205247792', '6602097382', '56424384400', '55070886200', '57190983899', '57203589795', '57200209205', '57219465287', '57203666253', '6506917428', '57201474773', '8509602000', '22996761700', '56141261800', '57188998974', '57204540788', '57204277152', '36617456800', '57189264990', '54897929800', '35222026500', '57195533660', '7003602816', '7402838898', '36766757100', '36247374800', '6701780843', '55325142600', '16686836100', '57203194972', '57200559801', '56272419200', '55286336800', '55819700200', '6602158406', '57204324910', '58411652900', '57203667337', '57201551747', '55934588500', '55794330300', '3

In [21]:
def _parse_authors(cell):
    """Parse a string-encoded author list; return anything else unchanged."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# The column is overwritten in place, so the cell must tolerate being run
# again: ast.literal_eval raises on values that are already parsed lists,
# which is why only cells that are still strings are parsed.
df['authors_author'] = df['authors_author'].apply(_parse_authors)

# Extract all `@auid` values from the column. Cells that did not parse to a
# list (for example a missing value) are skipped rather than iterated.
all_auids = {
    author['@auid']
    for authors_list in df['authors_author']
    if isinstance(authors_list, list)
    for author in authors_list
}

# Display the unique auid values
print(all_auids)

{'58395294900', '57554776500', '57219159683', '57211459276', '58177098400', '57219756466', '57214717287', '57222326099', '35069899200', '55307591400', '57221115737', '44061910800', '7403579470', '55909975200', '8355886600', '57202775965', '57221519528', '57221940786', '56818975700', '57209788526', '57723378300', '53663248100', '24073584900', '35724515700', '57195991104', '57196369696', '57196087907', '55699406100', '55043180700', '35512093800', '8263937200', '58366575100', '57985990700', '12779951100', '57218794371', '58031293700', '57193256081', '6701584602', '57203589795', '57200209205', '57219465287', '8509602000', '56244757300', '7004039968', '57214535465', '56026053700', '57203640779', '57292914300', '56978608700', '57201446591', '57189264990', '54897929800', '56437690000', '35222026500', '6602624567', '57311509500', '58109474000', '6507684871', '55212537400', '6701780843', '57214235808', '24435957400', '6603443460', '55883596100', '58507255100', '58365753100', '57207981344', '581

In [46]:
# Iterating a set of strings yields a different order from one interpreter
# run to the next (string hashing is randomised), which would make the file
# differ on every run. Sorting makes the output reproducible; key=str keeps
# the sort well-defined even if an id is not a string.
auid_list = [{"@auid": auid} for auid in sorted(all_auids, key=str)]
print(len(auid_list))

# Write the list to a JSON file
with open("auid_list_exist_data.json", "w") as file:
    json.dump(auid_list, file, indent=4)

print("File written successfully: auid_list_exist_data.json")

72401
File written successfully: auid_list_exist_data.json


In [41]:
# Root folder of the scraped JSON files; files are opened as
# <DIRECTORY>/<year>/<year>_<name>.json.
DIRECTORY = "JsonFile_Scrape"
YEAR = ["2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]

# File-name stems used for 2013-2017 (all four categories).
_FILENAMES_2013_2017 = [
    "conference_paper_not_proceeding_fetchContentAbstract",
    "conference_paper_proceeding_fetchContentAbstract",
    "not_conference_paper_not_proceeding_fetchContentAbstract",
    "not_conference_paper_proceeding_fetchContentAbstract",
]

# File-name stems used for 2018-2023: the same list without the
# "not_conference_paper_not_proceeding" category.
_FILENAMES_2018_2023 = [
    "conference_paper_not_proceeding_fetchContentAbstract",
    "conference_paper_proceeding_fetchContentAbstract",
    "not_conference_paper_proceeding_fetchContentAbstract",
]

# year -> list of file-name stems. Built from the two templates instead of a
# hand-written literal, which repeated the lists and listed "2018" twice.
# list(...) gives every year its own list object.
FILENAME = {
    year_label: list(_FILENAMES_2013_2017 if int(year_label) <= 2017 else _FILENAMES_2018_2023)
    for year_label in YEAR
}

In [44]:
# Collect every distinct @auid found in the scraped files.
authors_set = set()

for year in YEAR:
    for filename in FILENAME[year]:
        print(f"Processing {year}_{filename}.json")
        with open(f"{DIRECTORY}/{year}/{year}_{filename}.json") as file:
            data = json.load(file)

        for paper in data:
            # A paper without an 'author' key contributes nothing.
            author = paper.get('author', [])
            for entry in author:
                auid = entry.get('@auid')
                if auid is not None:
                    authors_set.add(auid)

print(len(authors_set))
print(authors_set)

Processing 2013_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2013_conference_paper_proceeding_fetchContentAbstract.json
Processing 2013_not_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2013_not_conference_paper_proceeding_fetchContentAbstract.json
Processing 2014_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2014_conference_paper_proceeding_fetchContentAbstract.json
Processing 2014_not_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2014_not_conference_paper_proceeding_fetchContentAbstract.json
Processing 2015_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2015_conference_paper_proceeding_fetchContentAbstract.json
Processing 2015_not_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2015_not_conference_paper_proceeding_fetchContentAbstract.json
Processing 2016_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2016_conference_paper_proce

In [50]:
# Every author id seen in either source, counted once.
result_set = all_auids.union(authors_set)
print(len(result_set))

109883


In [51]:
# Iterating a set of strings yields a different order from one interpreter
# run to the next (string hashing is randomised), which would make the file
# differ on every run. Sorting makes the output reproducible; key=str keeps
# the sort well-defined even if the two sources disagree on the id type.
all_auid_list = [{"@auid": auid} for auid in sorted(result_set, key=str)]
print(len(all_auid_list))

# Write the list to a JSON file
with open("all_auid_list.json", "w") as file:
    json.dump(all_auid_list, file, indent=4)

print("File written successfully: all_auid_list.json")

109883
File written successfully: all_auid_list.json


# Extract unique AFF-IDs (affiliation IDs)

In [29]:
import pandas as pd
import ast
import json

In [62]:
# Load the six per-year filtered CSVs (2018 through 2023), in year order,
# into df, df1, ..., df5.
df, df1, df2, df3, df4, df5 = [
    pd.read_csv(f"{year_label}_filtered.csv") for year_label in range(2018, 2024)
]

# Stack them under a fresh 0..n-1 index; df now names the combined frame.
df = pd.concat([df, df1, df2, df3, df4, df5], ignore_index=True)

  df = pd.read_csv("2018_filtered.csv")
  df1 = pd.read_csv("2019_filtered.csv")
  df2 = pd.read_csv("2020_filtered.csv")
  df3 = pd.read_csv("2021_filtered.csv")
  df4 = pd.read_csv("2022_filtered.csv")
  df5 = pd.read_csv("2023_filtered.csv")


In [63]:
# The column is overwritten in place, so the cell must tolerate being run
# again: ast.literal_eval raises on values that are already parsed lists,
# which is why only cells that are still strings are parsed.
df['authors_author'] = df['authors_author'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [64]:
# Collect every distinct affiliation '@id' attached to any author.
affiliation_set = set()

for authors_list in df['authors_author']:
    for author in authors_list:
        affiliation = author.get('affiliation', [])

        # Only list-valued affiliations are read; any other shape is ignored.
        # NOTE(review): a single affiliation stored as a dict would be
        # skipped here - confirm whether the data ever does that.
        if type(affiliation) is not list:
            continue

        for entry in affiliation:
            affiliation_id = entry.get('@id')
            if affiliation_id is not None:
                affiliation_set.add(affiliation_id)

print(len(affiliation_set))

4799


In [65]:
# Root folder of the scraped JSON files; files are opened as
# <DIRECTORY>/<year>/<year>_<name>.json.
DIRECTORY = "JsonFile_Scrape"
YEAR = ["2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]

# File-name stems used for 2013-2017 (all four categories).
_FILENAMES_2013_2017 = [
    "conference_paper_not_proceeding_fetchContentAbstract",
    "conference_paper_proceeding_fetchContentAbstract",
    "not_conference_paper_not_proceeding_fetchContentAbstract",
    "not_conference_paper_proceeding_fetchContentAbstract",
]

# File-name stems used for 2018-2023: the same list without the
# "not_conference_paper_not_proceeding" category.
_FILENAMES_2018_2023 = [
    "conference_paper_not_proceeding_fetchContentAbstract",
    "conference_paper_proceeding_fetchContentAbstract",
    "not_conference_paper_proceeding_fetchContentAbstract",
]

# year -> list of file-name stems. Built from the two templates instead of a
# hand-written literal, which repeated the lists and listed "2018" twice.
# list(...) gives every year its own list object.
FILENAME = {
    year_label: list(_FILENAMES_2013_2017 if int(year_label) <= 2017 else _FILENAMES_2018_2023)
    for year_label in YEAR
}

In [66]:
# Collect every distinct affiliation '@id' found in the scraped files.
affiliation_set_new_data = set()

for year in YEAR:
    for filename in FILENAME[year]:
        print(f"Processing {year}_{filename}.json")
        with open(f"{DIRECTORY}/{year}/{year}_{filename}.json") as file:
            data = json.load(file)

        for paper in data:
            # A paper without an 'author' key contributes nothing.
            author = paper.get('author', [])
            for person in author:
                affiliation = person.get('affiliation', [])
                # Only list-valued affiliations are read; any other shape is
                # ignored.
                # NOTE(review): a single affiliation stored as a dict would
                # be skipped here - confirm whether the data ever does that.
                if type(affiliation) is not list:
                    continue
                for entry in affiliation:
                    affiliation_id = entry.get('@id')
                    if affiliation_id is not None:
                        affiliation_set_new_data.add(affiliation_id)

print(len(affiliation_set_new_data))

Processing 2013_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2013_conference_paper_proceeding_fetchContentAbstract.json
Processing 2013_not_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2013_not_conference_paper_proceeding_fetchContentAbstract.json
Processing 2014_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2014_conference_paper_proceeding_fetchContentAbstract.json
Processing 2014_not_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2014_not_conference_paper_proceeding_fetchContentAbstract.json
Processing 2015_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2015_conference_paper_proceeding_fetchContentAbstract.json
Processing 2015_not_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2015_not_conference_paper_proceeding_fetchContentAbstract.json
Processing 2016_conference_paper_not_proceeding_fetchContentAbstract.json
Processing 2016_conference_paper_proce

In [67]:
# Every affiliation id seen in either source, counted once.
result_aff_set = affiliation_set.union(affiliation_set_new_data)
print(len(result_aff_set))

7625


In [68]:
# Iterating a set of strings yields a different order from one interpreter
# run to the next (string hashing is randomised), which would make the file
# differ on every run. Sorting makes the output reproducible; key=str keeps
# the sort well-defined even if the two sources disagree on the id type.
all_affid_list = [{"@affid": affid} for affid in sorted(result_aff_set, key=str)]

# Write the list to a JSON file
with open("all_affid_list.json", "w") as file:
    json.dump(all_affid_list, file, indent=4)

print("File written successfully: all_affid_list.json")

File written successfully: all_affid_list.json
