In [82]:
import json

def extract_json_objects(file_path):
    json_objects = []
    buffer = ''
    
    with open(file_path, 'r') as file:
        while True:
            chunk = file.read(8192)  # Read in 8KB chunks
            if not chunk:
                break
            buffer += chunk
            
            while True:
                try:
                    # Try to load a JSON object from the buffer
                    json_obj, end_index = json.JSONDecoder().raw_decode(buffer)
                    json_objects.append(json_obj)
                    buffer = buffer[end_index:].strip()  # Remove processed JSON and whitespace
                except json.JSONDecodeError:
                    break

    return json_objects

# Extract JSON objects from the original file
extracted_data = extract_json_objects('altmetric_data.json')

# Check the number of extracted objects and display a few samples
print(f"Total records extracted: {len(extracted_data)}")
for i, record in enumerate(extracted_data[:3]):  # Show a few sample records
    print(f"\nRecord {i+1}:\n{json.dumps(record, indent=4)}\n")


Total records extracted: 7963

Record 1:
"{\"title\":\"Isolation and Characterization of the Saturate and Aromatic Fractions of a Maya Crude Oil\",\"doi\":\"10.1021/ef990207h\",\"isbns\":[],\"altmetric_jid\":\"4f6fa56d3cf058f610004b77\",\"issns\":[\"0887-0624\",\"1520-5029\"],\"journal\":\"Energy & Fuels\",\"cohorts\":[],\"context\":{\"all\":{\"count\":23053169,\"mean\":10.252445880318438,\"rank\":4764956,\"pct\":76,\"higher_than\":17589224},\"journal\":{\"count\":3344,\"mean\":3.7311549043062175,\"rank\":358,\"pct\":80,\"higher_than\":2706},\"similar_age_3m\":{\"count\":38908,\"mean\":5.955448699496252,\"rank\":5941,\"pct\":72,\"higher_than\":28141},\"similar_age_journal_3m\":{\"count\":3,\"mean\":4,\"rank\":1,\"pct\":66,\"higher_than\":2}},\"authors\":[\"Walter E. Rudzinski\",\"Tejraj M. Aminabhavi\",\"Steve Sassman\",\"Linette M. Watkins\"],\"type\":\"article\",\"handles\":[],\"pubdate\":962409600,\"epubdate\":961200000,\"dimensions_publication_id\":\"pub.1055483631\",\"altmetric_id

In [96]:
import json
import pandas as pd

# Comprehensive list of attributes to extract, including missing ones
attributes = [
    'title', 'doi', 'altmetric_jid', 'issns', 'journal', 'authors', 'pubdate',
    'epubdate', 'altmetric_id', 'schema', 'is_oa', 'publisher_subjects',
    'cited_by_patents_count', 'cited_by_posts_count', 'cited_by_accounts_count',
    'score', 'url', 'published_on', 'readers_count', 'dimensions_publication_id',
    'added_on', 'last_updated', 'type', 'details_url', 'handles', 'scopus_subjects',
    'abstract', 'pmid', 'cited_by_policies_count', 'cited_by_msm_count', 'isbns', 'cohorts'
]

# Additional nested attributes to extract and flatten, including images and subjects
context_keys = [
    'all', 'journal', 'similar_age_3m', 'similar_age_journal_3m'
]

history_keys = [
    '1y', '6m', '3m', '1m', '1w', '6d', '5d', '4d', '3d', '2d', '1d', 'at'
]

readers_keys = [
    'citeulike', 'mendeley', 'connotea'
]

image_sizes = ['small', 'medium', 'large']

# List to store the extracted data
structured_data = []

# Loop through the records
for record in extracted_data:
    raw_record = json.loads(record)
    structured_record = {}

    # Extract relevant attributes
    for attr in attributes:
        if attr in raw_record:
            if isinstance(raw_record[attr], list):
                if attr in ['issns', 'authors', 'handles', 'cohorts', 'isbns']:
                    structured_record[attr] = ', '.join(raw_record[attr])
                elif attr == 'publisher_subjects':
                    subjects = [subject['name'] for subject in raw_record[attr] if 'name' in subject]
                    structured_record[attr] = ', '.join(subjects)
                elif attr == 'scopus_subjects' or attr == 'subjects':
                    structured_record[attr] = ', '.join(raw_record[attr])
            else:
                structured_record[attr] = raw_record[attr]
        else:
            structured_record[attr] = None

    # Extract nested context data and only include values
    if 'context' in raw_record:
        for key in context_keys:
            if key in raw_record['context']:
                context_data = raw_record['context'][key]
                structured_record[f'context_{key}_count'] = context_data.get("count", None)
                structured_record[f'context_{key}_mean'] = context_data.get("mean", None)
                structured_record[f'context_{key}_rank'] = context_data.get("rank", None)
                structured_record[f'context_{key}_pct'] = context_data.get("pct", None)
                structured_record[f'context_{key}_higher_than'] = context_data.get("higher_than", None)
            else:
                structured_record[f'context_{key}_count'] = None
                structured_record[f'context_{key}_mean'] = None
                structured_record[f'context_{key}_rank'] = None
                structured_record[f'context_{key}_pct'] = None
                structured_record[f'context_{key}_higher_than'] = None

    # Extract nested history data and only include values
    if 'history' in raw_record:
        for key in history_keys:
            structured_record[f'history_{key}'] = raw_record["history"].get(key, None)
    
    # Extract nested readers data and only include values
    if 'readers' in raw_record:
        for key in readers_keys:
            structured_record[f'readers_{key}'] = raw_record["readers"].get(key, None)
    
    # Extract image data and only include values
    if 'images' in raw_record:
        for size in image_sizes:
            structured_record[f'image_{size}'] = raw_record["images"].get(size, None)
    
    # Ensure abstract is captured
    structured_record['abstract'] = raw_record.get('abstract', None)
    
    # Extract subjects data
    structured_record['subjects'] = ', '.join(raw_record.get('subjects', []))

    structured_data.append(structured_record)

# Create DataFrame
altmetric_df = pd.DataFrame(structured_data)

# Display the first few rows of the DataFrame to ensure correctness
print(altmetric_df.head())

# Optionally, save the DataFrame to a CSV file
altmetric_df.to_csv('altmetric_structured_data.csv', index=False)


                                               title  \
0  Isolation and Characterization of the Saturate...   
1  A Review on Extraction and Identification of C...   
2  Antimicrobial Residue Detection in Chicken Yol...   
3  Effect of acute postexercise ethanol intoxicat...   
4  Electrical and magnetic properties of La0.5Sr0...   

                           doi             altmetric_jid  \
0            10.1021/ef990207h  4f6fa56d3cf058f610004b77   
1            10.1021/ef990177i  4f6fa56d3cf058f610004b77   
2            10.1021/jf000140s  4f6fa6053cf058f610006625   
3  10.1152/jappl.2000.88.1.165  4f6fa61f3cf058f610007b71   
4             10.1063/1.126208  4f6fa4e83cf058f6100022e1   

                  issns                                   journal  \
0  0887-0624, 1520-5029                            Energy & Fuels   
1  0887-0624, 1520-5029                            Energy & Fuels   
2  0021-8561, 1520-5118  Journal of Agricultural & Food Chemistry   
3  8750-7587, 1522-1601   

In [98]:
import pandas as pd

# Load the CSV file
csv_file_path = 'altmetric_structured_data.csv'  # Replace with your actual file path
df = pd.read_csv(csv_file_path)

# Print all column names
print("Column names in the CSV file:")
for column in df.columns:
    print(column)


Column names in the CSV file:
title
doi
altmetric_jid
issns
journal
authors
pubdate
epubdate
altmetric_id
schema
is_oa
publisher_subjects
cited_by_patents_count
cited_by_posts_count
cited_by_accounts_count
score
url
published_on
readers_count
dimensions_publication_id
added_on
last_updated
type
details_url
handles
scopus_subjects
abstract
pmid
cited_by_policies_count
cited_by_msm_count
isbns
cohorts
context_all_count
context_all_mean
context_all_rank
context_all_pct
context_all_higher_than
context_journal_count
context_journal_mean
context_journal_rank
context_journal_pct
context_journal_higher_than
context_similar_age_3m_count
context_similar_age_3m_mean
context_similar_age_3m_rank
context_similar_age_3m_pct
context_similar_age_3m_higher_than
context_similar_age_journal_3m_count
context_similar_age_journal_3m_mean
context_similar_age_journal_3m_rank
context_similar_age_journal_3m_pct
context_similar_age_journal_3m_higher_than
history_1y
history_6m
history_3m
history_1m
history_1w
hist

In [100]:
import json

# Load the JSON objects line by line (assuming the JSON records are already extracted into `extracted_data`)
empty_isbn_count = 0
non_empty_isbn_count = 0
total_records = len(extracted_data)

for record in extracted_data:
    raw_record = json.loads(record)
    if 'isbns' in raw_record:
        if not raw_record['isbns']:  # Check if the ISBN list is empty
            empty_isbn_count += 1
        else:
            non_empty_isbn_count += 1
    else:
        empty_isbn_count += 1  # Consider records without the 'isbns' key as having an empty ISBN

# Output the results
print(f"Total records: {total_records}")
print(f"Records with empty ISBNs: {empty_isbn_count}")
print(f"Records with non-empty ISBNs: {non_empty_isbn_count}")

# Optionally, you can print some sample records with and without ISBNs to inspect them
print("\nSample records with non-empty ISBNs:")
for record in extracted_data[:3]:
    raw_record = json.loads(record)
    if raw_record.get('isbns'):
        print(json.dumps(raw_record['isbns'], indent=4))

print("\nSample records with empty ISBNs:")
for record in extracted_data[:3]:
    raw_record = json.loads(record)
    if not raw_record.get('isbns'):
        print(json.dumps(raw_record, indent=4))


Total records: 7963
Records with empty ISBNs: 7943
Records with non-empty ISBNs: 20

Sample records with non-empty ISBNs:

Sample records with empty ISBNs:
{
    "title": "Isolation and Characterization of the Saturate and Aromatic Fractions of a Maya Crude Oil",
    "doi": "10.1021/ef990207h",
    "isbns": [],
    "altmetric_jid": "4f6fa56d3cf058f610004b77",
    "issns": [
        "0887-0624",
        "1520-5029"
    ],
    "journal": "Energy & Fuels",
    "cohorts": [],
    "context": {
        "all": {
            "count": 23053169,
            "mean": 10.252445880318438,
            "rank": 4764956,
            "pct": 76,
            "higher_than": 17589224
        },
        "journal": {
            "count": 3344,
            "mean": 3.7311549043062175,
            "rank": 358,
            "pct": 80,
            "higher_than": 2706
        },
        "similar_age_3m": {
            "count": 38908,
            "mean": 5.955448699496252,
            "rank": 5941,
            "pct": 

In [33]:
import json

# Load the PlumX data
plumx_file_path = 'plumx_metrics_data.json'

with open(plumx_file_path, 'r') as file:
    plumx_data = json.load(file)

# Print out the type of data structure
print(f"Data type: {type(plumx_data)}")

# Print the total number of records
print(f"Total records: {len(plumx_data)}")

# Print a few sample records to inspect the structure
for i, record in enumerate(plumx_data[:3]):  # Adjust the range as needed to view more records
    print(f"\nRecord {i+1}:\n{json.dumps(record, indent=4)}\n")


Data type: <class 'list'>
Total records: 21769

Record 1:
{
    "doi": "10.1116/1.591356",
    "capture": {
        "READER_COUNT": 7
    },
    "citation": {
        "Scopus": 6,
        "CrossRef": 6
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}


Record 2:
{
    "doi": "10.2752/089279300786999950",
    "capture": {
        "READER_COUNT": 118
    },
    "citation": {
        "Scopus": 82,
        "CrossRef": 70,
        "Academic Citation Index (ACI) - airiti": 2
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}


Record 3:
{
    "doi": "10.1021/ic000389r",
    "capture": {
        "READER_COUNT": 11
    },
    "citation": {
        "Scopus": 40,
        "CrossRef": 31,
        "PubMed": 1
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}



In [35]:
import json

# Load the original PlumX data
with open('plumx_metrics_data.json', 'r') as file:
    plumx_data = json.load(file)

# Define the DOIs to check (based on the output you provided)
dois_to_check = [
    '10.1116/1.591356',
    '10.2752/089279300786999950',
    '10.1021/ic000389r',
    '10.1021/ef990207h',
    '10.1016/s0362-3319(00)00063-x'
]

# Check the original data for these DOIs
for doi in dois_to_check:
    for record in plumx_data:
        if record['doi'] == doi:
            print(f"Record for DOI: {doi}")
            print(json.dumps(record, indent=4))
            print("-" * 80)
            break


Record for DOI: 10.1116/1.591356
{
    "doi": "10.1116/1.591356",
    "capture": {
        "READER_COUNT": 7
    },
    "citation": {
        "Scopus": 6,
        "CrossRef": 6
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}
--------------------------------------------------------------------------------
Record for DOI: 10.2752/089279300786999950
{
    "doi": "10.2752/089279300786999950",
    "capture": {
        "READER_COUNT": 118
    },
    "citation": {
        "Scopus": 82,
        "CrossRef": 70,
        "Academic Citation Index (ACI) - airiti": 2
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}
--------------------------------------------------------------------------------
Record for DOI: 10.1021/ic000389r
{
    "doi": "10.1021/ic000389r",
    "capture": {
        "READER_COUNT": 11
    },
    "citation": {
        "Scopus": 40,
        "CrossRef": 31,
        "PubMed": 1
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}


In [102]:
import pandas as pd

# Extract attributes from PlumX data and handle possible missing attributes
def process_plumx_data(plumx_data):
    processed_data = []

    for record in plumx_data:
        processed_record = {}
        
        # Basic attribute
        processed_record['doi'] = record.get('doi', None)
        
        # Capture metrics
        capture = record.get('capture', {})
        processed_record['reader_count'] = capture.get('READER_COUNT', None)
        
        # Citation metrics
        citation = record.get('citation', {})
        processed_record['scopus_citations'] = citation.get('Scopus', None)
        processed_record['crossref_citations'] = citation.get('CrossRef', None)
        processed_record['pubmed_citations'] = citation.get('PubMed', None)
        processed_record['aci_citations'] = citation.get('Academic Citation Index (ACI) - airiti', None)
        processed_record['policy_citations'] = citation.get('Policy Citation', None)
        
        # Mention metrics (currently empty in your sample)
        processed_record['mention_count'] = None
        
        # Social media metrics (currently empty in your sample)
        processed_record['social_media_count'] = None
        
        # Usage metrics (currently empty in your sample)
        processed_record['usage_count'] = None
        
        processed_data.append(processed_record)
    
    return pd.DataFrame(processed_data)

# Example usage
with open('plumx_metrics_data.json', 'r') as file:
    plumx_data = json.load(file)

# Process the PlumX data
plumx_df = process_plumx_data(plumx_data)

# Display the first few rows of the DataFrame
print(plumx_df.head())

# Optionally, save the DataFrame to a CSV file
plumx_df.to_csv('processed_plumx_data.csv', index=False)


                             doi  reader_count  scopus_citations  \
0               10.1116/1.591356           7.0               6.0   
1     10.2752/089279300786999950         118.0              82.0   
2              10.1021/ic000389r          11.0              40.0   
3              10.1021/ef990207h          39.0              40.0   
4  10.1016/s0362-3319(00)00063-x          38.0              49.0   

   crossref_citations  pubmed_citations  aci_citations  policy_citations  \
0                 6.0               NaN            NaN               NaN   
1                70.0               NaN            2.0               NaN   
2                31.0               1.0            NaN               NaN   
3                28.0               NaN            NaN               NaN   
4                33.0               NaN            NaN               1.0   

  mention_count social_media_count usage_count  
0          None               None        None  
1          None               None  