In [54]:
import json

# Function to load and explore potentially problematic JSON files
def explore_json(file_path, num_records=5):
    try:
        with open(file_path, 'r') as file:
            raw_data = file.read()
            # Attempt to parse the whole file if possible
            try:
                data = json.loads(raw_data)
            except json.JSONDecodeError:
                # If it fails, try line by line
                data = []
                for line in raw_data.splitlines():
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON on line: {e}")
                        continue

        # Check the type of the first element in data
        if isinstance(data, list):
            print(f"Total Records Loaded: {len(data)}")
            print(f"Data type of first element: {type(data[0])}")
        elif isinstance(data, dict):
            print(f"Keys: {list(data.keys())}")
            data = [data]

        # Print a sample of the data
        print("\nSample Data:")
        for i, record in enumerate(data[:num_records]):
            print(f"\nRecord {i+1}:")
            print(json.dumps(record, indent=4))
    
    except Exception as e:
        print(f"An error occurred: {e}")

# Paths to your JSON files
altmetric_file_path = 'altmetric_data.json'
plumx_file_path = 'plumx_metrics_data.json'

# Explore Altmetric Data
print("Exploring Altmetric Data:")
explore_json(altmetric_file_path)

# Explore PlumX Data
print("\n\nExploring PlumX Data:")
explore_json(plumx_file_path)


Exploring Altmetric Data:
Error decoding JSON on line: Extra data: line 1 column 1974 (char 1973)
Total Records Loaded: 0
An error occurred: list index out of range


Exploring PlumX Data:
Total Records Loaded: 21769
Data type of first element: <class 'dict'>

Sample Data:

Record 1:
{
    "doi": "10.1116/1.591356",
    "capture": {
        "READER_COUNT": 7
    },
    "citation": {
        "Scopus": 6,
        "CrossRef": 6
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}

Record 2:
{
    "doi": "10.2752/089279300786999950",
    "capture": {
        "READER_COUNT": 118
    },
    "citation": {
        "Scopus": 82,
        "CrossRef": 70,
        "Academic Citation Index (ACI) - airiti": 2
    },
    "mention": {},
    "social_media": {},
    "usage": {}
}

Record 3:
{
    "doi": "10.1021/ic000389r",
    "capture": {
        "READER_COUNT": 11
    },
    "citation": {
        "Scopus": 40,
        "CrossRef": 31,
        "PubMed": 1
    },
    "mention": {},
    "so

In [56]:
import json

def fix_large_json(file_path, output_path):
    with open(file_path, 'r') as file:
        buffer = ''
        valid_json_objects = []

        while True:
            chunk = file.read(8192)  # Read in 8KB chunks
            if not chunk:
                break
            buffer += chunk
            while True:
                try:
                    # Try to load a JSON object from the buffer
                    json_obj, end_index = json.JSONDecoder().raw_decode(buffer)
                    valid_json_objects.append(json_obj)
                    buffer = buffer[end_index:].lstrip()  # Remove processed JSON and whitespace
                except json.JSONDecodeError:
                    # Break if no valid JSON object is found yet
                    break

    # Write the valid JSON objects to a new file
    with open(output_path, 'w') as outfile:
        json.dump(valid_json_objects, outfile, indent=4)
    print(f"Formatted JSON saved to {output_path}")

# Paths to your files
altmetric_file_path = 'altmetric_data.json'
fixed_output_path = 'altmetric_data_fixed.json'

# Run the fixing process
fix_large_json(altmetric_file_path, fixed_output_path)


Formatted JSON saved to altmetric_data_fixed.json


In [58]:
import json

# Load the fixed Altmetric data
with open('altmetric_data_fixed.json', 'r') as file:
    altmetric_data = json.load(file)

# Print out the type of data structure
print(f"Data type: {type(altmetric_data)}")

# If it's a list, print the number of records and a sample
if isinstance(altmetric_data, list):
    print(f"Total records: {len(altmetric_data)}")
    print("\nSample Record:")
    print(json.dumps(altmetric_data[0], indent=4))

# If it's a dictionary, print the keys and a sample
elif isinstance(altmetric_data, dict):
    print(f"Keys: {list(altmetric_data.keys())}")
    print("\nSample Data:")
    print(json.dumps(altmetric_data, indent=4))

# If the structure is unexpected, print the first 500 characters to investigate
else:
    print(f"Unexpected data structure. First 500 characters:\n{str(altmetric_data)[:500]}")


Data type: <class 'list'>
Total records: 7963

Sample Record:
"{\"title\":\"Isolation and Characterization of the Saturate and Aromatic Fractions of a Maya Crude Oil\",\"doi\":\"10.1021/ef990207h\",\"isbns\":[],\"altmetric_jid\":\"4f6fa56d3cf058f610004b77\",\"issns\":[\"0887-0624\",\"1520-5029\"],\"journal\":\"Energy & Fuels\",\"cohorts\":[],\"context\":{\"all\":{\"count\":23053169,\"mean\":10.252445880318438,\"rank\":4764956,\"pct\":76,\"higher_than\":17589224},\"journal\":{\"count\":3344,\"mean\":3.7311549043062175,\"rank\":358,\"pct\":80,\"higher_than\":2706},\"similar_age_3m\":{\"count\":38908,\"mean\":5.955448699496252,\"rank\":5941,\"pct\":72,\"higher_than\":28141},\"similar_age_journal_3m\":{\"count\":3,\"mean\":4,\"rank\":1,\"pct\":66,\"higher_than\":2}},\"authors\":[\"Walter E. Rudzinski\",\"Tejraj M. Aminabhavi\",\"Steve Sassman\",\"Linette M. Watkins\"],\"type\":\"article\",\"handles\":[],\"pubdate\":962409600,\"epubdate\":961200000,\"dimensions_publication_id\":\"pub.105548

In [60]:
import json
import pandas as pd

# Load the fixed Altmetric data
with open('altmetric_data_fixed.json', 'r') as file:
    altmetric_data = json.load(file)

# Decode each JSON string into a dictionary
decoded_data = []
for record in altmetric_data:
    decoded_record = json.loads(record)  # Decode the JSON-encoded string
    decoded_data.append(decoded_record)

# Display a sample of the decoded data
print(f"Decoded Data Sample:\n{json.dumps(decoded_data[0], indent=4)}")

# Convert the decoded data into a DataFrame
altmetric_df = pd.DataFrame(decoded_data)

# Display the first few rows of the DataFrame
print(altmetric_df.head())

# Optionally, save the DataFrame to a CSV file for further use
altmetric_df.to_csv('altmetric_data.csv', index=False)


Decoded Data Sample:
{
    "title": "Isolation and Characterization of the Saturate and Aromatic Fractions of a Maya Crude Oil",
    "doi": "10.1021/ef990207h",
    "isbns": [],
    "altmetric_jid": "4f6fa56d3cf058f610004b77",
    "issns": [
        "0887-0624",
        "1520-5029"
    ],
    "journal": "Energy & Fuels",
    "cohorts": [],
    "context": {
        "all": {
            "count": 23053169,
            "mean": 10.252445880318438,
            "rank": 4764956,
            "pct": 76,
            "higher_than": 17589224
        },
        "journal": {
            "count": 3344,
            "mean": 3.7311549043062175,
            "rank": 358,
            "pct": 80,
            "higher_than": 2706
        },
        "similar_age_3m": {
            "count": 38908,
            "mean": 5.955448699496252,
            "rank": 5941,
            "pct": 72,
            "higher_than": 28141
        },
        "similar_age_journal_3m": {
            "count": 3,
            "mean": 4,
    

In [62]:
import json
import pandas as pd

# Load the PlumX JSON data
with open('plumx_metrics_data.json', 'r') as file:
    plumx_data = json.load(file)

# Convert PlumX data to DataFrame
plumx_df = pd.json_normalize(plumx_data)

# Save to CSV for future use
plumx_df.to_csv('plumx_metrics_data.csv', index=False)


In [64]:
import pandas as pd

# Load the CSVs
altmetric_df = pd.read_csv('altmetric_data.csv')
plumx_df = pd.read_csv('plumx_metrics_data.csv')

# Ensure the DOI fields are consistent
altmetric_df['doi'] = altmetric_df['doi'].str.lower().str.strip()
plumx_df['doi'] = plumx_df['doi'].str.lower().str.strip()

# Merge the two DataFrames on the DOI field
merged_df = pd.merge(altmetric_df, plumx_df, on='doi', how='outer', suffixes=('_altmetric', '_plumx'))

# Display the first few rows of the merged DataFrame
print(merged_df.head())

# Optionally, save the merged DataFrame to a CSV file
merged_df.to_csv('merged_altmetric_plumx_data.csv', index=False)


                                               title                      doi  \
0  The Link Between Dietary Sugar Intake and Card...  10.1001/jama.2014.18267   
1                                                NaN    10.1001/jama.2014.389   
2  Ensuring Competency and Professionalism Throug...   10.1001/jama.2015.4017   
3         Added Sugar Intake and Public Health—Reply   10.1001/jama.2015.6227   
4  Expectations for Physicians Prescribing Mariju...  10.1001/jama.2016.16109   

  isbns             altmetric_jid                       issns  \
0   NaN  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   
1   NaN                       NaN                         NaN   
2    []  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   
3    []  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   
4    []  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   

                                             journal  \
0  JAMA: Journal of the American Medical Association   
1         

In [78]:
import pandas as pd

# Attempt to read the file as a CSV
file_name = 'merged_altmetric_plumx_data.csv'
merged_df = pd.read_csv(file_name, delimiter=',', on_bad_lines='skip', quoting=0)

# Display the first few rows to inspect the structure
merged_df.head()


  merged_df = pd.read_csv(file_name, delimiter=',', on_bad_lines='skip', quoting=0)


Unnamed: 0,title,doi,isbns,altmetric_jid,issns,journal,cohorts,context,authors,type,...,mention.REFERENCE_COUNT,citation.SSRN,mention.ABSTRACT_VIEWS,mention.DOWNLOAD_COUNT,mention.ALL_BLOG_COUNT,citation.PubMed Guidelines,mention.FULL_TEXT_VIEWS,citation.SciELO,capture.EXPORTS_SAVES,citation.CSCD
0,The Link Between Dietary Sugar Intake and Card...,10.1001/jama.2014.18267,,4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'pub': 121, 'doc': 46, 'sci': 14, 'com': 3}","{'all': {'count': 26368346, 'mean': 10.8933511...","['Nikhil V. Dhurandhar', 'Diana Thomas']",article,...,,,,,,,,,,
1,,10.1001/jama.2014.389,,,,,,,,,...,,,,,,,,,,
2,Ensuring Competency and Professionalism Throug...,10.1001/jama.2015.4017,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'pub': 19, 'com': 2, 'doc': 16, 'sci': 3}","{'all': {'count': 25373627, 'mean': 10.7452937...","['Humayun J. Chaudhry', 'J. Daniel Gifford', '...",article,...,,,,,,,,,,
3,Added Sugar Intake and Public Health—Reply,10.1001/jama.2015.6227,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'com': 1, 'pub': 10, 'doc': 4, 'sci': 2}","{'all': {'count': 26386754, 'mean': 10.8978605...","['Nikhil V. Dhurandhar', 'Diana Thomas']",article,...,,,,,,,,,,
4,Expectations for Physicians Prescribing Mariju...,10.1001/jama.2016.16109,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,{'pub': 3},"{'all': {'count': 25374647, 'mean': 10.7456918...",['Humayun J. Chaudhry'],article,...,,,,,,,,,,


In [80]:
# Check for missing data
missing_data_summary = merged_df.isnull().sum()

# Display the summary of missing data
print("Missing Data Summary:")
print(missing_data_summary[missing_data_summary > 0])

# Fill remaining missing values for essential columns (instead of dropping columns)
merged_df_filled = merged_df.fillna('Unknown')

# Display the first few rows after filling missing data
merged_df_filled.head()


Missing Data Summary:
title                         14102
isbns                         15363
altmetric_jid                 14267
issns                         14154
journal                       14219
                              ...  
citation.PubMed Guidelines    21846
mention.FULL_TEXT_VIEWS       22036
citation.SciELO               22062
capture.EXPORTS_SAVES         22059
citation.CSCD                 22064
Length: 80, dtype: int64


Unnamed: 0,title,doi,isbns,altmetric_jid,issns,journal,cohorts,context,authors,type,...,mention.REFERENCE_COUNT,citation.SSRN,mention.ABSTRACT_VIEWS,mention.DOWNLOAD_COUNT,mention.ALL_BLOG_COUNT,citation.PubMed Guidelines,mention.FULL_TEXT_VIEWS,citation.SciELO,capture.EXPORTS_SAVES,citation.CSCD
0,The Link Between Dietary Sugar Intake and Card...,10.1001/jama.2014.18267,Unknown,4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'pub': 121, 'doc': 46, 'sci': 14, 'com': 3}","{'all': {'count': 26368346, 'mean': 10.8933511...","['Nikhil V. Dhurandhar', 'Diana Thomas']",article,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,Unknown,10.1001/jama.2014.389,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,Ensuring Competency and Professionalism Throug...,10.1001/jama.2015.4017,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'pub': 19, 'com': 2, 'doc': 16, 'sci': 3}","{'all': {'count': 25373627, 'mean': 10.7452937...","['Humayun J. Chaudhry', 'J. Daniel Gifford', '...",article,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,Added Sugar Intake and Public Health—Reply,10.1001/jama.2015.6227,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'com': 1, 'pub': 10, 'doc': 4, 'sci': 2}","{'all': {'count': 26386754, 'mean': 10.8978605...","['Nikhil V. Dhurandhar', 'Diana Thomas']",article,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,Expectations for Physicians Prescribing Mariju...,10.1001/jama.2016.16109,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,{'pub': 3},"{'all': {'count': 25374647, 'mean': 10.7456918...",['Humayun J. Chaudhry'],article,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [88]:
import pandas as pd

# Normalize the 'cohorts' column
cohorts_normalized = pd.json_normalize(merged_df_filled['cohorts'])

# Drop the original 'cohorts' column and join the normalized data
merged_df_filled = merged_df_filled.drop(columns=['cohorts']).join(cohorts_normalized)

# Display the first few rows after normalization
print(merged_df_filled.head())


                                               title                      doi  \
0  The Link Between Dietary Sugar Intake and Card...  10.1001/jama.2014.18267   
1                                            Unknown    10.1001/jama.2014.389   
2  Ensuring Competency and Professionalism Throug...   10.1001/jama.2015.4017   
3         Added Sugar Intake and Public Health—Reply   10.1001/jama.2015.6227   
4  Expectations for Physicians Prescribing Mariju...  10.1001/jama.2016.16109   

     isbns             altmetric_jid                       issns  \
0  Unknown  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   
1  Unknown                   Unknown                     Unknown   
2       []  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   
3       []  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   
4       []  4f6fa4ee3cf058f610002c38  ['0098-7484', '1538-3598']   

                                             journal  \
0  JAMA: Journal of the American Medical Associa

In [90]:
# Save the cleaned DataFrame to a new CSV file
merged_df_filled.to_csv('cleaned_merged_data_final_v2.csv', index=False)


In [92]:
# Save the cleaned DataFrame to a new CSV file
merged_df_filled.head()

Unnamed: 0,title,doi,isbns,altmetric_jid,issns,journal,context,authors,type,handles,...,mention.REFERENCE_COUNT,citation.SSRN,mention.ABSTRACT_VIEWS,mention.DOWNLOAD_COUNT,mention.ALL_BLOG_COUNT,citation.PubMed Guidelines,mention.FULL_TEXT_VIEWS,citation.SciELO,capture.EXPORTS_SAVES,citation.CSCD
0,The Link Between Dietary Sugar Intake and Card...,10.1001/jama.2014.18267,Unknown,4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'all': {'count': 26368346, 'mean': 10.8933511...","['Nikhil V. Dhurandhar', 'Diana Thomas']",article,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,Unknown,10.1001/jama.2014.389,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,Ensuring Competency and Professionalism Throug...,10.1001/jama.2015.4017,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'all': {'count': 25373627, 'mean': 10.7452937...","['Humayun J. Chaudhry', 'J. Daniel Gifford', '...",article,[],...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,Added Sugar Intake and Public Health—Reply,10.1001/jama.2015.6227,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'all': {'count': 26386754, 'mean': 10.8978605...","['Nikhil V. Dhurandhar', 'Diana Thomas']",article,[],...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,Expectations for Physicians Prescribing Mariju...,10.1001/jama.2016.16109,[],4f6fa4ee3cf058f610002c38,"['0098-7484', '1538-3598']",JAMA: Journal of the American Medical Association,"{'all': {'count': 25374647, 'mean': 10.7456918...",['Humayun J. Chaudhry'],article,[],...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
