# Data cleanup from the Parquet file

In [1]:
import pandas as pd
import sys  # To exit gracefully on error

In [2]:
# --- Configuration ---
# Location of the Parquet dataset that the rest of the notebook loads.
# Point this at a different file to inspect another dataset.
PARQUET_FILE_PATH = "university_papers_data_full.parquet"

In [3]:
# --- Load the Parquet file into a DataFrame ---
print(f"Attempting to read Parquet file: {PARQUET_FILE_PATH}")
try:
    # The pyarrow engine is requested explicitly instead of relying on
    # pandas' automatic engine selection.
    df = pd.read_parquet(PARQUET_FILE_PATH, engine='pyarrow')
except FileNotFoundError:
    # The configured path does not resolve to an existing file.
    print(
        f"Error: File not found at '{PARQUET_FILE_PATH}'",
        "Please ensure the file exists and the path is correct.",
        sep="\n",
    )
    sys.exit(1)  # Abort with a non-zero status
except ImportError:
    # Raised when a required Parquet dependency cannot be imported.
    print(
        "Error: Missing dependency. Please install pandas and pyarrow:",
        "pip install pandas pyarrow",
        sep="\n",
    )
    sys.exit(1)
except Exception as exc:
    # Catch-all for any other failure: report the message, then abort.
    print(f"An unexpected error occurred while reading the file: {exc}")
    sys.exit(1)
else:
    # Reached only when the read completed without raising.
    print("Successfully loaded Parquet file into DataFrame.")

Attempting to read Parquet file: university_papers_data_full.parquet
Successfully loaded Parquet file into DataFrame.


In [4]:
# --- DataFrame overview ---
print("\n--- DataFrame Inspection ---")
# 1. Dimensions, reported as a (row count, column count) tuple
frame_shape = df.shape
print(f"\n1. Shape (rows, columns): {frame_shape}")


--- DataFrame Inspection ---

1. Shape (rows, columns): (133964, 23)


In [5]:
# 2. Preview of the first five rows
print("\n2. First 5 rows (head):")
# Tip: to show every column of a wide frame, set the pandas option
# 'display.max_columns' to None before printing and reset it afterwards.
first_rows = df.head(5)
print(first_rows)


2. First 5 rows (head):
                        openalex_id  \
0  https://openalex.org/W2059474375   
1  https://openalex.org/W2096769811   
2  https://openalex.org/W2769188897   
3  https://openalex.org/W2012883856   
4  https://openalex.org/W2884312484   

                                                 doi language     type  \
0  https://doi.org/10.1016/j.progpolymsci.2012.04...       en  article   
1             https://doi.org/10.1002/mame.201300008       en  article   
2            https://doi.org/10.1111/1541-4337.12322       en  article   
3       https://doi.org/10.1016/j.seppur.2010.03.021       en   review   
4        https://doi.org/10.1016/j.omega.2018.07.004       en  article   

                                               title publication_date  \
0  Biocomposites reinforced with natural fibers: ...       2012-05-02   
1  Progress Report on Natural Fiber Reinforced Co...       2013-06-19   
2             Active Packaging Applications for Food       2017-11-28   
3  

In [6]:
# 3. Column labels, printed as a plain Python list
print("\n3. Column names:")
column_names = df.columns.tolist()
print(column_names)


3. Column names:
['openalex_id', 'doi', 'language', 'type', 'title', 'publication_date', 'primary_location', 'open_access', 'institutions', 'authors', 'cited_by_count', 'fwci', 'citation_normalized_percentile', 'is_retracted', 'is_paratext', 'abstract', 'primary_topic', 'topics', 'keywords', 'cited_by_api_url', 'updated_date', 'created_date', 'university_key']


In [7]:
# 4. Data Types and Non-Null Counts
print("\n4. Data types and non-null info:")
# This is very useful to see if data types were inferred correctly and find missing values
df.info(verbose=True, show_counts=True)


4. Data types and non-null info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133964 entries, 0 to 133963
Data columns (total 23 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   openalex_id                     133964 non-null  object 
 1   doi                             121318 non-null  object 
 2   language                        133311 non-null  object 
 3   type                            133964 non-null  object 
 4   title                           133621 non-null  object 
 5   publication_date                133964 non-null  object 
 6   primary_location                132680 non-null  object 
 7   open_access                     133964 non-null  object 
 8   institutions                    133964 non-null  object 
 9   authors                         133964 non-null  object 
 10  cited_by_count                  133964 non-null  int64  
 11  fwci                            124437 non-n

In [8]:
# 5. Descriptive statistics for the numerical columns
print("\n5. Summary statistics (numerical columns):")
# Reports count, mean, std, min, max and the quartiles.
# Pass include='all' to describe() to cover object/string columns as well.
numeric_summary = df.describe()
print(numeric_summary)


5. Summary statistics (numerical columns):
       cited_by_count           fwci
count   133964.000000  124437.000000
mean        20.285942       1.755948
std         77.344626       4.471464
min          0.000000       0.000000
25%          1.000000       0.000000
50%          5.000000       0.665000
75%         19.000000       1.920000
max      10352.000000     330.051000


In [9]:

# 6. Missing values: number of nulls in each column
print("\n6. Count of missing values per column:")
missing_values = df.isna().sum()
# Keep only the columns that actually contain at least one null
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print(columns_with_gaps)


6. Count of missing values per column:
doi                               12646
language                            653
title                               343
primary_location                   1284
fwci                               9527
citation_normalized_percentile     1671
abstract                          49592
primary_topic                      1419
dtype: int64


In [10]:
# 7. Frequency of each distinct value in one column (here: university_key)
if 'university_key' not in df.columns:
    print("\n7. 'university_key' column not found.")
else:
    print("\n7. Value counts for 'university_key':")
    university_counts = df['university_key'].value_counts()
    print(university_counts)


7. Value counts for 'university_key':
university_key
GR_UOP              49574
IT_UNISS            25083
ES_UIB              21876
PL_ZUT              13803
FR_ULHN              6989
PT_UAC               6534
FR_UAG               3790
HR_UNIDU             2660
FO_UF                1791
DE_HOCHSTRALSUND     1079
BG_BFU                642
FI_AUAS                76
SL_EMUNI               67
Name: count, dtype: int64


In [11]:
# 8. Example: Accessing data for a specific paper (e.g., the first one)
print("\n8. Data for the first paper (index 0):")
try:
    # Dict view of the first row; iloc[0] raises IndexError on an empty frame.
    first_paper_data = df.iloc[0].to_dict()
    # A null title is stored under the 'title' key as a non-string value, so
    # the .get default does not apply and slicing it would raise TypeError.
    # Only string titles are sliced; anything else is shown as 'N/A'.
    first_title = first_paper_data.get('title', 'N/A')
    if isinstance(first_title, str):
        title_preview = f"{first_title[:80]}..."
    else:
        title_preview = 'N/A'
    # Print selected fields for brevity
    print(f"  - OpenAlex ID: {first_paper_data.get('openalex_id', 'N/A')}")
    print(f"  - Title: {title_preview}")
    print(
        f"  - University: {first_paper_data.get('university_key', 'N/A')}")
    print(f"  - Cited By: {first_paper_data.get('cited_by_count', 'N/A')}")
    # You could print the whole dictionary if needed: print(first_paper_data)
except IndexError:
    print("  DataFrame seems empty, cannot access index 0.")


8. Data for the first paper (index 0):
  - OpenAlex ID: https://openalex.org/W2059474375
  - Title: Biocomposites reinforced with natural fibers: 2000–2010...
  - University: PL_ZUT
  - Cited By: 3697


In [12]:
# 9. How many papers are flagged as retracted
if 'is_retracted' not in df.columns:
    print("\n9. 'is_retracted' column not found.")
else:
    # Summing a boolean column counts its True entries (True -> 1, False -> 0)
    num_retracted = df['is_retracted'].sum()
    print(f"\n9. Number of retracted papers: {num_retracted}")


9. Number of retracted papers: 19


In [13]:
# 10. How many papers are flagged as paratext
if 'is_paratext' not in df.columns:
    print("\n10. 'is_paratext' column not found.")
else:
    # Summing a boolean column counts its True entries (True -> 1, False -> 0)
    num_paratext = df['is_paratext'].sum()
    print(f"\n10. Number of paratext papers: {num_paratext}")


10. Number of paratext papers: 317


In [14]:
# 11. Display one retracted paper
if 'is_retracted' in df.columns and any(df['is_retracted']):
    retracted_paper = df[df['is_retracted'] == True].iloc[0].to_dict()
    print("\n11. Example of a retracted paper:")
    print(f"  - OpenAlex ID: {retracted_paper.get('openalex_id', 'N/A')}")
    print(f"  - Title: {retracted_paper.get('title', 'N/A')[:80]}...")
    print(
        f"  - University: {retracted_paper.get('university_key', 'N/A')}")
    print(f"  - Cited By: {retracted_paper.get('cited_by_count', 'N/A')}")
else:
    print("\n11. No retracted papers found or 'is_retracted' column missing.")


11. Example of a retracted paper:
  - OpenAlex ID: https://openalex.org/W4281390396
  - Title: Toxic and repellent impacts of botanical oils against Callosobruchus maculatus (...
  - University: PL_ZUT
  - Cited By: 15


In [15]:
# 12. Display one paratext paper
if 'is_paratext' in df.columns and any(df['is_paratext']):
    paratext_paper = df[df['is_paratext'] == True].iloc[0].to_dict()
    print("\n12. Example of a paratext paper:")
    print(f"  - OpenAlex ID: {paratext_paper.get('openalex_id', 'N/A')}")
    print(f"  - University: {paratext_paper.get('university_key', 'N/A')}")
    print(f"  - Cited By: {paratext_paper.get('cited_by_count', 'N/A')}")
    print(f"  - Language: {paratext_paper.get('language', 'N/A')}")
else:
    print("\n12. No paratext papers found or 'is_paratext' column missing.")



12. Example of a paratext paper:
  - OpenAlex ID: https://openalex.org/W4399874720
  - University: PL_ZUT
  - Cited By: 0
  - Language: en


In [16]:
# 13. Count the papers without an abstract
if 'abstract' in df.columns:
    num_no_abstract = df['abstract'].isnull().sum()
    print(f"\n13. Number of papers without an abstract: {num_no_abstract}")
    num_with_abstract = df['abstract'].notnull().sum()
    print(f"14. Number of papers with an abstract: {num_with_abstract}")
    print(f"15. Percentage of papers with an abstract: {num_with_abstract / df.shape[0] * 100:.2f}%")
    print("Total number of papers (shape[0]): ", df.shape[0])
    print("Total number of papers (sum): ", num_no_abstract + num_with_abstract)


13. Number of papers without an abstract: 49592
14. Number of papers with an abstract: 84372
15. Percentage of papers with an abstract: 62.98%
Total number of papers (shape[0]):  133964
Total number of papers (sum):  133964


---