# Data clean up from the parquet file

In [16]:
import pandas as pd
import sys  # To exit gracefully on error

In [17]:
# --- Configuration ---
# Path of the input Parquet file that the load cell below reads into `df`.
# It is a relative path, so it is resolved against the kernel's working directory;
# update it if the file lives elsewhere.
PARQUET_FILE_PATH = 'authors_raw_data.parquet'

In [18]:
# --- Load the raw Parquet file into `df` ---
print(f"Attempting to read Parquet file: {PARQUET_FILE_PATH}")
try:
    # The engine is normally auto-detected; naming 'pyarrow' makes the dependency explicit.
    df = pd.read_parquet(PARQUET_FILE_PATH, engine='pyarrow')
except FileNotFoundError:
    # Wrong path or missing file: report it and stop with a non-zero exit code.
    print(
        f"Error: File not found at '{PARQUET_FILE_PATH}'",
        "Please ensure the file exists and the path is correct.",
        sep="\n",
    )
    sys.exit(1)
except ImportError:
    # Raised when the Parquet engine is not installed.
    print(
        "Error: Missing dependency. Please install pandas and pyarrow:",
        "pip install pandas pyarrow",
        sep="\n",
    )
    sys.exit(1)
except Exception as read_error:
    # Any other failure: show the message and stop.
    print(f"An unexpected error occurred while reading the file: {read_error}")
    sys.exit(1)
else:
    # Reached only when the read raised nothing.
    print("Successfully loaded Parquet file into DataFrame.")

Attempting to read Parquet file: authors_raw_data.parquet
Successfully loaded Parquet file into DataFrame.


In [19]:
# --- Inspect the DataFrame ---
# 1. Dimensions: df.shape is a (rows, columns) tuple.
print(
    "\n--- DataFrame Inspection ---",
    f"\n1. Shape (rows, columns): {df.shape}",
    sep="\n",
)


--- DataFrame Inspection ---

1. Shape (rows, columns): (44381, 16)


In [20]:
# 2. Preview the leading rows (five is also DataFrame.head's default)
print("\n2. First 5 rows (head):")
# For wide frames, wrap the call in pd.option_context('display.max_columns', None)
# to show every column without changing the global display settings.
print(df.head(n=5))


2. First 5 rows (head):
                                 id  \
0  https://openalex.org/A5010715704   
1  https://openalex.org/A5060085599   
2  https://openalex.org/A5094077737   
3  https://openalex.org/A5108452435   
4  https://openalex.org/A5077470111   

                                                 ids       display_name  \
0  {'openalex': 'https://openalex.org/A5010715704...  Оleg G. Sinyashin   
1  {'openalex': 'https://openalex.org/A5060085599...      А. Р. Бурилов   
2  {'openalex': 'https://openalex.org/A5094077737...         Klaus Gräf   
3  {'openalex': 'https://openalex.org/A5108452435...        J Zieliński   
4  {'openalex': 'https://openalex.org/A5077470111...           N. Wrage   

                           display_name_alternatives  \
0  [O. G. Sinyashin, Oleg Gerol'dovich Sinyashin,...   
1  [A. R. Burilov, Аlexander R. Burilov, А. Р. Бу...   
2  [G. Gogola ́k, Klaus Gräf, Graf K, K. Graf, Kl...   
3  [Jan Zieliński, J. Zieliński, J Zielínski, J Z...   
4  [N. Wr

In [21]:
# 3. Column names, shown as a plain Python list
print("\n3. Column names:")
print(df.columns.tolist())


3. Column names:
['id', 'ids', 'display_name', 'display_name_alternatives', 'affiliations', 'cited_by_count', 'last_known_institutions', 'orcid', 'summary_stats', 'works_count', 'counts_by_year', 'concepts', 'created_date', 'updated_date', 'x_concepts', 'university_key']


In [22]:
# 4. Data Types and Non-Null Counts
print("\n4. Data types and non-null info:")
# This is very useful to see if data types were inferred correctly and find missing values
df.info(verbose=True, show_counts=True)


4. Data types and non-null info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44381 entries, 0 to 44380
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   id                         44381 non-null  object
 1   ids                        44381 non-null  object
 2   display_name               44381 non-null  object
 3   display_name_alternatives  44381 non-null  object
 4   affiliations               44381 non-null  object
 5   cited_by_count             44381 non-null  int64 
 6   last_known_institutions    44381 non-null  object
 7   orcid                      15185 non-null  object
 8   summary_stats              44381 non-null  object
 9   works_count                44381 non-null  int64 
 10  counts_by_year             44381 non-null  object
 11  concepts                   44381 non-null  object
 12  created_date               44381 non-null  object
 13  updated_date               

In [23]:
# 5. Descriptive statistics for the numeric columns
print("\n5. Summary statistics (numerical columns):")
# Reports count, mean, std, min, max and the quartiles (spelled out here; they are the defaults).
# Pass include='all' to describe object/string columns as well.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))


5. Summary statistics (numerical columns):
       cited_by_count   works_count
count    44381.000000  44381.000000
mean       734.276627     35.071337
std       2943.364003     80.553636
min          0.000000      0.000000
25%          6.000000      2.000000
50%         56.000000      8.000000
75%        399.000000     35.000000
max     126565.000000   3113.000000


In [24]:

# 6. Missing values: number of nulls in each column
print("\n6. Count of missing values per column:")
missing_values = df.isna().sum()
# Keep only the columns that actually contain nulls.
print(missing_values.loc[missing_values.gt(0)])


6. Count of missing values per column:
orcid    29196
dtype: int64


In [25]:
def _is_empty_list_like(value):
    """Return True when ``value`` is a sized, list-like container with no elements.

    With the pyarrow engine, list-typed Parquet cells arrive as ``numpy.ndarray``
    objects rather than Python lists, so an ``isinstance(value, list)`` test never
    matches them. ``pd.api.types.is_list_like`` accepts lists, tuples and arrays
    while rejecting strings and scalars. Dicts (struct cells) are excluded because
    they are not lists.
    """
    return (
        pd.api.types.is_list_like(value)
        and not isinstance(value, dict)
        and hasattr(value, "__len__")
        and len(value) == 0
    )


# Count the columns in which every single entry is an empty list-like value.
empty_list_columns_count = 0
for column in df.columns:
    # all() over zero rows is vacuously True, so a frame without rows must not
    # report every column as "all empty lists".
    if len(df) > 0 and all(_is_empty_list_like(item) for item in df[column]):
        empty_list_columns_count += 1

print(f"Number of columns with all empty lists: {empty_list_columns_count}")

Number of columns with all empty lists: 0


In [26]:
# 7. Frequency of each distinct value in one column (here: university_key)
if 'university_key' not in df.columns:
    # Guard against a frame that lacks the column.
    print("\n7. 'university_key' column not found.")
else:
    print("\n7. Value counts for 'university_key':")
    print(df['university_key'].value_counts())


7. Value counts for 'university_key':
university_key
GR_UOP              17085
IT_UNISS             8167
ES_UIB               6150
PL_ZUT               3518
FR_ULHN              2456
FR_UAG               2160
PT_UAC               1986
HR_UNIDU             1208
FO_UF                 632
DE_HOCHSTRALSUND      505
BG_BFU                400
FI_AUAS                61
SL_EMUNI               53
Name: count, dtype: int64


In [27]:
# Spot-check three author rows picked at random.
# No random_state is passed, so each run shows a different sample.
print("\nDisplaying a few random authors with all columns:")
random_authors = df.sample(n=3)
print(random_authors)



Displaying a few random authors with all columns:
                                     id  \
27616  https://openalex.org/A5043075188   
35913  https://openalex.org/A5052873518   
32150  https://openalex.org/A5114942097   

                                                     ids  \
27616  {'openalex': 'https://openalex.org/A5043075188...   
35913  {'openalex': 'https://openalex.org/A5052873518...   
32150  {'openalex': 'https://openalex.org/A5114942097...   

                 display_name  \
27616             Weidong Lyu   
35913  Pascal-M. Aggensteiner   
32150         P. E. Contarini   

                               display_name_alternatives  \
27616                         [Weidong Lyu, Lyu Weidong]   
35913  [P M Aggensteiner, Pascal‐M. Aggensteiner, Pas...   
32150                    [PE Contarini, P. E. Contarini]   

                                            affiliations  cited_by_count  \
27616  [{'institution': {'country_code': 'CN', 'displ...              43   
35913  [{

---

In [28]:
# Export one randomly chosen row as a CSV example; the row index is left out of the file.
# No random_state is passed, so the exported row differs between runs.
df.sample(n=1).to_csv(
    'example_row.csv',
    sep=',',
    index=False,
)

## Data preparation

In [None]:
# TODO: drop the 'concepts' column (not to be confused with 'x_concepts', which is fine to keep)

Save back to a parquet file

In [29]:
# NOTE(review): this save step is entirely commented out, so nothing is written
# when the notebook runs. Presumably it stays disabled until the data-preparation
# step is implemented — confirm before re-enabling.
# parquet_file_path = 'authors_clean_data.parquet'

# try:
#     # Save the DataFrame to a Parquet file
#     # `index=False` prevents writing the DataFrame index as a column
#     df.to_parquet(parquet_file_path, index=False, engine='pyarrow')
#     print(
#         f"Successfully saved data for {len(df)} authors to {parquet_file_path}")

# except Exception as e:
#     print(f"Error saving data to Parquet: {e}")