In [4]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Directory that holds the CSV files (relative path)
data_dir = "../data"

# Map each dataset name to its CSV file name; every file is named "<dataset>.csv"
datasets = {
    name: f"{name}.csv"
    for name in (
        "euroSciVoc",
        "legalBasis",
        "organization",
        "project",
        "topics",
        "webItem",
        "webLink",
    )
}

# Read a single dataset file from data_dir into a pandas DataFrame
def load_data(filename):
    """Read one semicolon-delimited CSV file located under ``data_dir``.

    Parameters
    ----------
    filename : str
        File name; it is joined onto the module-level ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_csv``.

    Notes
    -----
    ``on_bad_lines='skip'`` is passed, so malformed rows are dropped
    without raising; the returned frame may therefore contain fewer rows
    than the file has lines.
    """
    csv_path = os.path.join(data_dir, filename)
    frame = pd.read_csv(
        csv_path,
        sep=';',
        low_memory=False,
        on_bad_lines='skip',
    )
    return frame

In [5]:
# Load the euroSciVoc dataset
df_euroSciVoc = load_data(datasets["euroSciVoc"])

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
print("Dataset: euroSciVoc")
df_euroSciVoc.info()
print(df_euroSciVoc.head())
print("-" * 80)

# Check for missing values: per-column null counts, showing only columns
# that have at least one null
missing_values = df_euroSciVoc.isnull().sum()
print("Missing values in euroSciVoc:")
print(missing_values[missing_values > 0])
print("-" * 80)

# Basic statistics (for a mixed-dtype frame, describe() covers only the
# numeric columns by default)
print("Statistics for euroSciVoc:")
print(df_euroSciVoc.describe())
print("-" * 80)

Dataset: euroSciVoc
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38789 entries, 0 to 38788
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   projectID              38789 non-null  int64  
 1   euroSciVocCode         38789 non-null  object 
 2   euroSciVocPath         38789 non-null  object 
 3   euroSciVocTitle        38789 non-null  object 
 4   euroSciVocDescription  0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.5+ MB
None
   projectID         euroSciVocCode  \
0  101116741             /29/97/543   
1  101163161  /27/81/30021/30833628   
2  101163161    /23/43/251/48354418   
3  101163161         /23/43/257/761   
4  101163161                 /29/89   

                                      euroSciVocPath      euroSciVocTitle  \
0  /social sciences/political sciences/government...   government systems   
1  /agricultural sciences/agriculture, forestry, ...

In [6]:
# Load the legalBasis dataset
df_legalBasis = load_data(datasets["legalBasis"])

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
print("Dataset: legalBasis")
df_legalBasis.info()
print(df_legalBasis.head())
print("-" * 80)

# Check for missing values: per-column null counts, showing only columns
# that have at least one null
missing_values = df_legalBasis.isnull().sum()
print("Missing values in legalBasis:")
print(missing_values[missing_values > 0])
print("-" * 80)

# Basic statistics (for a mixed-dtype frame, describe() covers only the
# numeric columns by default)
print("Statistics for legalBasis:")
print(df_legalBasis.describe())
print("-" * 80)

Dataset: legalBasis
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20512 entries, 0 to 20511
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   projectID            20512 non-null  int64 
 1   legalBasis           20512 non-null  object
 2   title                20512 non-null  object
 3   uniqueProgrammePart  15341 non-null  object
dtypes: int64(1), object(3)
memory usage: 641.1+ KB
None
   projectID   legalBasis                            title uniqueProgrammePart
0  101116741  HORIZON.1.1  European Research Council (ERC)                True
1  101163161  HORIZON.1.1  European Research Council (ERC)                True
2  101160499  HORIZON.1.1  European Research Council (ERC)                True
3  101166905  HORIZON.1.1  European Research Council (ERC)                True
4  101162875  HORIZON.1.1  European Research Council (ERC)                True
-----------------------------------------------

In [None]:
# Load the organization dataset
df_organization = load_data(datasets["organization"])

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
print("Dataset: organization")
df_organization.info()
print(df_organization.head())
print("-" * 80)

# Check for missing values: per-column null counts, showing only columns
# that have at least one null
missing_values = df_organization.isnull().sum()
print("Missing values in organization:")
print(missing_values[missing_values > 0])
print("-" * 80)

# Basic statistics (for a mixed-dtype frame, describe() covers only the
# numeric columns by default)
print("Statistics for organization:")
print(df_organization.describe())
print("-" * 80)