# Gold Layer Data Exploration

This notebook explores the gold-layer Parquet datasets to understand their structure and content, as input to the MCP server design.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Show every column at full terminal width, but cap long cell text at 50 characters
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': None,
    'display.max_colwidth': 50,
}.items():
    pd.set_option(option_name, option_value)

## Load Gold Layer Datasets

In [2]:
def load_gold_dataset(dataset_name, base_path="local_data/gold"):
    """Load a gold layer dataset from its directory.

    Reads every ``*.parquet`` file located directly inside
    ``<base_path>/<dataset_name>`` and returns the rows as one DataFrame.

    Args:
        dataset_name: Name of the dataset sub-directory under ``base_path``.
        base_path: Root directory that holds the gold datasets. Defaults to
            ``local_data/gold``, resolved against the working directory.

    Returns:
        A DataFrame with the dataset's rows, or None (after printing a
        message) when the directory contains no parquet files.
    """
    dataset_path = Path(base_path) / dataset_name

    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the resulting row order reproducible across runs and machines.
    parquet_files = sorted(dataset_path.glob("*.parquet"))

    if not parquet_files:
        print(f"No parquet files found in {dataset_path}")
        return None

    # If the directory holds several part files, load all of them instead of
    # silently keeping only one and dropping the remaining rows.
    parts = [pd.read_parquet(parquet_file) for parquet_file in parquet_files]
    if len(parts) == 1:
        df = parts[0]
    else:
        # Each part carries its own index, so rebuild a single clean one.
        df = pd.concat(parts, ignore_index=True)

    print(f"Loaded {dataset_name}: {df.shape}")
    return df

In [3]:
# Load the four gold tables in a fixed order; a table is None when its files are missing
country_data, institute_engagement, institute_summary, program_engagement = map(
    load_gold_dataset,
    (
        "gold_country_program_summary",
        "gold_institute_engagement",
        "gold_institute_program_summary",
        "gold_program_engagement",
    ),
)

Loaded gold_country_program_summary: (26, 8)
Loaded gold_institute_engagement: (98, 8)
Loaded gold_institute_program_summary: (76, 10)
Loaded gold_program_engagement: (5977, 11)


## Explore Country Program Summary Data

In [4]:
# Overview of the country-level table: dimensions, schema, then a short preview
if country_data is not None:
    print(
        "Country Program Summary Dataset",
        "=" * 40,
        f"Shape: {country_data.shape}",
        f"Columns: {list(country_data.columns)}",
        "\nData Types:",
        country_data.dtypes,
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(country_data.head())

Country Program Summary Dataset
Shape: (26, 8)
Columns: ['institute_country', 'num_institutes', 'total_programs', 'avg_institute_ctr', 'total_views', 'total_impressions', 'country_rank', 'top_programs']

Data Types:
institute_country     object
num_institutes         int64
total_programs         int64
avg_institute_ctr    float64
total_views            int64
total_impressions      int64
country_rank           int32
top_programs          object
dtype: object

First 5 rows:


Unnamed: 0,institute_country,num_institutes,total_programs,avg_institute_ctr,total_views,total_impressions,country_rank,top_programs
0,United Kingdom,6,790,0.62,915932,2099792,1,"[{'program_name': 'Zoology', 'ctr': 0.33578431..."
1,Portugal,1,26,0.598,2852,5724,2,"[{'program_name': 'Wine, Tourism and Innovatio..."
2,Turkey,4,294,0.545,311593,997145,3,"[{'program_name': 'Zootechnique', 'ctr': 0.852..."
3,Canada,4,320,0.526,470027,1729844,4,[{'program_name': 'Workplace Safety and Preven...
4,United States,3,128,0.521,407046,1044612,5,[{'program_name': 'Visual and Performing Arts'...


In [None]:
# Inspect one top_programs entry to see how the nested values are stored
if country_data is not None and 'top_programs' in country_data.columns:
    print("Top Programs Column Structure:", "=" * 30, sep="\n")
    # The first row is enough to reveal the container type and element layout
    sample_top_programs = country_data['top_programs'].iloc[0]
    print(
        f"Type: {type(sample_top_programs)}",
        f"Sample value: {sample_top_programs}",
        sep="\n",
    )

## Explore Institute Engagement Data

In [None]:
# Overview of the institute engagement table: dimensions, schema, then a short preview
if institute_engagement is not None:
    overview_lines = "\n".join([
        "Institute Engagement Dataset",
        "=" * 40,
        f"Shape: {institute_engagement.shape}",
        f"Columns: {list(institute_engagement.columns)}",
        "\nData Types:",
    ])
    print(overview_lines)
    print(institute_engagement.dtypes)
    print("\nFirst 5 rows:")
    display(institute_engagement.head())

In [None]:
# Count how many institutes fall under each institute_type value
if institute_engagement is not None:
    if 'institute_type' in institute_engagement.columns:
        print(
            "Institute Types:",
            institute_engagement['institute_type'].value_counts(),
            sep="\n",
        )

## Explore Institute Program Summary Data

In [5]:
# Overview of the per-institute summary table: dimensions, schema, then a short preview
if institute_summary is not None:
    print("Institute Program Summary Dataset", "=" * 40, sep="\n")
    print(
        f"Shape: {institute_summary.shape}",
        f"Columns: {list(institute_summary.columns)}",
        sep="\n",
    )
    print("\nData Types:", institute_summary.dtypes, sep="\n")
    print("\nFirst 5 rows:")
    display(institute_summary.head())

Institute Program Summary Dataset
Shape: (76, 10)
Columns: ['institute_id', 'institute_name', 'institute_country', 'institute_type', 'num_programs', 'institute_avg_ctr', 'institute_total_views', 'institute_total_impressions', 'institute_rank', 'top_programs']

Data Types:
institute_id                    object
institute_name                  object
institute_country               object
institute_type                  object
num_programs                     int64
institute_avg_ctr              float64
institute_total_views            int64
institute_total_impressions      int64
institute_rank                   int32
top_programs                    object
dtype: object

First 5 rows:


Unnamed: 0,institute_id,institute_name,institute_country,institute_type,num_programs,institute_avg_ctr,institute_total_views,institute_total_impressions,institute_rank,top_programs
0,67dd448888892aacf5966f88,University of Santiago de Compostela,Spain,PUBLIC,91,0.697,5937,10143,1,[{'program_name': 'Work and Organizational Psy...
1,66685c63e09c14d40915d47b,University of Bath,United Kingdom,PUBLIC,135,0.69,90246,188397,2,[{'program_name': 'Translation and Professiona...
2,63e3721cb1a6067d99a80abf,Imperial College London,United Kingdom,PUBLIC,138,0.65,218945,498933,3,"[{'program_name': 'Transport', 'ctr': 0.480369..."
3,6478832f4097f1003d17916c,Sabanci University,Turkey,PRIVATE,25,0.644,37097,75229,4,[{'program_name': 'Visual Arts and Visual Comm...
4,67d1367888892aacf53f6c46,University of Jaén,Spain,PUBLIC,35,0.634,3673,5878,5,[{'program_name': 'Telecommunication Engineeri...


## Explore Program Engagement Data

In [6]:
# Overview of the program-level engagement table: dimensions, schema, then a short preview
if program_engagement is not None:
    print(
        "Program Engagement Dataset\n"
        + "=" * 40
        + f"\nShape: {program_engagement.shape}"
        + f"\nColumns: {list(program_engagement.columns)}"
        + "\n\nData Types:"
    )
    print(program_engagement.dtypes)
    print("\nFirst 5 rows:")
    display(program_engagement.head())

Program Engagement Dataset
Shape: (5977, 11)
Columns: ['name', 'program_id', 'institute_id', 'institute_name', 'countries', 'program_duration', 'tuition', 'total_views', 'total_impressions', 'avg_ctr', 'rank_by_ctr']

Data Types:
name                  object
program_id            object
institute_id          object
institute_name        object
countries             object
program_duration       int64
tuition              float64
total_views            int64
total_impressions      int64
avg_ctr              float64
rank_by_ctr            int32
dtype: object

First 5 rows:


Unnamed: 0,name,program_id,institute_id,institute_name,countries,program_duration,tuition,total_views,total_impressions,avg_ctr,rank_by_ctr
0,Renewable Energies and Energy Efficiency,67cee87f6400e7152f7c15db,67c5b84d8e2c2e341bf3fab8,University of Zaragoza,[Spain],48,1378.72,4854,4942,0.982193,1
1,Civil Engineering with Second Major in Entrepr...,66dac376625be65e46d8905d,66d823c2c6e125955aa84c48,Nanyang Technological University,[Singapore],48,5262.5,1039,1060,0.980189,2
2,Mechanical Engineering with Second Major in En...,66deaaa5625be65e46f28b4a,66d823c2c6e125955aa84c48,Nanyang Technological University,[Singapore],48,5262.5,632,646,0.978328,3
3,Aerospace Engineering with Second Major in Ent...,66dea9a1625be65e46f2781a,66d823c2c6e125955aa84c48,Nanyang Technological University,[Singapore],48,5262.5,457,469,0.974414,4
4,Natural Sciences at the Faculty of Environment...,668849f6832f8e31a77b65f7,6681a3b9832f8e31a753bf44,University of Graz,[Austria],36,363.36,4898,5044,0.971055,5


In [7]:
# Inspect one countries entry to see which container type the column holds
if program_engagement is not None and 'countries' in program_engagement.columns:
    print("Countries Column Structure:", "=" * 30, sep="\n")
    # Only the first row is sampled here
    sample_countries = program_engagement['countries'].iloc[0]
    print(
        f"Type: {type(sample_countries)}",
        f"Sample value: {sample_countries}",
        sep="\n",
    )

Countries Column Structure:
Type: <class 'numpy.ndarray'>
Sample value: ['Spain']


## Data Quality Analysis

In [8]:
def analyze_data_quality(df, name):
    """Print a short data-quality report for one dataset.

    The report lists the columns that contain missing values (with their
    counts) and, when the frame has numeric columns, displays their
    ``describe()`` statistics. Nothing is printed when ``df`` is None.

    Args:
        df: DataFrame to inspect, or None if the dataset failed to load.
        name: Label used in the report heading.
    """
    if df is None:
        return

    print(f"\nData Quality Analysis: {name}")
    print("=" * 50)

    # Per-column null counts; only columns with at least one null are listed
    null_counts = df.isnull().sum()
    has_missing = null_counts.sum() > 0
    if not has_missing:
        print("No missing values found")
    else:
        print("Missing values:")
        print(null_counts[null_counts > 0])

    # Summary statistics apply only to numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return
    print("\nNumeric column statistics:")
    display(df[numeric_cols].describe())

# Run the quality report on each dataset, in the order they were loaded
for dataset, label in (
    (country_data, "Country Program Summary"),
    (institute_engagement, "Institute Engagement"),
    (institute_summary, "Institute Program Summary"),
    (program_engagement, "Program Engagement"),
):
    analyze_data_quality(dataset, label)


Data Quality Analysis: Country Program Summary
No missing values found

Numeric column statistics:


Unnamed: 0,num_institutes,total_programs,avg_institute_ctr,total_views,total_impressions,country_rank
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,2.923077,157.269231,0.4365,236515.961538,882475.6,13.5
std,2.279001,223.852908,0.08814,248119.750458,1103129.0,7.648529
min,1.0,1.0,0.289,2852.0,5724.0,1.0
25%,1.0,19.25,0.363,50400.75,141032.8,7.25
50%,2.0,101.0,0.4365,114807.5,468244.0,13.5
75%,4.0,176.0,0.49575,363620.0,1072267.0,19.75
max,11.0,897.0,0.62,915932.0,4092920.0,26.0



Data Quality Analysis: Institute Engagement
No missing values found

Numeric column statistics:


Unnamed: 0,total_views,total_impressions,avg_ctr,rank_by_ctr
count,98.0,98.0,98.0,98.0
mean,15587.102041,374068.1,0.029816,48.479592
std,42457.496555,607996.7,0.038589,28.043129
min,0.0,135.0,0.0,1.0
25%,250.5,47312.5,0.0065,25.0
50%,3205.5,122859.0,0.0205,49.5
75%,13949.0,329606.8,0.039,73.5
max,282537.0,3476344.0,0.275,94.0



Data Quality Analysis: Institute Program Summary
No missing values found

Numeric column statistics:


Unnamed: 0,num_programs,institute_avg_ctr,institute_total_views,institute_total_impressions,institute_rank
count,76.0,76.0,76.0,76.0,76.0
mean,53.802632,0.459592,80913.355263,301899.6,38.368421
std,67.13251,0.126128,91701.702125,477766.0,22.061032
min,1.0,0.103,608.0,1204.0,1.0
25%,7.0,0.3785,11389.75,25656.5,19.5
50%,20.5,0.472,59117.5,155524.0,38.0
75%,79.75,0.5535,105509.5,317337.8,56.5
max,301.0,0.697,470453.0,2797223.0,76.0



Data Quality Analysis: Program Engagement
Missing values:
countries    8
dtype: int64

Numeric column statistics:


Unnamed: 0,program_duration,tuition,total_views,total_impressions,avg_ctr,rank_by_ctr
count,5977.0,5977.0,5977.0,5977.0,5977.0,5977.0
mean,30.676092,9484.389836,1359.90614,4634.69,0.519336,2988.49038
std,14.852222,16373.761987,12080.036274,33591.3,0.189008,1725.752056
min,1.0,0.0,1.0,13.0,0.006993,1.0
25%,16.0,1510.0,62.0,127.0,0.390977,1495.0
50%,24.0,5262.5,126.0,258.0,0.522124,2989.0
75%,48.0,14000.0,312.0,666.0,0.656085,4483.0
max,96.0,1000000.0,816919.0,1239236.0,0.982193,5977.0
