In [1]:
import pandas as pd
import os

# Location of the gold-layer institute engagement extract (a single Parquet part file).
parquet_path = (
    '../local_data/gold/gold_institute_engagement/'
    'part-00000-tid-8721749531217473260-c3f47482-49cb-433f-b699-ef054b03890e-4609-1-c000.snappy.parquet'
)

# Stop early with a readable message if the extract is not where we expect it.
file_is_present = os.path.exists(parquet_path)
assert file_is_present, f"File not found: {parquet_path}"

# Read the extract into a DataFrame using the pyarrow engine.
df_institutes = pd.read_parquet(parquet_path, engine='pyarrow')

# Show the first rows as a quick sanity check of the columns and values.
institutes_preview = df_institutes.head()
print(institutes_preview)


               institute_id                              institute_name  \
0  6655c53d12a5ecbbb8d60bc7                      University of Helsinki   
1  6640ae46357bb23c781f7cc5                                       VAMIA   
2  6564ecca6fcf32747f42b2f2                                   Univacity   
3  63c00d416d8621c06f88f9e4           International European University   
4  63ca77e2d451eb29380b6a38  Universidad Católica San Antonio de Murcia   

  institute_country institute_type  total_views  total_impressions  avg_ctr  \
0           Finland         PUBLIC         2316               8425    0.275   
1           Finland        PRIVATE        27763             169202    0.164   
2    United Kingdom        PRIVATE           73                483    0.151   
3            Poland        PRIVATE       272410            2058971    0.132   
4             Spain        PRIVATE       282537            2238218    0.126   

   rank_by_ctr  
0            1  
1            2  
2            3  
3     

In [2]:
import psycopg2
from dotenv import load_dotenv
import os

# Pull the PG_* settings from a local .env file into the process environment.
load_dotenv()

# psycopg2.connect keyword -> environment variable that supplies its value.
pg_env_by_kwarg = {
    "host": "PG_HOST",
    "port": "PG_PORT",
    "user": "PG_USER",
    "password": "PG_PASSWORD",
    "dbname": "PG_DB",
}

# Resolve the values inline so the password is never stored in a notebook variable.
conn = psycopg2.connect(
    **{kwarg: os.getenv(env_name) for kwarg, env_name in pg_env_by_kwarg.items()}
)

# Cursor shared by the cells that follow.
cur = conn.cursor()


#### `country_id` links each institution to the right country via its numeric ID. This is necessary for relational integrity and proper joins in the database.

In [None]:
# Fetch every (country_id, institute_country) pair from the `countries` table.
cur.execute("SELECT country_id, institute_country FROM countries;")
rows = cur.fetchall()

# Mapping from country name to country_id, used to resolve the foreign key.
country_mapping = {country_name: country_id for country_id, country_name in rows}

# Close only the cursor. The connection must stay open because later cells
# (rollback, country insert, institution insert, verification query) reuse `conn`;
# closing it here makes a top-to-bottom run fail with "connection already closed".
cur.close()

print(country_mapping)

In [12]:
# Show unmapped countries.
# Computed straight from `country_mapping` rather than from a `country_id`
# column, because that column is only created in a later cell; reading it here
# raises KeyError when the notebook is run top to bottom on a fresh kernel.
has_mapping = df_institutes['institute_country'].isin(list(country_mapping))
unmapped = df_institutes.loc[~has_mapping, 'institute_country'].unique()
print("Unmapped countries:", unmapped)

Unmapped countries: ['Ghana' 'Rwanda' 'Zimbabwe']


In [13]:
# Register IDs for the countries reported as unmapped above so that every
# institute resolves to a country_id.
country_mapping.update(Ghana=27, Rwanda=28, Zimbabwe=29)

In [14]:
# Translate each institute's country name into its country_id.
mapped_country_ids = df_institutes['institute_country'].map(country_mapping)

# Names missing from `country_mapping` come back as NaN, and casting NaN to int
# fails with an opaque error. Report the offending countries explicitly instead.
is_unmapped = mapped_country_ids.isna()
if is_unmapped.any():
    unmapped_names = sorted(
        df_institutes.loc[is_unmapped, 'institute_country'].dropna().unique()
    )
    raise ValueError(
        f"{int(is_unmapped.sum())} row(s) have no country_id mapping; "
        f"unmapped countries: {unmapped_names}"
    )

df_institutes['country_id'] = mapped_country_ids.astype(int)

In [15]:
# Spot-check that country names resolved to the expected IDs.
country_id_preview = df_institutes.head()[['institute_country', 'country_id']]
print(country_id_preview)

  institute_country  country_id
0           Finland          21
1           Finland          21
2    United Kingdom           1
3            Poland          14
4             Spain           8


In [16]:
# Gold-layer column names that differ from the names used in the INSERT into
# `institutions`; columns whose names already match need no entry.
column_renames = {
    'institute_name': 'institution_name',
    'institute_type': 'institution_type',
    'avg_ctr': 'ctr',
}

# Columns to keep, in the order they are listed in the INSERT statement.
insert_columns = [
    'institution_name',
    'country_id',
    'institution_type',
    'ctr',
    'total_views',
    'total_impressions',
]

df_to_insert = df_institutes.rename(columns=column_renames)[insert_columns]

In [17]:
# Preview the first five rows that will be written to the database.
print(df_to_insert.iloc[:5])

                             institution_name  country_id institution_type  \
0                      University of Helsinki          21           PUBLIC   
1                                       VAMIA          21          PRIVATE   
2                                   Univacity           1          PRIVATE   
3           International European University          14          PRIVATE   
4  Universidad Católica San Antonio de Murcia           8          PRIVATE   

     ctr  total_views  total_impressions  
0  0.275         2316               8425  
1  0.164        27763             169202  
2  0.151           73                483  
3  0.132       272410            2058971  
4  0.126       282537            2238218  


In [6]:
# Roll back whatever transaction is currently open on the shared connection.
# NOTE(review): presumably a recovery step after a failed statement left the
# transaction aborted; confirm whether this cell is still needed.
conn.rollback()

In [36]:
# Country IDs 1 through 26 are treated as already present.
existing_country_ids = set(range(1, 27))

# Rows whose country_id falls outside that set.
has_existing_country = df_to_insert['country_id'].isin(existing_country_ids)
invalid_country_rows = df_to_insert.loc[~has_existing_country]

print(invalid_country_rows.loc[:, ['institution_name', 'country_id']])

       institution_name  country_id
90  Easy Travel Consult          27
93     Elrukz Group LTD          28
94        Study Connect          29
96      ApplyUni Abroad          27


In [40]:
# IDs flagged by the check above.
missing_ids = {27, 28, 29}

# One row per distinct (country_id, country name) pair among the flagged IDs.
uses_missing_id = df_institutes['country_id'].isin(missing_ids)
missing_countries = (
    df_institutes
    .loc[uses_missing_id, ['country_id', 'institute_country']]
    .drop_duplicates()
)

print(missing_countries)

    country_id institute_country
90          27             Ghana
93          28            Rwanda
94          29          Zimbabwe


In [42]:
# Add the three countries that institutes reference but `countries` lacks.
insert_countries_query = """
INSERT INTO countries (country_id, institute_country)
VALUES
    (27, 'Ghana'),
    (28, 'Rwanda'),
    (29, 'Zimbabwe')
ON CONFLICT (country_id) DO NOTHING;
"""  # ON CONFLICT avoids duplicate errors if they exist

# `with conn` commits when the block succeeds and rolls back if it raises, so a
# failed statement no longer leaves the shared connection in an aborted
# transaction. It does not close the connection.
# `with conn.cursor()` closes the cursor on both the success and the error path.
with conn:
    with conn.cursor() as cur:
        cur.execute(insert_countries_query)

In [43]:
# Parameterized insert; the timestamps are filled in by the database.
insert_query = """
INSERT INTO institutions (
    institution_name, country_id, institution_type, ctr, total_views, total_impressions, created_at, updated_at
) VALUES (%s, %s, %s, %s, %s, %s, NOW(), NOW())
"""

# Build all parameter tuples up front. Columns are selected explicitly so the
# tuple order always matches the placeholders, and numeric values are cast to
# built-in int/float so the driver never receives numpy scalar types.
institution_records = [
    (name, int(country_id), institution_type, float(ctr), int(views), int(impressions))
    for name, country_id, institution_type, ctr, views, impressions in df_to_insert[
        ['institution_name', 'country_id', 'institution_type', 'ctr', 'total_views', 'total_impressions']
    ].itertuples(index=False, name=None)
]

# All rows go in as one transaction: committed together on success, rolled back
# together on any error (e.g. a foreign-key violation), and the cursor is closed
# either way. The connection itself stays open for the following cells.
# NOTE(review): re-running this cell inserts the same rows again; confirm
# whether `institutions` has a unique constraint that should back an ON CONFLICT.
with conn:
    with conn.cursor() as cur:
        cur.executemany(insert_query, institution_records)

In [None]:
# Read back a sample of institutions joined to their country to verify the load.
# The context managers close the cursor even if the query fails, and end the
# transaction the SELECT opens instead of leaving the shared connection idle
# inside it.
with conn:
    with conn.cursor() as cur:
        cur.execute("""
    SELECT i.institution_name, c.institute_country, i.institution_type, i.ctr
    FROM institutions i
    JOIN countries c ON i.country_id = c.country_id
    LIMIT 10;
""")
        rows = cur.fetchall()

for row in rows:
    print(row)