In [None]:
import sys
import os
import pandas as pd
import numpy as np
import csv

In [None]:
# Show the kernel's working directory so path problems are easy to spot.
cwd = os.getcwd()
print(cwd)

# create base_directory

In [None]:
# Load data from CSV files.
# The data folder can be overridden with the BC_THESIS_DATA_DIR environment
# variable; when it is unset, the original local path is used unchanged.
base_dir = os.environ.get(
    "BC_THESIS_DATA_DIR",
    "/Users/maryammoradi/Documents/Documents/uni/Thesis/bc_thesis",
)

In [None]:
def load_csv(filename, encoding="utf-8"):
    """Load a CSV file located in ``base_dir`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name relative to the module-level ``base_dir``.
    encoding : str, default "utf-8"
        Encoding tried first. If decoding fails with ``UnicodeDecodeError``
        the file is read again as latin1.

    Returns
    -------
    pandas.DataFrame
        The parsed file. A one-line summary with its shape is printed.

    Raises
    ------
    FileNotFoundError
        If the resolved path does not exist.
    """
    path = os.path.join(base_dir, filename)
    if not os.path.exists(path):
        # Report the full path so a wrong base_dir is visible in the error.
        raise FileNotFoundError(f"File not found: {path}")
    try:
        df = pd.read_csv(path, encoding=encoding)
    except UnicodeDecodeError:
        # Fall back to latin1 when the requested encoding cannot decode the file.
        df = pd.read_csv(path, encoding="latin1")
    print(f"Loaded: {filename:<45} → shape={df.shape}")
    return df

# Load all datasets

In [None]:
# Read every raw dataset once via load_csv (default utf-8, latin1 fallback).
df_gdp = load_csv("gdp_pc_ppp_const2021.csv")
df_bmi = load_csv("High_BMI.CSV")
df_incident = load_csv("incident.csv")
df_internet = load_csv("Internet users (% of population).csv")
df_mortality_data = load_csv("Mortality data.csv")
df_mortality = load_csv("mortality.csv")
df_population65 = load_csv("Population_ages_65(%od total).csv")
df_urban = load_csv("Urban population (% of total population).csv")
df_smoking = load_csv("Female_smoking_prevalence.csv")
df_health = load_csv("Current_health_expenditure.csv")
df_broadband = load_csv("Fixed_broadband_subscriptions (per 100 people).csv")


In [None]:
# GDB.csv is read with latin1 explicitly instead of the utf-8 default.
gdb_filename = "GDB.csv"
df_gdb = load_csv(gdb_filename, encoding="latin1")
print("All files loaded successfully.")

In [None]:
# Print the (rows, columns) shape of every loaded frame as a single list.
loaded_frames = (
    df_gdp, df_bmi, df_incident, df_internet,
    df_mortality_data, df_mortality, df_population65, df_urban,
    df_smoking, df_health, df_broadband, df_gdb,
)
print([frame.shape for frame in loaded_frames])

Shows top 5 rows for all datasets

In [None]:
# Re-read the GDP file and preview its first rows.
gdp_path = os.path.join(base_dir, "gdp_pc_ppp_const2021.csv")
df_gdp = pd.read_csv(gdp_path)
print("df_gdp:", df_gdp.head(), sep="\n")

Deleting extra columns

In [None]:
# Descriptive series columns that carry no values.
columns_to_drop = list(('Series Name', 'Series Code'))

Add the year columns before 1990, plus the 2024 column, to the list of columns to drop

In [None]:
# Year headers follow the pattern "YYYY [YRYYYY]" (with a space). The 2024
# entry previously lacked the space, so the column was silently kept because
# the later drop uses errors='ignore'.
year_columns_to_drop = [f'{year} [YR{year}]' for year in range(1960, 1990)]
year_columns_to_drop.append('2024 [YR2024]')
for column in year_columns_to_drop:
    # Skip names already present so re-running this cell adds no duplicates.
    if column not in columns_to_drop:
        columns_to_drop.append(column)

In [None]:
# Remove the listed columns; names absent from the frame are skipped silently.
df_gdp_cleaned = df_gdp.drop(
    columns=columns_to_drop,
    errors='ignore',
)

Change the format from wide to long with the melt function, keeping 'Country Name' and 'Country Code' fixed as identifier columns

In [None]:
# Wide -> long: one row per (country, year header) pair.
gdp_id_columns = ['Country Name', 'Country Code']
df_gdp_long = pd.melt(
    df_gdp_cleaned,
    id_vars=gdp_id_columns,
    var_name='Year_Row',
    value_name='gdp_pc_ppp_const',
)

In [None]:
# Pull the four-digit year out of headers such as "1990 [YR1990]".
df_gdp_long['Year'] = df_gdp_long['Year_Row'].str.extract(r'(\d{4})').astype(int)
# Coerce the values to numbers so non-numeric placeholders become NaN, as is
# done for the broadband, health and internet tables.
# NOTE(review): this file presumably uses ".." for missing values like the
# internet export does - confirm against the raw CSV.
df_gdp_long['gdp_pc_ppp_const'] = pd.to_numeric(
    df_gdp_long['gdp_pc_ppp_const'], errors='coerce'
)
df_gdp_long = df_gdp_long.drop(columns=['Year_Row'])

In [None]:
# Share of missing GDP values, in percent of all rows.
gdp_missing_count = df_gdp_long['gdp_pc_ppp_const'].isnull().sum()
gdp_row_count = len(df_gdp_long)
missing_percentage = gdp_missing_count / gdp_row_count * 100
print(f" percent of NaN in gdp_pc_ppp_const column: {missing_percentage:.2f}%")

In [None]:
# Preview the first five rows of the long-format GDP table.
print(df_gdp_long.head(n=5))

BMI Data:

In [None]:
# Use exactly the same file name (including the upper-case extension) as the
# initial bulk load, so both reads behave the same on case-sensitive
# filesystems.
df_bmi = pd.read_csv(os.path.join(base_dir, "High_BMI.CSV"))
print("df_bmi:")
print(df_bmi.head())

First, we need to filter the data on the relevant age range and sex, because breast cancer risk is directly linked to BMI in women.
Typically, sex_id = 2 corresponds to females.

In [None]:
# Keep rows flagged as female by either the text label or the numeric id.
# NOTE(review): no age filter is applied here - confirm that is intended.
is_female_label = df_bmi['sex'] == 'Female'
is_female_id = df_bmi['sex_id'] == 2
df_bmi_filtered = df_bmi.loc[is_female_label | is_female_id].copy()

Delete extra columns

In [None]:
# Metadata columns of the BMI table.
# NOTE(review): within the visible cells this list is never passed to drop().
columns_to_drop = [
    'age_group_id',
    'age_group_name',
    'sex_id',
    'sex',
    'measure',
    'definition',
    'lower',
    'upper',
    'location_id',
]

In [None]:
# Give the BMI columns their panel names.
# NOTE(review): 'country name' is lower-case here while the other tables use
# 'Country Name' - check before merging.
bmi_column_names = {
    'location_name': 'country name',
    'year_id': 'Year',
    'mean': 'bmi_female',
}
df_bmi_filtered = df_bmi_filtered.rename(columns=bmi_column_names)

Define the panel format

In [None]:
# Reduce the table to the three panel columns.
bmi_panel_columns = ['country name', 'Year', 'bmi_female']
df_bmi_finally = df_bmi_filtered.loc[:, bmi_panel_columns].copy()

Make sure the data type is correct

In [None]:
# Store the year as an integer.
bmi_year = df_bmi_finally['Year']
df_bmi_finally['Year'] = bmi_year.astype(int)

In [None]:
# Share of missing BMI values, in percent of all rows.
bmi_missing_count = df_bmi_finally['bmi_female'].isnull().sum()
bmi_row_count = len(df_bmi_finally)
missing_percentage = bmi_missing_count / bmi_row_count * 100
print(f" percent of NaN in bmi_female column: {missing_percentage:.2f}%")

In [None]:
# Preview the first five rows of the BMI panel.
print(df_bmi_finally.head(n=5))

Broadband data:

In [None]:
# Re-read the broadband file and preview it.
df_broadband = pd.read_csv(os.path.join(base_dir, "Fixed_broadband_subscriptions (per 100 people).csv"))
# Label corrected: it previously read "df_broadbank:".
print("df_broadband:")
print(df_broadband.head())

Clean the data by defining the identifier columns (country name and country code) and the value columns (the yearly values)

In [None]:
# Identifier columns plus one "YYYY [YRYYYY]" header per year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = []
for year in range(1990, 2024):
    value_cols.append(f'{year} [YR{year}]')
columns_to_keep = [*id_cols, *value_cols]


Filter to keep necessary columns

In [None]:
# Keep only the identifier and 1990-2023 value columns.
df_broadband_filtered = df_broadband.loc[:, columns_to_keep].copy()

In [None]:
# Wide -> long: one row per (country, year header) pair.
df_broadband_long = pd.melt(
    df_broadband_filtered,
    id_vars=id_cols,
    var_name='Year_Raw',
    value_name='broadband_subs',
)

Clean and convert the broadband_subs value column to numeric

In [None]:
# Convert the subscription values to numbers; unparseable entries become NaN.
broadband_raw_values = df_broadband_long['broadband_subs']
df_broadband_long['broadband_subs'] = pd.to_numeric(broadband_raw_values, errors='coerce')


Extract the numeric year from the temporary raw year column

In [None]:
# Extract the four-digit year from the raw header text and store it as int.
broadband_year_text = df_broadband_long['Year_Raw'].str.extract(r'(\d{4})')
df_broadband_long['Year'] = broadband_year_text.astype(int)

In [None]:
# Compare the raw header with the extracted year for the first rows.
year_check_columns = ['Country Code', 'Year_Raw', 'Year']
print(df_broadband_long[year_check_columns].head())

Delete the Year_Raw column

In [None]:
# The raw header column is no longer needed once 'Year' exists.
df_broadband_long = df_broadband_long.drop(columns=['Year_Raw'])

In [None]:
# Preview the first five rows of the long-format broadband table.
print(df_broadband_long.head(n=5))

In [None]:
# List the remaining column names.
broadband_column_names = df_broadband_long.columns.tolist()
print(broadband_column_names)

Missing data percentage:

In [None]:
# Share of missing broadband values, in percent of all rows.
broadband_missing_count = df_broadband_long['broadband_subs'].isnull().sum()
broadband_row_count = len(df_broadband_long)
missing_percentage = broadband_missing_count / broadband_row_count * 100
print(f" percent of NaN in Broadband_Subs column: {missing_percentage:.2f}%")

In [None]:
# Re-read the health expenditure file and preview its first rows.
health_path = os.path.join(base_dir, "Current_health_expenditure.csv")
df_health = pd.read_csv(health_path)
print("df_health:", df_health.head(), sep="\n")

HEALTH DATA:

Fix columns like before

In [None]:
# Identifier columns kept fixed when reshaping.
id_cols = list(('Country Name', 'Country Code'))

Choose the value columns for 1990 up to 2023 (the upper bound 2024 of the range is exclusive)

In [None]:
# One "YYYY [YRYYYY]" header per year from 1990 through 2023.
value_columns = []
for year in range(1990, 2024):
    value_columns.append(f'{year} [YR{year}]')

In [None]:
# Columns retained from the health table. The name was previously misspelled
# as `colums_to_keep`, so the filtering step picked up a stale
# `columns_to_keep` left over from the broadband section.
columns_to_keep = id_cols + value_columns

Filtering

In [None]:
# Keep only the identifier and value columns.
df_health_filtered = df_health.loc[:, columns_to_keep].copy()

Change from wide to long with the melt function

In [None]:
# Wide -> long: one row per (country, year header) pair.
df_health_long = pd.melt(
    df_health_filtered,
    id_vars=id_cols,
    var_name='Year_Raw',
    value_name='health_expenditure',
)

Clean Year_Raw and convert the strings to numbers

In [None]:
# Extract the four-digit year and store it as nullable integer (Int64).
health_year_text = df_health_long['Year_Raw'].str.extract(r'(\d{4})')
df_health_long['Year'] = health_year_text.astype('Int64')

In [None]:
# Convert expenditure to numbers (unparseable entries become NaN), then
# discard the raw header column.
health_raw_values = df_health_long['health_expenditure']
df_health_long['health_expenditure'] = pd.to_numeric(health_raw_values, errors='coerce')
df_health_long = df_health_long.drop(columns='Year_Raw')


In [None]:
# Preview the first five rows of the long-format health table.
print(df_health_long.head(n=5))

In [None]:
# Share of missing values in the 'health_expenditure' column, in percent of
# all rows.
health_missing_count = df_health_long['health_expenditure'].isnull().sum()
health_row_count = len(df_health_long)
missing_percentage = health_missing_count / health_row_count * 100
print(f"Percent of NaN in health expenditure column: {missing_percentage:.2f}%")

INCIDENT:

In [None]:
# Re-read the incidence file and preview its first rows.
incident_path = os.path.join(base_dir, "incident.csv")
df_incident = pd.read_csv(incident_path)
print("df_incident", df_incident.head(), sep="\n")

Drop unnecessary columns, keeping only the ones that are useful for the model

In [None]:
# Metadata columns that are not needed for modelling.
# 'location_name' is deliberately NOT listed: it is the country label that is
# renamed to 'Country Name' afterwards, and dropping it would leave the
# incidence rows with no country to match against the other datasets.
columns_drops = [
    'measure_id', 'measure_name', 'location_id',
    'sex_id', 'sex_name', 'age_id', 'age_name',
    'cause_id', 'cause_name', 'metric_id',
    'metric_name', 'upper', 'lower',
]

In [None]:
# Remove the listed metadata columns (raises KeyError if any is absent).
df_incident_cleaned = df_incident.drop(
    columns=columns_drops,
)

Rename columns of Country Name and Year for matching with other datasets.

In [None]:
# Map source column names to the shared panel names; names that are not
# present in the frame are ignored by rename().
incident_column_names = {
    'location_name': 'Country Name',
    'val': 'Incidence_Rate_per_100k',
}
df_incident_cleaned = df_incident_cleaned.rename(columns=incident_column_names)
print(df_incident_cleaned.columns.tolist())

Final columns after dropping extra ones and check for data type for Year

In [None]:
# Year and incidence rate are always required; the country label is carried
# along whenever it is present in the cleaned frame, so the result can be
# matched with the other datasets.
incident_columns = ['year', 'Incidence_Rate_per_100k']
if 'Country Name' in df_incident_cleaned.columns:
    incident_columns.insert(0, 'Country Name')
df_incident_final = df_incident_cleaned[incident_columns].copy()
df_incident_final = df_incident_final.rename(columns={'year': 'Year'})
# Convert 'Year' itself. Assigning to a lower-case 'year' key here would add a
# second, duplicate column and leave 'Year' with its original dtype.
df_incident_final['Year'] = df_incident_final['Year'].astype(int)
print(df_incident_final.head())

In [None]:
# Share of missing incidence values, in percent of all rows.
incident_missing_count = df_incident_final['Incidence_Rate_per_100k'].isnull().sum()
incident_row_count = len(df_incident_final)
missing_percentage = incident_missing_count / incident_row_count * 100
print(f" percent of NaN in Incidence_Rate_per_100k column: {missing_percentage:.2f}%")

INTERNET USAGE:

In [None]:
# Use exactly the same file name (capital "I") as the initial bulk load, so
# both reads behave the same on case-sensitive filesystems.
df_internet = pd.read_csv(os.path.join(base_dir, "Internet users (% of population).csv"))
print("df_internet:")
print(df_internet.head())

Drop unnecessary columns, keeping only the ones that are useful for the model

In [None]:
# Descriptive series columns that carry no values.
columns_drop = list(('Series Name', 'Series Code'))

In [None]:
# Remove the descriptive series columns (raises KeyError if any is absent).
df_internet_cleaned = df_internet.drop(
    columns=columns_drop,
)

In [None]:
# Identifier columns plus one "YYYY [YRYYYY]" header per year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = []
for year in range(1990, 2024):
    value_cols.append(f'{year} [YR{year}]')

In [None]:
# Identifier columns first, then the yearly value columns.
columns_to_keep = [*id_cols, *value_cols]

In [None]:
# Keep only the identifier and 1990-2023 value columns.
df_internet_filtered = df_internet_cleaned.loc[:, columns_to_keep].copy()

Melt data from wide to long

In [237]:
# Wide -> long: one row per (country, year header) pair.
df_internet_long = pd.melt(
    df_internet_filtered,
    id_vars=id_cols,
    var_name='Year_Raw',
    value_name='Internet_Value',
)
# NOTE(review): this previews the wide frame, not the melted one - confirm.
print(df_internet_filtered.head(n=5))

     Country Name Country Code 1990 [YR1990] 1991 [YR1991] 1992 [YR1992]  \
0     Afghanistan          AFG             0            ..            ..   
1         Albania          ALB             0            ..            ..   
2         Algeria          DZA             0            ..            ..   
3  American Samoa          ASM             0            ..            ..   
4         Andorra          AND             0            ..            ..   

  1993 [YR1993] 1994 [YR1994] 1995 [YR1995] 1996 [YR1996] 1997 [YR1997]  ...  \
0            ..            ..            ..            ..            ..  ...   
1            ..            ..        0.0112        0.0322        0.0486  ...   
2            ..      0.000361       0.00177       0.00174        0.0103  ...   
3            ..            ..            ..            ..            ..  ...   
4            ..            ..            ..          1.53          3.05  ...   

  2014 [YR2014] 2015 [YR2015] 2016 [YR2016] 2017 [YR2017] 2018

In [None]:
# Extract the four-digit year from the raw header text and store it as int64.
internet_year_text = df_internet_long['Year_Raw'].str.extract(r'(\d{4})')
df_internet_long['Year'] = internet_year_text.astype('int64')

['Country Name', 'Country Code', 'Year_Raw', 'Internet_Value', 'Year']


Changing internet amount to Num

In [251]:
# Convert the usage values to numbers; placeholders such as ".." become NaN.
df_internet_long['Internet_Value'] = pd.to_numeric(
    df_internet_long['Internet_Value'], errors='coerce'
)
# Drop the raw header column here, as the other sections do once 'Year'
# exists. The recorded output of this cell already lacks 'Year_Raw', which no
# visible cell removed. errors='ignore' keeps the cell safe to re-run.
df_internet_long = df_internet_long.drop(columns=['Year_Raw'], errors='ignore')
print(df_internet_long.head())

     Country Name Country Code  Internet_Value  Year
0     Afghanistan          AFG             0.0  1990
1         Albania          ALB             0.0  1990
2         Algeria          DZA             0.0  1990
3  American Samoa          ASM             0.0  1990
4         Andorra          AND             0.0  1990


In [252]:
# Share of missing internet-usage values, in percent of all rows.
missing_percentage = df_internet_long['Internet_Value'].isnull().sum() / len(df_internet_long) * 100
# The message names the column actually measured; it was previously copied
# from the incidence section and reported 'Incidence_Rate_per_100k'.
print(f" percent of NaN in Internet_Value column: {missing_percentage:.2f}%")

 percent of NaN in Incidence_Rate_per_100k column: 27.76%


MORTALITY DATA:

In [None]:
# Use exactly the same file name (capital "M") as the initial bulk load, so
# both reads behave the same on case-sensitive filesystems.
df_mortality_data = pd.read_csv(os.path.join(base_dir, "Mortality data.csv"))
print("df_mortality_data:")
print(df_mortality_data.head())

POPULATION 65 and above:

In [None]:
# Re-read the population 65+ file and preview its first rows.
population65_path = os.path.join(base_dir, "Population_ages_65(%od total).csv")
df_population65 = pd.read_csv(population65_path)
print("df_population65:", df_population65.head(), sep="\n")

SMOKING:

In [None]:
# Re-read the female smoking prevalence file and preview its first rows.
smoking_path = os.path.join(base_dir, "Female_smoking_prevalence.csv")
df_smoking = pd.read_csv(smoking_path)
print("df_smoking:", df_smoking.head(), sep="\n")