In [None]:
import sys
import os
import pandas as pd
import numpy as np
import csv

In [None]:
# Show the kernel's current working directory.
print(os.getcwd())

# create base_directory

In [None]:
# Directory that holds all input CSV files.
# Set the BC_THESIS_DATA_DIR environment variable to run the notebook on
# another machine; when it is unset the original local folder is used.
base_dir = os.environ.get(
    "BC_THESIS_DATA_DIR",
    "/Users/maryammoradi/Documents/Documents/uni/Thesis/bc_thesis",
)

In [None]:
def load_csv(filename, encoding="utf-8"):
    """Read a CSV file located in ``base_dir`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name, joined onto the module-level ``base_dir``.
    encoding : str, default "utf-8"
        Encoding tried first. If decoding fails with ``UnicodeDecodeError``
        the file is read again as latin1.

    Returns
    -------
    pandas.DataFrame
        The parsed file. A one-line summary with its shape is printed.

    Raises
    ------
    FileNotFoundError
        If the joined path is not an existing file. The message contains the
        full path so the searched directory is visible.
    """
    path = os.path.join(base_dir, filename)
    # isfile (not exists) so that a directory with this name is rejected here
    # with a clear message instead of failing inside read_csv.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"File not found: {path}")
    try:
        df = pd.read_csv(path, encoding=encoding)
    except UnicodeDecodeError:
        # Fall back to latin1 when the requested encoding cannot decode the file.
        df = pd.read_csv(path, encoding="latin1")
    print(f"Loaded: {filename:<45} → shape={df.shape}")
    return df

# Load all datasets

In [None]:
# Load every raw dataset once through load_csv (prints one summary line each).
df_gdp = load_csv("gdp_pc_ppp_const2021.csv")
df_bmi = load_csv("High_BMI.CSV")
df_incident = load_csv("incident.csv")
df_internet = load_csv("Internet users (% of population).csv")
df_mortality_data = load_csv("Mortality data.csv")
df_mortality = load_csv("mortality.csv")
df_population65 = load_csv("Population_ages_65(%od total).csv")
df_urban = load_csv("Urben_population(% of total population).csv")
# Lower-case name: the shape check and the smoking section both read df_smoking.
df_smoking = load_csv("smoking_female.csv")
df_health = load_csv("Current_health_expenditure.csv")
df_broadband = load_csv("Fixed_broadband_subscriptions (per 100 people).csv")
df_fertility = load_csv("Fertility rate, total (births per woman).csv")
df_life_expectency = load_csv("Life_expectancy_at_birth_female_(years).csv")


In [None]:
# GDB.csv is read with latin1 encoding, passed explicitly to load_csv.
df_gdb = load_csv("GDB.csv", encoding="latin1")
print("All files loaded successfully.")

In [None]:
# Print the (rows, columns) shape of each loaded frame as a single list.
print([
    frame.shape
    for frame in (
        df_gdp, df_bmi, df_incident, df_internet,
        df_mortality_data, df_mortality, df_population65,
        df_urban, df_smoking, df_health, df_broadband,
        df_fertility, df_life_expectency,
    )
])

Show the top 5 rows of each dataset, starting with GDP

In [None]:
# Read the GDP table from base_dir and preview its first rows.
df_gdp = pd.read_csv(os.path.join(base_dir, "gdp_pc_ppp_const2021.csv"))
print("df_gdp:", df_gdp.head(), sep="\n")

Deleting extra columns

In [None]:
# Metadata columns to remove from the GDP table.
columns_to_drop = [
    'Series Name',
    'Series Code',
]

Add the year columns before 1990, and the 2024 column, to the list of columns to drop

In [None]:
# Queue every year column from 1960 to 1989, plus 2024, for removal.
# Labels follow the '<year> [YR<year>]' pattern (with a space) used for every
# year column in this notebook; without the space the 2024 label would not
# match and drop(errors='ignore') would silently keep that column.
# The membership check keeps the list free of duplicates when the cell is re-run.
for year in [*range(1960, 1990), 2024]:
    year_column = f'{year} [YR{year}]'
    if year_column not in columns_to_drop:
        columns_to_drop.append(year_column)

In [None]:
# Remove the listed columns; labels missing from the frame are skipped.
df_gdp_cleaned = df_gdp.drop(columns_to_drop, axis='columns', errors='ignore')

Reshape from wide to long format with `melt`, keeping 'Country Name' and 'Country Code' as identifier columns

In [None]:
# Wide -> long: one row per country and year column.
df_gdp_long = pd.melt(
    df_gdp_cleaned,
    id_vars=['Country Name', 'Country Code'],
    value_name='gdp_pc_ppp_const',
    var_name='Year_Row',
)

In [None]:
# Pull the first 4-digit run out of each column label and store it as an integer year.
df_gdp_long['Year'] = (
    df_gdp_long['Year_Row']
    .str.extract(r'(\d{4})')
    .astype(int)
)
# The raw label column is no longer needed.
df_gdp_long = df_gdp_long.drop('Year_Row', axis='columns')

In [None]:
# Share of missing GDP values, as a percentage of all rows.
missing_percentage = (
    df_gdp_long['gdp_pc_ppp_const'].isna().sum() / df_gdp_long.shape[0] * 100
)
print(f" percent of NaN in gdp_pc_ppp_const column: {missing_percentage:.2f}%")

In [None]:
# Preview the long-format GDP frame.
print(df_gdp_long.head()) 

BMI Data:

In [None]:
# Read the BMI table from base_dir and preview its first rows.
# NOTE(review): the load cell above spells this file "High_BMI.CSV" — confirm the
# on-disk name; the two spellings differ on case-sensitive file systems.
df_bmi = pd.read_csv(os.path.join(base_dir, "High_BMI.csv"))
print("df_bmi:", df_bmi.head(), sep="\n")

First we filter the data to the relevant sex, because breast cancer risk is directly linked to BMI in women.
Typically sex_id = 2 corresponds to females.

In [None]:
# Keep rows whose 'sex' is 'Female' or whose 'sex_id' is 2; copy so later edits
# do not touch df_bmi.
df_bmi_filtered = df_bmi.loc[
    (df_bmi['sex'] == 'Female') | (df_bmi['sex_id'] == 2)
].copy()

Delete extra columns

In [None]:
# BMI columns considered unnecessary.
# NOTE(review): no later cell visible here passes this list to drop(); the BMI
# columns are selected explicitly further down — confirm it is still needed.
columns_to_drop = [
    'age_group_id',
    'age_group_name',
    'sex_id',
    'sex',
    'measure',
    'definition',
    'lower',
    'upper',
    'location_id',
]

In [None]:
# Rename the location, year and mean columns.
# NOTE(review): 'country name' is lower-case while other sections use
# 'Country Name' — confirm before merging frames on this key.
df_bmi_filtered = df_bmi_filtered.rename(
    {
        'location_name': 'country name',
        'year_id': 'Year',
        'mean': 'bmi_female',
    },
    axis='columns',
)

Define panel format

In [None]:
# Keep only the three panel columns, as an independent copy.
df_bmi_finally = df_bmi_filtered.loc[:, ['country name', 'Year', 'bmi_female']].copy()

Ensure the correct data type

In [None]:
# Cast the Year column to integer.
df_bmi_finally['Year']=df_bmi_finally['Year'].astype(int)

In [None]:
# Share of missing BMI values, as a percentage of all rows.
missing_percentage = (
    df_bmi_finally['bmi_female'].isna().sum() / df_bmi_finally.shape[0] * 100
)
print(f" percent of NaN in bmi_female column: {missing_percentage:.2f}%")

In [None]:
# Preview the final BMI frame.
print(df_bmi_finally.head())

Broadband data:

In [None]:
# Read the fixed-broadband table from base_dir and preview its first rows.
df_broadband = pd.read_csv(os.path.join(base_dir, "Fixed_broadband_subscriptions (per 100 people).csv"))
# Label corrected from "df_broadbank:" so the output names the frame actually shown.
print("df_broadband:")
print(df_broadband.head())

Clean the data: define the identifier columns (country name and country code) and the yearly value columns

In [None]:
# Identifier columns plus one '<year> [YR<year>]' label for each year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))
columns_to_keep = [*id_cols, *value_cols]


Filter to keep necessary columns

In [None]:
# Keep only the identifier and year columns, as an independent copy.
df_broadband_filtered = df_broadband.loc[:, columns_to_keep].copy()

In [None]:
# Wide -> long: one row per country and year column.
df_broadband_long = pd.melt(
    df_broadband_filtered,
    id_vars=id_cols,
    value_name='broadband_subs',
    var_name='Year_Raw',
)

Convert the value column to numeric

In [None]:
# Parse the values as numbers; entries that cannot be parsed become NaN.
df_broadband_long['broadband_subs'] = pd.to_numeric(
    df_broadband_long['broadband_subs'],
    errors='coerce',
)


Extract the year from the temporary raw column

In [None]:
# Take the first 4-digit run of each raw label and store it as an integer year.
df_broadband_long['Year'] = df_broadband_long['Year_Raw'].str.extract(r'(\d{4})').astype(int)

In [None]:
# Compare the raw label with the extracted year for the first rows.
print(df_broadband_long[['Country Code','Year_Raw','Year']].head()) 

Delete the Year_Raw column

In [None]:
# The raw label column is no longer needed once Year exists.
df_broadband_long = df_broadband_long.drop('Year_Raw', axis='columns')

In [None]:
# Preview the long-format broadband frame.
print(df_broadband_long.head())

In [None]:
# List the remaining column names.
print(df_broadband_long.columns.tolist())

Missing data percentage:

In [None]:
# Share of missing broadband values, as a percentage of all rows.
missing_percentage = (
    df_broadband_long['broadband_subs'].isna().sum() / df_broadband_long.shape[0] * 100
)
print(f" percent of NaN in Broadband_Subs column: {missing_percentage:.2f}%")

In [None]:
# Read the health-expenditure table from base_dir and preview its first rows.
df_health = pd.read_csv(os.path.join(base_dir, "Current_health_expenditure.csv"))
print("df_health:", df_health.head(), sep="\n")

HEALTH DATA:

Fix columns like before

In [None]:
# Columns that identify a country.
id_cols = [
    'Country Name',
    'Country Code',
]

Choose the value columns for 1990 through 2023

In [None]:
# One '<year> [YR<year>]' label for each year 1990-2023.
value_columns = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

In [None]:
# Identifier columns followed by the yearly value columns.
# Spelled columns_to_keep (not "colums_to_keep") because that is the name the
# filtering cell below reads; with the misspelling it silently used a list
# left over from an earlier section.
columns_to_keep = id_cols + value_columns

Filtering

In [None]:
# Keep only the columns named in columns_to_keep, as an independent copy.
df_health_filtered = df_health.loc[:, columns_to_keep].copy()

Reshape from wide to long with the `melt` function

In [None]:
# Wide -> long: one row per country and year column.
df_health_long = pd.melt(
    df_health_filtered,
    id_vars=id_cols,
    value_name='health_expenditure',
    var_name='Year_Raw',
)

Clean Year_Raw and convert the string values to numbers

In [None]:
# Take the first 4-digit run of each raw label and store it as a nullable integer year.
df_health_long['Year']= df_health_long['Year_Raw'].str.extract(r'(\d{4})').astype('Int64')

In [None]:
# Parse the values as numbers; entries that cannot be parsed become NaN.
df_health_long['health_expenditure'] = pd.to_numeric(
    df_health_long['health_expenditure'],
    errors='coerce',
)
# The raw label column is no longer needed.
df_health_long = df_health_long.drop('Year_Raw', axis='columns')


In [None]:
# Preview the long-format health-expenditure frame.
print(df_health_long.head())

In [None]:
# Share of missing values in the 'health_expenditure' column, as a percentage
# of all rows in the long-format frame.

missing_percentage = df_health_long['health_expenditure'].isnull().sum() / len(df_health_long) * 100
print(f"Percent of NaN in health expenditure column: {missing_percentage:.2f}%")

INCIDENT:

In [None]:
# Read the incidence table from base_dir and preview its first rows.
df_incident = pd.read_csv(os.path.join(base_dir, "incident.csv"))
print("df_incident", df_incident.head(), sep="\n")

Drop unnecessary columns, keeping only those useful for the model

In [None]:
# Columns to remove from the incidence table.
# 'location_name' is deliberately NOT listed: the rename cell below maps it to
# 'Country Name', which only works if the column survives the drop.
columns_drops = [
    'measure_id', 'measure_name',
    'location_id',
    'sex_id', 'sex_name',
    'age_id', 'age_name',
    'cause_id', 'cause_name',
    'metric_id', 'metric_name',
    'upper', 'lower',
]

In [None]:
# Remove the listed columns; a label missing from the frame raises KeyError.
df_incident_cleaned = df_incident.drop(columns_drops, axis='columns')

Rename columns of Country Name and Year for matching with other datasets.

In [None]:
# Rename the location and value columns, then list the resulting column names.
df_incident_cleaned = df_incident_cleaned.rename(
    {
        'location_name': 'Country Name',
        'val': 'Incidence_Rate_per_100k',
    },
    axis='columns',
)
print(list(df_incident_cleaned.columns))

Final columns after dropping extra ones and check for data type for Year

In [None]:
# Build the final incidence panel: country, year, incidence rate.
df_incident_final = df_incident_cleaned[['year', 'Incidence_Rate_per_100k']].copy()
# Country labels are taken from the raw frame; the drop/rename steps above keep
# the row index, so the assignment aligns row by row.
df_incident_final['Country Name'] = df_incident['location_name']
df_incident_final = df_incident_final.rename(columns={'year': 'Year'})
# Cast the renamed column itself; writing to 'year' would add a second,
# lower-case column and leave 'Year' uncast.
df_incident_final['Year'] = df_incident_final['Year'].astype(int)
df_incident_final = df_incident_final[['Country Name', 'Year', 'Incidence_Rate_per_100k']]
print(df_incident_final.head())

In [None]:
# Share of missing incidence values, as a percentage of all rows.
missing_percentage = (
    df_incident_final['Incidence_Rate_per_100k'].isna().sum() / df_incident_final.shape[0] * 100
)
print(f" percent of NaN in Incidence_Rate_per_100k column: {missing_percentage:.2f}%")

INTERNET USAGE:

In [None]:
# Read the internet-usage table from base_dir and preview its first rows.
# NOTE(review): the load cell above spells this file with a capital "I" —
# confirm the on-disk name; the spellings differ on case-sensitive file systems.
df_internet = pd.read_csv(os.path.join(base_dir, "internet users (% of population).csv"))
print("df_internet:", df_internet.head(), sep="\n")

Drop unnecessary columns, keeping only those useful for the model

In [None]:
# Metadata columns to remove from the internet-usage table.
columns_drop = [
    'Series Name',
    'Series Code',
]

In [None]:
# Remove the metadata columns; a label missing from the frame raises KeyError.
df_internet_cleaned = df_internet.drop(columns_drop, axis='columns')
#print(df_internet_cleaned.columns.tolist())

In [None]:
# Identifier columns and one '<year> [YR<year>]' label for each year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

In [None]:
# Identifier columns followed by the yearly value columns.
columns_to_keep = [*id_cols, *value_cols]

In [None]:
# Keep only the identifier and year columns, as an independent copy.
df_internet_filtered = df_internet_cleaned.loc[:, columns_to_keep].copy()

Melt data from wide to long

In [None]:
# Wide -> long: one row per country and year column.
df_internet_long = pd.melt(
    df_internet_filtered,
    id_vars=id_cols,
    value_name='Internet_Value',
    var_name='Year_Raw',
)
# NOTE(review): this previews the wide frame, not df_internet_long — confirm intended.
print(df_internet_filtered.head())

In [None]:
# Take the first 4-digit run of each raw label and store it as an int64 year.
# NOTE(review): unlike the other sections, no cell visible here drops Year_Raw
# from df_internet_long afterwards — confirm whether that is intended.
df_internet_long['Year']= df_internet_long['Year_Raw'].str.extract(r'(\d{4})').astype('int64')

Convert the internet usage values to numbers

In [None]:
# Parse the values as numbers; entries that cannot be parsed become NaN.
df_internet_long['Internet_Value'] = pd.to_numeric(
    df_internet_long['Internet_Value'],
    errors='coerce',
)
print(df_internet_long.head())

In [None]:
# Share of missing internet-usage values, as a percentage of all rows.
missing_percentage = (
    df_internet_long['Internet_Value'].isna().sum() / df_internet_long.shape[0] * 100
)
print(f" percent of NaN in Internet_Value column: {missing_percentage:.2f}%")

MORTALITY DATA:

df_mortality_data=load_csv("Mortality data.csv")
df_mortality=load_csv("mortality.csv")

In [None]:
# Read the mortality table from base_dir and preview its first rows.
df_mortality_data = pd.read_csv(os.path.join(base_dir, "Mortality data.csv"))
print("df_mortality_data:", df_mortality_data.head(), sep="\n")

In [None]:
# Reload the file with an explicit comma separator.
# NOTE(review): the file name here starts with a lower-case "m" while the
# previous cell uses "Mortality data.csv" — confirm the on-disk name.
df_mortality_data = pd.read_csv(os.path.join(base_dir, "mortality data.csv"), sep=',')

# Report success, then preview the first rows and list the column names.
print("✅ فایل با جداکننده کاما با موفقیت بارگذاری شد (ساختار صحیح):")
print(df_mortality_data.head())
print("\nستون‌های دیتافریم:")
print(df_mortality_data.columns.tolist())

POPULATION 65 and above:

In [None]:
# Read the population-65+ table from base_dir and preview its first rows.
df_population65 = pd.read_csv(os.path.join(base_dir, "Population_ages_65(%od total).csv"))
print("df_population65:", df_population65.head(), sep="\n")

Clean the data as before: define the fixed (identifier) columns and the value columns.

In [None]:
# Identifier columns and one '<year> [YR<year>]' label for each year 1990-2023.
id_cols_pop = ['Country Name', 'Country Code']
value_cols_pop = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

deleting columns and filtering:

In [None]:
# Step 1: remove the metadata columns.
columns_for_delete = ['Series Name', 'Series Code']
df_population65_filtered = df_population65.drop(columns_for_delete, axis='columns')

# Step 2: keep only the identifier and year columns, as an independent copy.
columns_for_keep_pop = [*id_cols_pop, *value_cols_pop]
df_population65_filtered = df_population65_filtered.loc[:, columns_for_keep_pop].copy()


changing format from wide to long.

In [None]:
# Wide -> long: one row per country and year column.
df_population65_long = pd.melt(
    df_population65_filtered,
    id_vars=id_cols_pop,
    value_name='Population_65_pct',
    var_name='Year_raw',
)
#print(df_population65_long.columns.tolist())

Last cleaning steps: set the correct data types

In [None]:
# Year: first 4-digit run of the raw label, stored as a nullable integer.
df_population65_long['Year'] = (
    df_population65_long['Year_raw']
    .str.extract(r'(\d{4})')
    .astype('Int64')
)
# Values: parse as numbers; entries that cannot be parsed become NaN.
df_population65_long['Population_65_pct'] = pd.to_numeric(
    df_population65_long['Population_65_pct'],
    errors='coerce',
)
# Final frame without the raw label column.
df_population65_final = df_population65_long.drop('Year_raw', axis='columns')
print(df_population65_final.head())

In [None]:
# Share of missing population-65+ values, as a percentage of all rows.
missing_percentage = (
    df_population65_final['Population_65_pct'].isna().sum() / df_population65_final.shape[0] * 100
)
print(f" percent of NaN in Population_65_pct column: {missing_percentage:.2f}%")

SMOKING:

In [None]:
# Read the female-smoking table from base_dir and preview its first rows.
df_smoking = pd.read_csv(os.path.join(base_dir, "smoking_female.csv"))
print("df_smoking:", df_smoking.head(), sep="\n")
#print(df_smoking.columns.tolist())

Cleaning: define the fixed (identifier) columns and the value columns:

In [None]:
# Identifier columns and one '<year> [YR<year>]' label for each year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

In [None]:
# Remove the metadata columns; a label missing from the frame raises KeyError.
columns_for_delete = ['Series Name', 'Series Code']
df_smoking_cleaned = df_smoking.drop(columns_for_delete, axis='columns')
#print(df_smoking_cleaned.head())

keeping columns:

In [None]:
# Keep only the identifier and year columns, as an independent copy.
columns_for_keep = [*id_cols, *value_cols]
df_smoking_filtered = df_smoking_cleaned.loc[:, columns_for_keep].copy()

# Wide -> long: one row per country and year column.
df_smoking_long = pd.melt(
    df_smoking_filtered,
    id_vars=id_cols,
    value_name='smoking_prevalence',
    var_name='Year_Raw',
)

# Year: first 4-digit run of the raw label, stored as a nullable integer.
df_smoking_long['Year'] = (
    df_smoking_long['Year_Raw']
    .str.extract(r'(\d{4})')
    .astype('Int64')
)
# Values: parse as numbers; entries that cannot be parsed become NaN.
df_smoking_long['smoking_prevalence'] = pd.to_numeric(
    df_smoking_long['smoking_prevalence'],
    errors='coerce',
)

# Final frame without the raw label column.
df_smoking_final = df_smoking_long.drop('Year_Raw', axis='columns')
print(df_smoking_final.head())

In [None]:
# Share of missing smoking-prevalence values, as a percentage of all rows.
missing_percentage = (
    df_smoking_final['smoking_prevalence'].isna().sum() / df_smoking_final.shape[0] * 100
)
print(f"percentage of NaN in smoking_prevalence column:{missing_percentage: .2f}")

Life Expectancy (Female):

In [None]:
# Read the female life-expectancy table from base_dir and preview its first rows.
df_life_expectency = pd.read_csv(
    os.path.join(base_dir, "Life_expectancy_at_birth_female_(years).csv")
)
print("df_life_expectency:", df_life_expectency.head(), sep="\n")

Cleaning

In [None]:
# Identifier columns and one '<year> [YR<year>]' label for each year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

drop extra columns and keep essential columns:

In [None]:
# Remove the metadata columns, then keep only identifier and year columns.
df_life_expectency_droped = df_life_expectency.drop(
    ['Series Name', 'Series Code'], axis='columns'
)
columns_to_keep = [*id_cols, *value_cols]
df_life_expectency_filtered = df_life_expectency_droped.loc[:, columns_to_keep].copy()

Changing format from wide to long

In [None]:
# Wide -> long: one row per country and year column.
df_life_expectency_long = pd.melt(
    df_life_expectency_filtered,
    id_vars=id_cols,
    value_name='Life_Expectancy_Female',
    var_name='Year_Raw',
)

Changing Data type and deleting Year_Raw

In [None]:
# Year: first 4-digit run of the raw label, stored as a nullable integer.
df_life_expectency_long['Year'] = (
    df_life_expectency_long['Year_Raw']
    .str.extract(r'(\d{4})')
    .astype('Int64')
)
# Values: parse as numbers; entries that cannot be parsed become NaN.
df_life_expectency_long['Life_Expectancy_Female'] = pd.to_numeric(
    df_life_expectency_long['Life_Expectancy_Female'], errors='coerce'
)
# Final frame without the raw label column.
df_life_expectency_final = df_life_expectency_long.drop('Year_Raw', axis='columns')
print(df_life_expectency_final.head())

In [None]:
# Share of missing life-expectancy values, as a percentage of all rows.
missing_percentage = (
    df_life_expectency_final['Life_Expectancy_Female'].isna().sum()
    / df_life_expectency_final.shape[0]
    * 100
)
print(f'life_expectency_female column NaN percentage: {missing_percentage: .2f}%')

Fertility:

In [None]:
# Read the fertility table from base_dir, preview it and list its columns.
df_fertility = pd.read_csv(
    os.path.join(base_dir, "Fertility rate, total (births per woman).csv")
)
print("df_fertility:", df_fertility.head(), sep="\n")
print(list(df_fertility.columns))

Cleaning:

In [None]:
# Identifier columns and one '<year> [YR<year>]' label for each year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

drop extra columns and keep essential columns:

In [None]:
# Remove the metadata columns, then keep only identifier and year columns.
df_fertility_droped = df_fertility.drop(['Series Name', 'Series Code'], axis='columns')
columns_to_keep = [*id_cols, *value_cols]
df_fertility_filtered = df_fertility_droped.loc[:, columns_to_keep].copy()

Changing format from wide to long

In [None]:
# Wide -> long: one row per country and year column.
df_fertility_long = pd.melt(
    df_fertility_filtered,
    id_vars=id_cols,
    value_name='Fertility_Female',
    var_name='Year_Raw',
)

Changing Data type and deleting Year_Raw

In [None]:
# Year: first 4-digit run of the raw label, stored as a nullable integer.
df_fertility_long['Year'] = (
    df_fertility_long['Year_Raw']
    .str.extract(r'(\d{4})')
    .astype('Int64')
)
# Values: parse as numbers; entries that cannot be parsed become NaN.
df_fertility_long['Fertility_Female'] = pd.to_numeric(
    df_fertility_long['Fertility_Female'], errors='coerce'
)
# The raw label column is no longer needed.
df_fertility_long = df_fertility_long.drop('Year_Raw', axis='columns')
print(df_fertility_long.head())

Missing Data:

In [None]:
# Share of missing fertility values, as a percentage of all rows.
missing_percentage = (
    df_fertility_long['Fertility_Female'].isna().sum() / df_fertility_long.shape[0] * 100
)
print(f'Fertility_Female column NaN percentage: {missing_percentage: .2f}%')

Urban:

In [None]:
# Read the urban-population table from base_dir and preview its first rows.
df_urban = pd.read_csv(os.path.join(base_dir, "Urben_population(% of total population).csv"))
# Label corrected from "df_urba:" so the output names the frame actually shown.
print("df_urban:")
print(df_urban.head())

Cleaning:

In [194]:
# Identifier columns and one '<year> [YR<year>]' label for each year 1990-2023.
id_cols = ['Country Name', 'Country Code']
value_cols = list(map(lambda year: f'{year} [YR{year}]', range(1990, 2024)))

drop extra columns and keep essential columns:

In [195]:
# Remove the metadata columns, then keep only identifier and year columns.
df_urban_droped = df_urban.drop(['Series Name', 'Series Code'], axis='columns')
columns_to_keep = [*id_cols, *value_cols]
df_urban_filtered = df_urban_droped.loc[:, columns_to_keep].copy()

Changing format from wide to long

In [197]:
# Wide -> long: one row per country and year column.
df_urban_long = pd.melt(
    df_urban_filtered,
    id_vars=id_cols,
    value_name='Urben_pupulation_pct',
    var_name='Year_Raw',
)

Changing Data type and deleting Year_Raw

In [204]:
# Build the final urban-population frame from an independent copy of the long frame.
df_urban_final = df_urban_long.copy()
# Year: first 4-digit run of the raw label, stored as a nullable integer
# (same treatment as the other indicator sections).
df_urban_final['Year'] = df_urban_final['Year_Raw'].str.extract(r'(\d{4})').astype('Int64')
# Values: parse as numbers so entries that cannot be parsed become NaN and are
# counted by the missing-value check below.
df_urban_final['Urben_pupulation_pct'] = pd.to_numeric(
    df_urban_final['Urben_pupulation_pct'],
    errors='coerce',
)
# The raw label column is no longer needed once Year exists.
df_urban_final = df_urban_final.drop(columns=['Year_Raw'])

In [210]:
# Share of missing urban-population values, as a percentage of all rows.
missing_percentage = (
    df_urban_final['Urben_pupulation_pct'].isna().sum() / df_urban_final.shape[0] * 100
)
print(f'Urben_pupulation_pct column NaN percentage: {missing_percentage: .2f}%')

Urben_pupulation_pct column NaN percentage:  2.95%
