In [1]:
import pandas as pd


In [12]:
# Loading the CSV files
file_paths = {
    'Current_Health_Expenditure': 'data/Current health expenditure.csv',
    'Electoral_Democracy_Index': 'data/electoral-democracy-index.csv',
    'GDP': 'data/gdp.csv',
    'Gini_Index': 'data/Gini index.csv',
    'Obesity': 'data/obesity.csv',
    'Life_Expectancy_Females': 'data/remaining-life-expectancy-at-different-ages-females.csv',
    'Life_Expectancy_Males': 'data/remaining-life-expectancy-at-different-ages-males.csv',
    'Safe_Water': 'data/SafeWater.csv',
    'Alcohol_Consumption': 'data/Total alcohol consumption per capita.csv',
    'Government_Education_Expenditure': 'data/total-government-expenditure-on-education-gdp.csv',
    'Electricity_Access': 'data/Access to electricity.csv',
    'Meat_Supply': 'data/meat-supply-per-person.csv',
    'Government_Education_Expenditure_New': 'data/total-government-expenditure-on-education-gdp.csv'
}

# Load and store each CSV file into a dictionary of dataframes
dataframes = {name: pd.read_csv(path) for name, path in file_paths.items()}



#### Transofrom and merge data

In [23]:

# Rename 'Country Name' and 'Country Code' to 'Entity' and 'Code'
columns_to_rename = ['Safe_Water', 'Alcohol_Consumption', 'Gini_Index', 'Current_Health_Expenditure', 'Electricity_Access']
for name in columns_to_rename:
    dataframes[name] = dataframes[name].rename(columns={'Country Name': 'Entity', 'Country Code': 'Code'})

# Function to transform year format
def transform_year_format(df):
    # Melt the dataframe - convert from wide to long format
    df_long = df.melt(id_vars=['Entity', 'Code'], var_name='Year')
    
    # Extract the year part from "Year" column (e.g., "1960 [YR1960]" to "1960")
    df_long['Year'] = df_long['Year'].str.extract('(\d{4})').astype(int)
    
    return df_long

# Apply the transformation to the specific dataframes
for name in columns_to_rename:
    dataframes[name] = transform_year_format(dataframes[name])

# Apply the transformation to the specific dataframes
dataframes_to_transform = ['Safe_Water', 'Alcohol_Consumption', 'Gini_Index', 'Current_Health_Expenditure', 'Electricity_Access']
for name in dataframes_to_transform:
    dataframes[name] = transform_year_format(dataframes[name])



# Transform specific dataframes with year as a variable
dataframes_to_transform = ['Safe_Water', 'Alcohol_Consumption', 'Gini_Index', 'Current_Health_Expenditure', 'Electricity_Access']
for name in dataframes_to_transform:
    dataframes[name] = transform_year_format(dataframes[name])


# Transform life expectancy dataframes to long format
def transform_life_expectancy(df, sex):
    df_long = df.melt(id_vars=['Entity', 'Code', 'Year'], var_name='Age_Sex', value_name='Remaining_Life_Expectancy')
    df_long['Sex'] = sex
    df_long['Age'] = df_long['Age_Sex'].str.extract('(\d+)').astype(int)
    df_long.drop(columns=['Age_Sex'], inplace=True)
    return df_long

df_life_expectancy_females_long = transform_life_expectancy(dataframes['Life_Expectancy_Females'], 'Female')
df_life_expectancy_males_long = transform_life_expectancy(dataframes['Life_Expectancy_Males'], 'Male')

# Combine the transformed life expectancy dataframes
df_life_expectancy_combined = pd.concat([df_life_expectancy_females_long, df_life_expectancy_males_long])

# Duplicate rows for non-gender-specific datasets, one for each gender
non_gender_specific_dfs = []
for name, df in dataframes.items():
    if df is not dataframes['Life_Expectancy_Females'] and df is not dataframes['Life_Expectancy_Males']:
        df_male = df.copy()
        df_female = df.copy()
        df_male['Sex'] = 'Male'
        df_female['Sex'] = 'Female'
        combined_df = pd.concat([df_male, df_female])
        non_gender_specific_dfs.append(combined_df)

# Combine all non-gender-specific datasets into one dataframe
df_combined_non_gender = pd.concat(non_gender_specific_dfs)

# Merge the transformed life expectancy dataframe with the non-gender-specific dataframes
df_final_merged = pd.merge(df_life_expectancy_combined, df_combined_non_gender, on=['Entity', 'Code', 'Year', 'Sex'], how='left')

# Save the final merged dataframe to a new CSV file
df_final_merged.to_csv('data/final_merged_dataframe.csv', index=False)

ValueError: cannot convert float NaN to integer

In [18]:
df_final_merged.sample(15)

Unnamed: 0,Entity,Code,Year,Remaining_Life_Expectancy,Sex,Age,Value,Electoral democracy index,electdem_vdem_high_owid,electdem_vdem_low_owid,"GDP, PPP (constant 2017 international $)","Indicator:Prevalence of obesity among adults, BMI &GreaterEqual; 30 (crude estimate) (%) - Sex:Both sexes",Historical and more recent expenditure estimates,"Meat, total | 00002943 || Food available for consumption | 0645pc || kilograms per year per capita"
531328,Tajikistan,TJK,1999,28.8536,Female,45,..,,,,,,,
871565,Andorra,AND,1996,68.9522,Male,10,..,,,,,,,
1510979,Togo,TGO,1993,4.2403,Male,80,,,,,5460615000.0,,,
281614,Mexico,MEX,1971,56.249,Female,15,..,,,,,,,
1502530,Slovenia,SVN,2005,6.7888,Male,80,,,,,,17.9,,
1178662,South Sudan,SSD,1966,22.5419,Male,25,,0.206,0.228,0.189,,,,
590702,Iraq,IRQ,2019,15.2264,Female,65,99.9910278320313,,,,,,,
1375182,New Caledonia,NCL,2013,17.8919,Male,65,..,,,,,,,
464821,El Salvador,SLV,2015,34.6043,Female,45,,,,,,,3.90988,
1432238,Chad,TCD,2000,4.7917,Male,80,2.51123,,,,,,,


#### cleansing the  data

In [19]:
# Renaming columns to shorter and more meaningful names
renamed_columns = {
    'Remaining_Life_Expectancy': 'Life_Expectancy',
    'Electoral democracy index': 'Democracy_Index',
    'electdem_vdem_high_owid': 'Democracy_Index_High',
    'electdem_vdem_low_owid': 'Democracy_Index_Low',
    'GDP, PPP (constant 2017 international $)': 'GDP_PPP',
    'Indicator:Prevalence of obesity among adults, BMI &GreaterEqual; 30 (crude estimate) (%) - Sex:Both sexes': 'Obesity_Rate',
    'Historical and more recent expenditure estimates': 'Expenditure_Estimates',
    'Meat, total | 00002943 || Food available for consumption | 0645pc || kilograms per year per capita': 'Meat_Supply_per_Capita'
}

df.rename(columns=renamed_columns, inplace=True)

# Replace '..' with NaN
df.replace('..', pd.NA, inplace=True)

# Convert columns that should be numeric but are currently objects due to the '..' values
numeric_columns = ['Value', 'Democracy_Index', 'Democracy_Index_High', 'Democracy_Index_Low', 
                    'GDP_PPP', 'Obesity_Rate', 'Expenditure_Estimates', 'Meat_Supply_per_Capita']

 for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

 df.head()



KeyError: 'Value'

#### descriptive Analysis