In [1]:
import json
import pandas as pd
import pycountry

In [2]:
# Load the JSON data from the file
with open('population_lifeExpectancy_dataset.json', 'r') as f:
    data = json.load(f)
    
df = pd.read_csv(r'E:\University\Semester 5\DAV\Project\Dataset.csv')

In [3]:
# Function to get ISO3 code for a given country name
def get_iso3(country_name):
    try:
        # Use pycountry to get the country object
        country = pycountry.countries.get(name=country_name)
        if country:
            return country.alpha_3  # Return the ISO3 code
        else:
            return None  # Return None if no country is found
    except KeyError:
        return None  # Return None if there is an issue retrieving the country


In [4]:
# Create an empty list to store the final flattened data
flattened_data = []

# Define the new year range (2000-2019)
new_years = list(range(2000, 2020))

# Loop through all countries in the dataset
for country_data in data:
    country_name = country_data['name']
    region = country_data['region']
    
    # Get the ISO3 code for the country
    iso3_code = get_iso3(country_name)
    
    # Flatten the income, population, and lifeExpectancy data for the current country
    income_df = pd.DataFrame(country_data['income'], columns=['Year', 'Income'])
    population_df = pd.DataFrame(country_data['population'], columns=['Year', 'Population'])
    life_expectancy_df = pd.DataFrame(country_data['lifeExpectancy'], columns=['Year', 'LifeExpectancy'])
    
    # Merge the three DataFrames on the 'Year' column
    merged_df = income_df.merge(population_df, on='Year').merge(life_expectancy_df, on='Year')
    
    # Add the 'Country', 'Region', and 'ISO3' columns
    merged_df['Country'] = country_name
    merged_df['Region'] = region
    merged_df['ISO3'] = iso3_code
    
    # Drop the 'Income' column as we no longer need it
    merged_df = merged_df.drop(columns=['Income'])
    
    # Filter out the last 20 years of data for the current country
    last_20_years = merged_df.tail(20)
    
    # Replace the 'Year' column with the new year range (2000-2019)
    last_20_years['Year'] = new_years[:len(last_20_years)]  # Adjust the number of years if fewer than 20 rows exist
    
    # Ensure that 'Year' is an integer column, not float
    last_20_years['Year'] = last_20_years['Year'].astype(int)
    
    # Append the modified data to the final list
    flattened_data.append(last_20_years)

# Concatenate all country DataFrames into one large DataFrame
final_df = pd.concat(flattened_data, ignore_index=True)

# Reorder the columns for better readability
final_df = final_df[['Country', 'Region', 'ISO3', 'Year', 'Population', 'LifeExpectancy']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_20_years['Year'] = new_years[:len(last_20_years)]  # Adjust the number of years if fewer than 20 rows exist
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_20_years['Year'] = last_20_years['Year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_20_years['Year'] = new_years[

In [5]:
# Merge the two datasets on 'iso3' and 'year'
merged_data = pd.merge(df, final_df[['ISO3', 'Year', 'Population', 'LifeExpectancy']], 
                       how='inner', 
                       left_on=['iso3', 'year'], 
                       right_on=['ISO3', 'Year'])

# Drop unnecessary columns (e.g., duplicate columns from the merge)
merged_data = merged_data.drop(columns=['ISO3', 'Year'])

# Reorder the columns for better readability
merged_data = merged_data[['iso3', 'year', 'Tuberculosis_Deaths', 'Country', 'region', 'subregion', 'LifeExpectancy', 'Population']]

merged_data

Unnamed: 0,iso3,year,Tuberculosis_Deaths,Country,region,subregion,LifeExpectancy,Population
0,AFG,2000,6370.3470,Afghanistan,Asia,Southern Asia,41.15,14511868
1,AFG,2001,6116.2660,Afghanistan,Asia,Southern Asia,41.32,14669339
2,AFG,2002,5808.6480,Afghanistan,Asia,Southern Asia,41.46,14871963
3,AFG,2003,6188.5140,Afghanistan,Asia,Southern Asia,41.58,16317921
4,AFG,2004,5735.9200,Afghanistan,Asia,Southern Asia,41.67,18371583
...,...,...,...,...,...,...,...,...
2875,ZWE,2015,648.9602,Zimbabwe,Africa,Eastern Africa,41.28,12083553
2876,ZWE,2016,720.6268,Zimbabwe,Africa,Eastern Africa,41.68,12160782
2877,ZWE,2017,956.3198,Zimbabwe,Africa,Eastern Africa,42.40,12236805
2878,ZWE,2018,1046.2450,Zimbabwe,Africa,Eastern Africa,43.36,12311143


In [6]:
# Group by 'Country' to start restructuring
grouped_data = merged_data.groupby('Country')

# Prepare the final list to hold the restructured data
final_data = []

# Iterate over each country and restructure the data
for country, group in grouped_data:
    country_data = {
        "name": country,
        "region": group['region'].iloc[0],  
        "subregion": group['subregion'].iloc[0],
        "population": [],  
        "lifeExpectancy": [], 
        "Tubercolosis_Deaths": []  
        
    }
    
    # Loop through each row in the group (for the current country)
    for _, row in group.iterrows():
        country_data['population'].append([row['year'], row['Population']])
        country_data['lifeExpectancy'].append([row['year'], row['LifeExpectancy']])
        country_data['Tubercolosis_Deaths'].append([row['year'], row['Tuberculosis_Deaths']])
    
    final_data.append(country_data)

# Save the result to a JSON file
with open('final_data.json', 'w') as f:
    json.dump(final_data, f, indent=4)

print("Data has been restructured and saved to 'restructured_data.json'")


Data has been restructured and saved to 'restructured_data.json'


In [7]:
# print unique regions
print("Unique regions in the dataset:")
print(merged_data['region'].unique())

Unique regions in the dataset:
['Asia' 'Africa' 'Europe' 'Americas' 'Oceania']
