In [None]:
import pandas as pd #Import pandas library for data manipulation
import matplotlib.pyplot as plt #Import matplotlib for plotting
%matplotlib inline
import scipy.stats as stats
import numpy as np
import matplotlib.dates as mdates

#### CO2 Emissions per Capita (2023)

In [None]:
# Read the raw CO2 emissions table from disk.
co2_emissions = pd.read_csv('./Data-Files/CO2_Emissions.csv')

# Keep the two identifying columns plus the 2023 figures, and give each a
# shorter, analysis-friendly name.
co2_2023 = co2_emissions[['Country Name', 'Country Code', '2023']].rename(
    columns={
        'Country Name': 'Country',
        'Country Code': 'Code',
        '2023': 'CO2_Emissions_2023',
    }
)

# Preview the first rows of the selection.
co2_2023.head()

Unnamed: 0,Country,Code,CO2_Emissions_2023
0,Aruba,ABW,4.936708
1,Africa Eastern and Southern,AFE,0.784641
2,Afghanistan,AFG,0.210043
3,Africa Western and Central,AFW,0.482217
4,Angola,AGO,0.768163


In [None]:
# List every row whose 2023 CO2 emissions value is null
co2_2023.loc[co2_2023['CO2_Emissions_2023'].isnull()]

Unnamed: 0,Country,Code,CO2_Emissions_2023
6,Andorra,AND,
38,Channel Islands,CHI,
51,Curacao,CUW,
108,Isle of Man,IMN,
110,Not classified,INX,
137,Liechtenstein,LIE,
147,St. Martin (French part),MAF,
149,Monaco,MCO,
162,Montenegro,MNE,
196,West Bank and Gaza,PSE,


**Missing Data**

We cannot interpolate the null values, nor can we replace them with the mean, because each country's CO2 emissions are independent of those of other countries. Additionally, many other factors play a role in determining the amount of CO2 a country emits, and data on these factors have not been provided. Hence, to deal with the missing data, we drop the rows with null values.

In [None]:
# Discard every row (axis=0) that has no 2023 CO2 emissions value; only the
# emissions column is inspected when deciding which rows to drop.
co2_2023 = co2_2023.dropna(axis=0, how='any', subset=['CO2_Emissions_2023'])

# Preview the cleaned frame.
co2_2023.head()

Unnamed: 0,Country,Code,CO2_Emissions_2023
0,Aruba,ABW,4.936708
1,Africa Eastern and Southern,AFE,0.784641
2,Afghanistan,AFG,0.210043
3,Africa Western and Central,AFW,0.482217
4,Angola,AGO,0.768163


In [None]:
# Descriptive statistics for the 2023 CO2 emissions column.
co2_values = co2_2023['CO2_Emissions_2023']  # bind the column once for reuse

# Central tendency and spread
mean_co2 = co2_values.mean()
std_co2 = co2_values.std()
median_co2 = co2_values.median()

# Selected percentiles (5th, 25th, 75th, 95th)
p_5_co2 = co2_values.quantile(0.05)
p_25_co2 = co2_values.quantile(0.25)
p_75_co2 = co2_values.quantile(0.75)
p_95_co2 = co2_values.quantile(0.95)

# Column-oriented mapping: one list of statistic labels and one list of the
# matching values, kept in the same order.
summary_dict = dict(
    Statistic=[
        'Mean',
        'Standard Deviation',
        'Median',
        '5th Percentile',
        '25th Percentile',
        '75th Percentile',
        '95th Percentile',
    ],
    Value=[
        mean_co2,
        std_co2,
        median_co2,
        p_5_co2,
        p_25_co2,
        p_75_co2,
        p_95_co2,
    ],
)

# Turn the mapping into a two-column table for display.
summary_df = pd.DataFrame(summary_dict)

print("Summary Statistics for CO2 Emissions in 2023")
summary_df.head(7)  # the table has exactly seven rows, so this shows all of it

Summary Statistics for CO2 Emissions in 2023


Unnamed: 0,Statistic,Value
0,Mean,4.458793
1,Standard Deviation,7.166779
2,Median,2.602447
3,5th Percentile,0.089705
4,25th Percentile,0.724544
5,75th Percentile,5.442896
6,95th Percentile,14.181638


#### Primary Completion Rate (2023)

In [None]:
# Read the raw primary completion rate table from disk.
primary_completion = pd.read_csv('./Data-Files/Primary_completion_rate.csv')

# Source column -> analysis column; the dict order is the column order.
pc_column_map = {
    'Country Name': 'Country',
    'Country Code': 'Code',
    '2023': 'Primary_Completion_2023',
}

# Select the source columns, then relabel them in place on the selection.
PC_2023 = primary_completion[list(pc_column_map)]
PC_2023.columns = list(pc_column_map.values())

# Preview the first rows of the selection.
PC_2023.head()


Unnamed: 0,Country,Code,Primary_Completion_2023
0,Aruba,ABW,
1,Africa Eastern and Southern,AFE,
2,Afghanistan,AFG,
3,Africa Western and Central,AFW,68.049103
4,Angola,AGO,


In [None]:
# Count the null entries in the 2023 primary completion column ...
NAN_count = PC_2023['Primary_Completion_2023'].isnull().sum()
# ... and the total number of entries in that column (nulls included).
dataset_size = len(PC_2023['Primary_Completion_2023'])

print(f"Number of missing values in Primary Completion Rate 2023: {NAN_count}")
print(f"Total number of entries in Primary Completion Rate 2023: {dataset_size}")

Number of missing values in Primary Completion Rate 2023: 146
Total number of entries in Primary Completion Rate 2023: 266


**Missing Data**

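It is clear that over 50% of the data is missing (146 of 266 entries). Dropping those rows would discard more than half of the dataset, so in this scenario we instead populate the missing values with the mean of the observed data. Note that mean imputation leaves the mean unchanged but shrinks the standard deviation and pulls the median and quartiles toward the mean, so the summary statistics below should be interpreted with that in mind.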


In [None]:
# Impute missing 2023 primary completion rates with the overall mean of the
# observed values. This is a single mean taken across all rows of the column,
# not a separate mean per country. Series.mean() skips NaN by default, so only
# reported values contribute to it.
#
# `assign` returns a new frame rather than writing through `.loc` into a frame
# that was itself selected out of another DataFrame; that in-place pattern is
# what triggers pandas' SettingWithCopyWarning. The resulting values are the
# same, and re-running the cell is harmless because no NaN remains to fill.
PC_2023 = PC_2023.assign(
    Primary_Completion_2023=PC_2023['Primary_Completion_2023'].fillna(
        PC_2023['Primary_Completion_2023'].mean()
    )
)
PC_2023.head()

Unnamed: 0,Country,Code,Primary_Completion_2023
0,Aruba,ABW,88.501124
1,Africa Eastern and Southern,AFE,88.501124
2,Afghanistan,AFG,88.501124
3,Africa Western and Central,AFW,68.049103
4,Angola,AGO,88.501124


In [None]:
# Descriptive statistics for the 2023 primary completion rate column.
pc_values = PC_2023['Primary_Completion_2023']  # bind the column once for reuse

# Central tendency and spread
mean_PC = pc_values.mean()
std_PC = pc_values.std()
median_PC = pc_values.median()

# Selected percentiles (5th, 25th, 75th, 95th)
p_5_PC = pc_values.quantile(0.05)
p_25_PC = pc_values.quantile(0.25)
p_75_PC = pc_values.quantile(0.75)
p_95_PC = pc_values.quantile(0.95)

# Column-oriented mapping: statistic labels and their values, in matching order.
summary_dict_PC = dict(
    Statistic=[
        'Mean',
        'Standard Deviation',
        'Median',
        '5th Percentile',
        '25th Percentile',
        '75th Percentile',
        '95th Percentile',
    ],
    Value=[
        mean_PC,
        std_PC,
        median_PC,
        p_5_PC,
        p_25_PC,
        p_75_PC,
        p_95_PC,
    ],
)

# Turn the mapping into a two-column table for display.
summary_df_PC = pd.DataFrame(summary_dict_PC)

print("Summary Statistics for Primary Completion Rate in 2023")
summary_df_PC.head(7)  # the table has exactly seven rows, so this shows all of it

Summary Statistics for Primary Completion Rate in 2023


Unnamed: 0,Statistic,Value
0,Mean,88.501124
1,Standard Deviation,9.851344
2,Median,88.501124
3,5th Percentile,68.640034
4,25th Percentile,88.501124
5,75th Percentile,90.167347
6,95th Percentile,102.764305
