## 1- Look at the big picture and frame the problem.

In [1]:
# Install required library
%pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting kaggle (from opendatasets)
  Downloading kaggle-1.5.16.tar.gz (83 kB)
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     ---------------------------------------

In [4]:
# Import libraries

import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od

## 2- Load the dataset

In [None]:
# Download the dataset
# Fetches the Kaggle dataset identified by `url` via opendatasets.
# NOTE(review): presumably this creates a folder named after the dataset slug
# in the current working directory (the load cell reads from
# "./co2-emission-by-countries-year-wise-17502022/") - confirm.
# NOTE(review): Kaggle downloads normally need API credentials; verify how
# opendatasets obtains them before running this on a fresh machine.
url = "https://www.kaggle.com/datasets/moazzimalibhatti/co2-emission-by-countries-year-wise-17502022"
od.download(url)

In [5]:
# Read the emissions CSV into a DataFrame, decoding the file as Latin-1
file_path = "./co2-emission-by-countries-year-wise-17502022/CO2 emission by countries.csv"
countries_emissions = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")

### 2.1 Look at the data structure

In [7]:
# Preview the first 25 rows to see the column layout and typical values.
countries_emissions.head(25)

Unnamed: 0,Country,Code,Calling Code,Year,CO2 emission (Tons),Population(2022),Area,% of World,Density(km2)
0,Afghanistan,AF,93,1750,0.0,41128771.0,652230.0,0.40%,63/km²
1,Afghanistan,AF,93,1751,0.0,41128771.0,652230.0,0.40%,63/km²
2,Afghanistan,AF,93,1752,0.0,41128771.0,652230.0,0.40%,63/km²
3,Afghanistan,AF,93,1753,0.0,41128771.0,652230.0,0.40%,63/km²
4,Afghanistan,AF,93,1754,0.0,41128771.0,652230.0,0.40%,63/km²
5,Afghanistan,AF,93,1755,0.0,41128771.0,652230.0,0.40%,63/km²
6,Afghanistan,AF,93,1756,0.0,41128771.0,652230.0,0.40%,63/km²
7,Afghanistan,AF,93,1757,0.0,41128771.0,652230.0,0.40%,63/km²
8,Afghanistan,AF,93,1758,0.0,41128771.0,652230.0,0.40%,63/km²
9,Afghanistan,AF,93,1759,0.0,41128771.0,652230.0,0.40%,63/km²


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
countries_emissions.describe()

Unnamed: 0,Year,CO2 emission (Tons),Population(2022),Area
count,59620.0,59620.0,53116.0,55284.0
mean,1885.0,1034774000.0,39922600.0,652207.3
std,78.231085,10416520000.0,148236500.0,1865483.0
min,1750.0,0.0,11312.0,21.0
25%,1817.0,0.0,1770414.0,17704.5
50%,1885.0,0.0,8673095.0,110381.5
75%,1953.0,8715092.0,28629200.0,492573.0
max,2020.0,417000000000.0,1425887000.0,17098240.0


In [5]:
# Check the number of non-null values and the dtype of each column;
# a column whose non-null count is below the row count contains missing values.
countries_emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59620 entries, 0 to 59619
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Country              59620 non-null  object 
 1   Code                 57452 non-null  object 
 2   Calling Code         56097 non-null  object 
 3   Year                 59620 non-null  int64  
 4   CO2 emission (Tons)  59620 non-null  float64
 5   Population(2022)     53116 non-null  float64
 6   Area                 55284 non-null  float64
 7   % of World           55284 non-null  object 
 8   Density(km2)         53116 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 4.1+ MB


## 3- Clean the data

In [6]:
# Check for duplicate values: count the rows that exactly repeat an earlier row
# (duplicated() compares all columns by default).
countries_emissions.duplicated().sum()

0

In [8]:
# Find the number of missing values in each column
# (isna() flags the missing cells; sum() counts the flags per column).
countries_emissions.isna().sum()

Country                   0
Code                   2168
Calling Code           3523
Year                      0
CO2 emission (Tons)       0
Population(2022)       6504
Area                   4336
% of World             4336
Density(km2)           6504
dtype: int64

In [9]:
# Count, per column, how many cells are exactly equal to 0
countries_emissions.eq(0).sum()

Country                    0
Code                       0
Calling Code               0
Year                       0
CO2 emission (Tons)    37913
Population(2022)           0
Area                       0
% of World                 0
Density(km2)               0
dtype: int64

In [12]:
# Keep only the rows whose "CO2 emission (Tons)" value is non-zero.
# NOTE(review): zero presumably means "no emissions recorded for that year" -
# confirm against the dataset description before treating it as missing data.
# .copy() makes the result an independent DataFrame, so later cleaning steps
# can assign to its columns without pandas raising SettingWithCopyWarning.
countries_emissions_filtered = countries_emissions.loc[
    countries_emissions["CO2 emission (Tons)"] != 0
].copy()

# Display the resulting DataFrame (pandas truncates the rendering to the first and last rows)
countries_emissions_filtered

Unnamed: 0,Country,Code,Calling Code,Year,CO2 emission (Tons),Population(2022),Area,% of World,Density(km2)
199,Afghanistan,AF,93,1949,14656.0,41128771.0,652230.0,0.40%,63/km²
200,Afghanistan,AF,93,1950,98928.0,41128771.0,652230.0,0.40%,63/km²
201,Afghanistan,AF,93,1951,190528.0,41128771.0,652230.0,0.40%,63/km²
202,Afghanistan,AF,93,1952,282128.0,41128771.0,652230.0,0.40%,63/km²
203,Afghanistan,AF,93,1953,388384.0,41128771.0,652230.0,0.40%,63/km²
...,...,...,...,...,...,...,...,...,...
59615,Zimbabwe,ZW,263,2016,736467042.0,16320537.0,390757.0,0.30%,42/km²
59616,Zimbabwe,ZW,263,2017,746048675.0,16320537.0,390757.0,0.30%,42/km²
59617,Zimbabwe,ZW,263,2018,757903042.0,16320537.0,390757.0,0.30%,42/km²
59618,Zimbabwe,ZW,263,2019,768852126.0,16320537.0,390757.0,0.30%,42/km²


In [13]:
# Find the number of missing values per column that remain after the
# zero-emission rows were removed.
countries_emissions_filtered.isna().sum()

Country                   0
Code                    747
Calling Code           1059
Year                      0
CO2 emission (Tons)       0
Population(2022)       1561
Area                   1038
% of World             1038
Density(km2)           1561
dtype: int64