# Load the dataset

In [None]:
import pandas as pd

# Location of the raw IRENA off-grid statistics workbook, relative to this notebook.
raw_path = "../1_datasets/raw_data/IRENA_OFGStats.raw.xlsx"

# Read the "data" sheet; `raw_df` is kept as the untouched loaded frame.
raw_df = pd.read_excel(raw_path, sheet_name="data")

# The cleaning steps operate on a separate copy.
df = raw_df.copy()
df.head()

Unnamed: 0,Region,UN Sub-region,Country,IRENA Label,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Product Code,DataType,Value,Unit,Year,Ptype,Publication
0,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2000,220,2024 OFG RE Statistics
1,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2001,220,2024 OFG RE Statistics
2,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2002,220,2024 OFG RE Statistics
3,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2003,220,2024 OFG RE Statistics
4,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2004,220,2024 OFG RE Statistics


In [27]:
# Print each column's dtype and non-null count, then display (rows, columns).
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52343 entries, 0 to 52342
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Region            52343 non-null  object 
 1   UN Sub-region     52343 non-null  object 
 2   Country           52343 non-null  object 
 3   IRENA Label       52343 non-null  object 
 4   ISO Code          52343 non-null  object 
 5   Flow              52343 non-null  object 
 6   Group Technology  52343 non-null  object 
 7   Sub-Technology    52343 non-null  object 
 8   Technology        52343 non-null  object 
 9   Product Code      51205 non-null  float64
 10  DataType          52343 non-null  object 
 11  Value             52321 non-null  float64
 12  Unit              52343 non-null  object 
 13  Year              52343 non-null  int64  
 14  Ptype             52343 non-null  int64  
 15  Publication       52343 non-null  object 
dtypes: float64(2), int64(2), object(12)
memo

(52343, 16)

In [28]:
# Count the missing entries in every column.
df.isna().sum()

Region                 0
UN Sub-region          0
Country                0
IRENA Label            0
ISO Code               0
Flow                   0
Group Technology       0
Sub-Technology         0
Technology             0
Product Code        1138
DataType               0
Value                 22
Unit                   0
Year                   0
Ptype                  0
Publication            0
dtype: int64

- **The "Product Code" column has many null values (1138), but it is not relevant to our study, so it will be dropped along with: UN Sub-region, IRENA Label, Ptype, Publication, and DataType.**

In [29]:
# Columns that are not needed for the study.
columns_to_drop = [
    "UN Sub-region",
    "IRENA Label",
    "Ptype",
    "Publication",
    "Product Code",
    "DataType",
]

# Build `df` from the untouched `raw_df` instead of dropping in place, so this
# cell can be re-run without raising a KeyError for already-removed columns.
df = raw_df.drop(columns=columns_to_drop)

In [30]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year
0,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2000
1,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2001
2,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2002
3,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2003
4,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2004


- **Now, we should handle the 22 missing values in the "Value" Column**

In [31]:
# Boolean mask marking the rows whose "Value" entry is missing.
value_is_missing = df["Value"].isna()

# Keep only those rows for inspection.
missing_rows = df.loc[value_is_missing]
missing_rows

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year
39940,Central America and the Caribbean,Bahamas (the),BHS,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
39946,Central America and the Caribbean,Belize,BLZ,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
39957,Central America and the Caribbean,Costa Rica,CRI,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
39971,Central America and the Caribbean,Dominica,DMA,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40021,Central America and the Caribbean,Nicaragua,NIC,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40035,Central America and the Caribbean,Puerto Rico,PRI,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40082,South America,Brazil,BRA,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40089,South America,Chile,CHL,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40141,Africa,Egypt,EGY,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40169,Africa,Sudan (the),SDN,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023


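- **Most of the rows with a missing value belong to countries outside the scope of our study, so we simply fill the null values with zeros. Note that this records a missing measurement as 0 MW, which should be kept in mind when interpreting these rows.**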

# Filling the missing values with zeros

In [36]:
# Replace only the missing entries of the "Value" column with 0, so that NaNs
# in any other column are never silently overwritten.
# NOTE(review): a 0 written here stands for a missing entry, not a measured 0 MW.
df2 = df.fillna(value={"Value": 0})

# Total number of nulls left in the whole frame (expected to be 0).
df2.isnull().sum().sum()

0

- **Now, the resulting dataset will be saved to the "cleaned_data" sub-folder of the "1_datasets" folder.**

In [37]:
# Destination workbook for the cleaned data, relative to this notebook.
cleaned_path = "../1_datasets/cleaned_data/IRENA_OFGStats.cleaned.xlsx"

# Write the cleaned frame to a single sheet, without the row index.
df2.to_excel(cleaned_path, sheet_name="Cleaned_data", index=False)