In [None]:
# required packages
import pandas as pd
import numpy as np

In [2]:
# Read the normalized population CSV into a DataFrame and preview the first rows
population_csv_path = (
    'Datasets_MS_Project/Annual_population/'
    'Population_E_All_Data_(Normalized)/'
    'Population_E_All_Data_(Normalized).csv'
)
raw_data = pd.read_csv(population_csv_path)
raw_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1950,1950,1000 No,7776.176,X,
1,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1951,1951,1000 No,7879.339,X,
2,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1952,1952,1000 No,7987.783,X,
3,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1953,1953,1000 No,8096.698,X,
4,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1954,1954,1000 No,8207.95,X,


In [3]:
# (rows, columns) of the raw frame, as a quick size check after loading
raw_data.shape

(168689, 13)

In [4]:
# distinct values of the 'Item' column, to see how many items the file contains
raw_data['Item'].unique()

array(['Population - Est. & Proj.'], dtype=object)

In [5]:
# distinct values of the 'Element' column; these are the candidates for filtering
raw_data['Element'].unique()

array(['Total Population - Both sexes', 'Total Population - Male',
       'Total Population - Female', 'Rural population',
       'Urban population'], dtype=object)

The annual population dataset is simple: it contains just one item and
five elements. Of the five elements, we keep three: total population,
rural population, and urban population. For the kind of analysis we are
doing, we do not need the separate male and female population figures.

In [6]:
# Keep only the rows whose 'Element' is one of the three series of interest
elements_to_keep = [
    'Total Population - Both sexes',
    'Rural population',
    'Urban population',
]
keep_mask = raw_data['Element'].isin(elements_to_keep)
filtered_data = raw_data.loc[keep_mask]
filtered_data.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1950,1950,1000 No,7776.176,X,
1,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1951,1951,1000 No,7879.339,X,
2,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1952,1952,1000 No,7987.783,X,
3,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1953,1953,1000 No,8096.698,X,
4,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1954,1954,1000 No,8207.95,X,


In [8]:
# sanity check: only the elements listed in elements_to_keep should remain
filtered_data['Element'].unique()

array(['Total Population - Both sexes', 'Rural population',
       'Urban population'], dtype=object)

In [9]:
# number of rows per retained element; unequal counts mean the elements do not
# cover the same set of rows and will leave gaps once pivoted into columns
filtered_data['Element'].value_counts()

Element
Total Population - Both sexes    39049
Rural population                 25771
Urban population                 25771
Name: count, dtype: int64

In [11]:
# distinct units among the retained rows; a single unit means the 'Value'
# figures are directly comparable across elements
filtered_data['Unit'].unique()

array(['1000 No'], dtype=object)

In [None]:
# Restructure from long format (one row per area / year / element) to wide
# format (one row per area / year, one column per element).
# NOTE: pivot_table aggregates with the mean by default, so duplicate rows for
# the same index/element combination would be averaged rather than reported.
pivoted_data = (
    filtered_data
    .pivot_table(
        index=['Area Code', 'Area', 'Year Code', 'Year'],
        columns='Element',
        values='Value',
    )
    # turn the index levels back into ordinary columns (no in-place mutation)
    .reset_index()
    # drop the leftover 'Element' label from the columns axis
    .rename_axis(None, axis='columns')
)

pivoted_data.head()

Unnamed: 0,Area Code,Area,Year Code,Year,Rural population,Total Population - Both sexes,Urban population
0,1,Armenia,1992,1992,1140.314,3571.861,2302.496
1,1,Armenia,1993,1993,1123.071,3453.332,2240.027
2,1,Armenia,1994,1994,1105.535,3364.61,2178.125
3,1,Armenia,1995,1995,1092.052,3307.581,2125.29
4,1,Armenia,1996,1996,1084.134,3278.735,2084.081


In [12]:
# dtypes and non-null counts per column; shows how many area/year rows lack
# a value for each population column after pivoting
pivoted_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39251 entries, 0 to 39250
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area Code                      39251 non-null  int64  
 1   Area                           39251 non-null  object 
 2   Year Code                      39251 non-null  int64  
 3   Year                           39251 non-null  int64  
 4   Rural population               25771 non-null  float64
 5   Total Population - Both sexes  39049 non-null  float64
 6   Urban population               25771 non-null  float64
dtypes: float64(3), int64(3), object(1)
memory usage: 2.1+ MB


In [14]:
# Rename the original headers to snake_case column names
column_renames = {
    'Area Code': 'area_code',
    'Area': 'area',
    'Year Code': 'year_code',
    'Year': 'year',
    'Rural population': 'rural_population',
    'Total Population - Both sexes': 'total_population',
    'Urban population': 'urban_population',
}
cleaned_data = pivoted_data.rename(columns=column_renames)
cleaned_data.head()

Unnamed: 0,area_code,area,year_code,year,rural_population,total_population,urban_population
0,1,Armenia,1992,1992,1140.314,3571.861,2302.496
1,1,Armenia,1993,1993,1123.071,3453.332,2240.027
2,1,Armenia,1994,1994,1105.535,3364.61,2178.125
3,1,Armenia,1995,1995,1092.052,3307.581,2125.29
4,1,Armenia,1996,1996,1084.134,3278.735,2084.081


In [16]:
# exporting cleaned data as csv file
# index must be the boolean False to leave the row index out of the file; the
# string 'False' is truthy, so it would still write the index as an extra
# unnamed first column.
cleaned_data.to_csv('cleaned_datasets/annual_population_cleaned.csv', index=False)