In [1]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Use seaborn's "darkgrid" style for the figures drawn in this notebook.
sns.set_style(style='darkgrid')
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline
# NOTE(review): this installs an "ignore" filter for every warning category,
# so deprecation and dtype warnings from the libraries above are hidden too.
# Confirm that is intended, or narrow the filter to a specific category.
warnings.filterwarnings(action='ignore')

In [4]:
# Load the raw CSV from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer="./data/raw/FIN_DATA.csv")

In [5]:
# Work on an independent (deep) copy so the frame held in `data` stays untouched
fin_data_df = data.copy(deep=True)

In [6]:
# Preview the first five rows to sanity-check the columns and their values
fin_data_df.head(n=5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [7]:
# Print a structural summary of the frame: row count, column names,
# non-null counts, dtypes and memory usage
fin_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


# Data Wrangling

In [8]:
# Count the missing values in each column
fin_data_df.isna().sum()

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [12]:
# Count rows that are exact duplicates of an earlier row (the first occurrence is not counted).
# NOTE(review): 'uniqueid' is still a column when this check runs, so rows that
# differ only in their id are treated as distinct.
fin_data_df.duplicated(keep='first').sum()

0

In [13]:
# Drop the id column since it will not be used for any analyses.
# errors='ignore' keeps this cell safe to re-run: once 'uniqueid' has been
# removed, a second execution is a no-op instead of raising KeyError.
fin_data_df = fin_data_df.drop(columns='uniqueid', errors='ignore')

In [16]:
# Parse the integer 'year' column into datetimes; with format='%Y' each value
# becomes January 1st of that year
fin_data_df['year'] = pd.to_datetime(arg=fin_data_df['year'], format='%Y')

In [20]:
# Creating a function to group ages
def age_group(age):
    """Return the age-bracket label for a single age.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str
        '1-17'  for ages below 18,
        '18-35' for ages from 18 up to and including 35,
        '36-60' for ages above 35 up to and including 60,
        '60+'   for anything else (ages above 60).
    """
    if age < 18:
        return '1-17'
    # Each branch only needs an upper bound because the lower bound is implied
    # by the branch before it; this guarantees the brackets have no gaps, so
    # an age of exactly 18 lands in '18-35' rather than falling through.
    elif age <= 35:
        return '18-35'
    elif age <= 60:
        return '36-60'
    else:
        return '60+'


# Label every row with its age bracket in a new 'age_group' column
fin_data_df['age_group'] = fin_data_df['age_of_respondent'].map(age_group)

In [26]:
# Count the distinct (non-missing) values in each column
fin_data_df.nunique(axis=0, dropna=True)

country                    4
year                       3
bank_account               2
location_type              2
cellphone_access           2
household_size            20
age_of_respondent         85
gender_of_respondent       2
relationship_with_head     6
marital_status             5
education_level            6
job_type                  10
age_group                  4
dtype: int64