# CLEANING DATA IN PYTHON


# Constraints

In [2]:
import pandas as pd

# Read the life-expectancy CSV into a DataFrame and print the row at
# position 1 as a quick look at the columns and their values.
# NOTE(review): the path below is absolute and machine-specific; the cell only
# runs where this exact file exists.
csv_path = r"c:/users/danhaya/arewads/arewads-fellowship/life_expectancy.csv"
data = pd.read_csv(csv_path)
second_row = data.iloc[1]
print(second_row)

Country Name                                   Angola
Country Code                                      AGO
Region                             Sub-Saharan Africa
IncomeGroup                       Lower middle income
Year                                             2001
Life Expectancy World Bank                     47.059
Prevelance of Undernourishment                   67.5
CO2                                           15960.0
Health Expenditure %                         4.483516
Education Expenditure %                           NaN
Unemployment                                    4.004
Corruption                                        NaN
Sanitation                                        NaN
Injuries                                   1392080.71
Communicable                              11190210.53
NonCommunicable                            2663516.34
Name: 1, dtype: object


In [2]:
# Show the dtype pandas assigned to each column when reading the CSV.
column_dtypes = data.dtypes
print(column_dtypes)

Country Name                       object
Country Code                       object
Region                             object
IncomeGroup                        object
Year                                int64
Life Expectancy World Bank        float64
Prevelance of Undernourishment    float64
CO2                               float64
Health Expenditure %              float64
Education Expenditure %           float64
Unemployment                      float64
Corruption                        float64
Sanitation                        float64
Injuries                          float64
Communicable                      float64
NonCommunicable                   float64
dtype: object


In [3]:
# Normalise the column labels in one pass: lower-case every name, then
# replace each space with an underscore.
data.columns = data.columns.str.lower().str.replace(' ', '_')
print(data.columns)

Index(['country_name', 'country_code', 'region', 'incomegroup', 'year',
       'life_expectancy_world_bank', 'prevelance_of_undernourishment', 'co2',
       'health_expenditure_%', 'education_expenditure_%', 'unemployment',
       'corruption', 'sanitation', 'injuries', 'communicable',
       'noncommunicable'],
      dtype='object')


In [11]:
# Label columns to store with pandas' category dtype.
cat_columns = [
    'country_name',
    'country_code',
    'region',
    'incomegroup',
]
data[cat_columns] = data[cat_columns].astype('category')

# Preview the first rows of the categorical columns only.
categorical_preview = data.select_dtypes(include='category').head()
print(categorical_preview)

           country_name country_code                      region  \
0           Afghanistan          AFG                  South Asia   
1                Angola          AGO          Sub-Saharan Africa   
2               Albania          ALB       Europe & Central Asia   
3               Andorra          AND       Europe & Central Asia   
4  United Arab Emirates          ARE  Middle East & North Africa   

           incomegroup  
0           Low income  
1  Lower middle income  
2  Upper middle income  
3          High income  
4          High income  


# Data type constraints

In [7]:
# Print the first ten rows of the frame.
print(data.head(n=10))

           country_name country_code                      region  \
0           Afghanistan          AFG                  South Asia   
1                Angola          AGO          Sub-Saharan Africa   
2               Albania          ALB       Europe & Central Asia   
3               Andorra          AND       Europe & Central Asia   
4  United Arab Emirates          ARE  Middle East & North Africa   
5             Argentina          ARG   Latin America & Caribbean   
6               Armenia          ARM       Europe & Central Asia   
7        American Samoa          ASM         East Asia & Pacific   
8   Antigua and Barbuda          ATG   Latin America & Caribbean   
9             Australia          AUS         East Asia & Pacific   

           incomegroup  year  life_expectancy_world_bank  \
0           Low income  2001                   56.308000   
1  Lower middle income  2001                   47.059000   
2  Upper middle income  2001                   74.288000   
3          

In [4]:
# Look up the dtype of the year column before converting it.
data.dtypes['year']

dtype('int64')

In [8]:
# Parse the integer years as calendar years and keep only the year component.
# format='%Y' is required here: without an explicit format, pd.to_datetime
# interprets integers as nanosecond offsets from the Unix epoch, so a value
# such as 2001 becomes 1970-01-01 and every row would end up with year 1970.
data['year'] = pd.to_datetime(data['year'], format='%Y').dt.year

data['year'].dtype

dtype('int32')

In [9]:
# List the distinct income-group labels present in the column.
data.loc[:, 'incomegroup'].unique()

array(['Low income', 'Lower middle income', 'Upper middle income',
       'High income'], dtype=object)

In [14]:
# Inspect the dtype (and its categories) of the income-group column.
data.dtypes['incomegroup']

CategoricalDtype(categories=['High income', 'Low income', 'Lower middle income',
                  'Upper middle income'],
, ordered=False, categories_dtype=object)

In [25]:
# Collapse the income-group labels into two coarse buckets: 'Low' and 'Upper'.
map_column = {
    'Low income': 'Low',
    # 'Low middle income' is not among the labels listed by .unique() above;
    # the key is kept so such a spelling would still be mapped if it appeared.
    'Low middle income': 'Low',
    'Lower middle income': 'Low',
    'Upper middle income': 'Upper',
    'High income': 'Upper',
}

# Replace on plain object values rather than on the categorical itself:
# using Series.replace to merge categories of a categorical column is
# deprecated in recent pandas. Labels not present in the mapping are left
# untouched, so re-running the cell is harmless. The result is cast back to
# 'category' so the .cat accessor keeps working afterwards.
data['incomegroup'] = (
    data['incomegroup']
    .astype(object)
    .replace(map_column)
    .astype('category')
)
print(data.value_counts('incomegroup'))

incomegroup
Upper    2014
Low      1292
Name: count, dtype: int64


In [26]:
# Count the rows per income bucket and expose the counts as a plain dict
# keyed by bucket label.
g = data.loc[:, 'incomegroup'].value_counts()
my_dict = g.to_dict()
print(my_dict)

{'Upper': 2014, 'Low': 1292}


In [28]:
# Desired category order, lowest bucket first.
order_columns = ['Low', 'Upper']

# Turn the income bucket into an ordered categorical using that order, then
# print the per-bucket counts.
data['incomegroup'] = (
    data['incomegroup']
    .cat
    .reorder_categories(new_categories=order_columns, ordered=True)
)
print(data['incomegroup'].value_counts())

incomegroup
Upper    2014
Low      1292
Name: count, dtype: int64
