In [1]:
%matplotlib inline
import pandas as pd

In [2]:
#imports for Python and Pandas

import json
from pandas.io.json import json_normalize

In [3]:
# Read the World Bank projects file straight into a DataFrame with read_json.
# Nested fields (for example mjtheme_namecode) are left nested at this stage.
df = pd.read_json(path_or_buf='world_bank_projects.json')

# Question 1 :
+ Find the 10 countries with most projects

In [4]:
# Count projects per countryshortname value and show the ten largest counts.
# value_counts() already orders from most to least frequent, so head(10) is the top ten.

df['countryshortname'].value_counts().head(10)

Indonesia             19
China                 19
Vietnam               17
India                 16
Yemen, Republic of    13
Nepal                 12
Bangladesh            12
Morocco               12
Africa                11
Mozambique            11
Name: countryshortname, dtype: int64

In [5]:
# "Africa" is a region rather than a country, so build a second DataFrame
# that keeps only the rows whose countryshortname is something else.

df_countries = df.loc[df['countryshortname'].ne("Africa")]

## Answer 1

In [6]:
# Final ranking for Question 1, computed on the frame without the "Africa" rows.
print('Top 10 Countries with Most Projects')
df_countries['countryshortname'].value_counts().head(10)

Top 10 Countries with Most Projects


Indonesia             19
China                 19
Vietnam               17
India                 16
Yemen, Republic of    13
Nepal                 12
Bangladesh            12
Morocco               12
Mozambique            11
Pakistan               9
Name: countryshortname, dtype: int64

# Question 2: 
+ Find the top 10 major project themes

In [7]:
# Peek at mjtheme_namecode: each cell holds a list of {'code', 'name'} dicts,
# so the theme information is nested and has to be flattened before counting.
df['mjtheme_namecode'].head()

0    [{'code': '8', 'name': 'Human development'}, {...
1    [{'code': '1', 'name': 'Economic management'},...
2    [{'code': '5', 'name': 'Trade and integration'...
3    [{'code': '7', 'name': 'Social dev/gender/incl...
4    [{'code': '5', 'name': 'Trade and integration'...
Name: mjtheme_namecode, dtype: object

In [8]:
# Load the raw JSON records. The `with` block closes the file handle as soon
# as parsing is done (the bare open() call used before never closed it), and
# the explicit UTF-8 encoding keeps the result independent of the platform's
# default locale encoding.
with open('world_bank_projects.json', encoding='utf-8') as projects_file:
    data = json.load(projects_file)

# Flatten the nested theme lists: one row per entry in a project's
# mjtheme_namecode list, with the project's countryshortname and id carried
# along as extra columns.
# NOTE(review): json_normalize is imported from pandas.io.json at the top of
# the notebook; newer pandas releases expose it as pd.json_normalize instead --
# confirm against the installed pandas version.
df_themes = json_normalize(data, 'mjtheme_namecode', ['countryshortname', 'id'])
df_themes.head(10)

Unnamed: 0,code,name,countryshortname,id
0,8,Human development,Ethiopia,P129828
1,11,,Ethiopia,P129828
2,1,Economic management,Tunisia,P144674
3,6,Social protection and risk management,Tunisia,P144674
4,5,Trade and integration,Tuvalu,P145310
5,2,Public sector governance,Tuvalu,P145310
6,11,Environment and natural resources management,Tuvalu,P145310
7,6,Social protection and risk management,Tuvalu,P145310
8,7,Social dev/gender/inclusion,"Yemen, Republic of",P144665
9,7,Social dev/gender/inclusion,"Yemen, Republic of",P144665


Here we can see that some project IDs have multiple entries for the same code/name pair (for example, rows 8 and 9 above).

In [9]:
# A project can list the same theme code more than once. Keep only the first
# row of every (id, code) pair so that each theme is counted once per project.
df_themes = df_themes.drop_duplicates(subset=['id', 'code'], keep='first')
df_themes.head(10)

Unnamed: 0,code,name,countryshortname,id
0,8,Human development,Ethiopia,P129828
1,11,,Ethiopia,P129828
2,1,Economic management,Tunisia,P144674
3,6,Social protection and risk management,Tunisia,P144674
4,5,Trade and integration,Tuvalu,P145310
5,2,Public sector governance,Tuvalu,P145310
6,11,Environment and natural resources management,Tuvalu,P145310
7,6,Social protection and risk management,Tuvalu,P145310
8,7,Social dev/gender/inclusion,"Yemen, Republic of",P144665
10,5,Trade and integration,Lesotho,P144933


In [10]:
# Ten most frequent theme names; blank names are counted as a value of their own.
df_themes['name'].value_counts().head(10)

Environment and natural resources management    135
Rural development                               135
Public sector governance                        126
Human development                               118
Social protection and risk management           110
                                                107
Financial and private sector development        104
Social dev/gender/inclusion                     103
Trade and integration                            56
Urban development                                37
Name: name, dtype: int64

## Answer 2

In [11]:
print('Top 10 Major Project Themes')
# Request 11 rows: one of them is the blank (missing) name, leaving ten named themes.
df_themes['name'].value_counts().head(11)

Top 10 Major Project Themes


Environment and natural resources management    135
Rural development                               135
Public sector governance                        126
Human development                               118
Social protection and risk management           110
                                                107
Financial and private sector development        104
Social dev/gender/inclusion                     103
Trade and integration                            56
Urban development                                37
Economic management                              28
Name: name, dtype: int64

We can see that there are entries without names. This will be addressed in the next question.

# Question 3: 
+ Create a new dataframe with missing project names from answer 2 filled in

In [12]:
# Index the theme rows by 'code' and sort on that index so rows sharing a
# code sit next to each other.
code_df = df_themes.set_index('code').sort_index()
code_df.head(10)

Unnamed: 0_level_0,name,countryshortname,id
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Economic management,Ukraine,P131234
1,Economic management,Lao People's Democratic Republic,P129825
1,Economic management,Seychelles,P132425
1,Economic management,Armenia,P132948
1,Economic management,Sao Tome and Principe,P130925
1,Economic management,Uruguay,P131440
1,Economic management,Poland,P130459
1,Economic management,"Macedonia, former Yugoslav Republic of",P133791
1,Economic management,Pacific Islands,P133255
1,Economic management,"Yemen, Republic of",P143819


In [13]:
# Fill in the blank theme names from their code.
#
# A forward fill copies whatever name sits on the previous row, so a code
# group that happens to start with a blank row would inherit the name of the
# preceding code (or stay blank if it is the very first row). Instead, build
# an explicit code -> name lookup from the rows that do carry a name and apply
# it to every row. This assumes each code corresponds to a single theme name.
has_name = code_df['name'] != ''
code_to_name = code_df.loc[has_name, 'name'].groupby(level=0).first()

# reindex() looks each row's code up in the mapping; a code that never appears
# with a name stays blank rather than being given some other theme's name.
filled_names = code_to_name.reindex(code_df.index).fillna('')
code_df = code_df.assign(name=filled_names.to_numpy())

# Show a sample only; the full frame is too long to display usefully.
code_df.head(10)

Unnamed: 0_level_0,name,countryshortname,id
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Economic management,Ukraine,P131234
1,Economic management,Lao People's Democratic Republic,P129825
1,Economic management,Seychelles,P132425
1,Economic management,Armenia,P132948
1,Economic management,Sao Tome and Principe,P130925
1,Economic management,Uruguay,P131440
1,Economic management,Poland,P130459
1,Economic management,"Macedonia, former Yugoslav Republic of",P133791
1,Economic management,Pacific Islands,P133255
1,Economic management,"Yemen, Republic of",P143819


In [14]:
# Move 'code' out of the index and back into an ordinary column.
themes_df = code_df.reset_index(drop=False)

In [15]:
# Count rows per theme name, order the counts from largest to smallest,
# and keep the first ten.
top_themes = (
    themes_df
    .groupby(['name'])
    .size()
    .sort_values(ascending=False)
    .head(10)
)

## Answer 3

In [16]:
# Final ranking for Question 3: theme counts after the blank names were filled in.
print('Top 10 Major Project Themes: ')
top_themes

Top 10 Major Project Themes: 


name
Environment and natural resources management    157
Rural development                               148
Public sector governance                        140
Human development                               127
Financial and private sector development        119
Social protection and risk management           116
Social dev/gender/inclusion                     115
Trade and integration                            61
Urban development                                40
Economic management                              33
dtype: int64

We can see that, after removing duplicate entries and filling in the missing names, several of the themes have switched places compared with our answer to Question 2.