## Data Acquisition

### Webscraping World Happiness

In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re

In [2]:
wrld_happiness_url = 'https://en.wikipedia.org/wiki/World_Happiness_Report'

response = requests.get(wrld_happiness_url)

In [3]:
response.status_code

200

In [4]:
page = response.text

In [5]:
soup = BeautifulSoup(page,"lxml")

In [6]:
tables = soup.find_all('table')

In [7]:
print(tables[0].prettify())

<table class="wikitable sortable">
 <tr valign="top">
  <th style="width: 10px;">
   Overall Rank
  </th>
  <th style="width: 10px;">
   Change in rank
  </th>
  <th style="width: 250px;">
   Country
  </th>
  <th>
   <abbr title="Happiness score">
    Score
   </abbr>
  </th>
  <th style="width: 10px;">
   Change in score
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: GDP">
    GDP per capita
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Social support">
    Social support
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Healthy life expectancy">
    Healthy life expectancy
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Freedom to make life choices">
    Freedom to make life choices
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Generosity">
    Generosity
   </abbr>
  </th>
  <th style="width: 10px;">
   <abbr title="Explained by: Perceptions of co

In [8]:
#rows=[row for row in tables[1].find_all('tr')]

In [9]:
#rows=rows[1:20]

In [10]:
#countries = {}
#for row in rows:
#    items=row.find_all('td')
#    country=items[1].find('a')['href']
#    countries[country]=[i.text for i in items[2:]]

In [11]:
#countries

In [12]:
happiness_2017 = pd.read_html(str(tables[0]), header = 0, index_col = 0)[0]

In [13]:
happiness_2017 = happiness_2017[happiness_2017.Country !='World']
happiness_2017 = happiness_2017[happiness_2017.Country != 'Europe']

In [14]:
happiness_2017.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156 entries, 1 to 155
Data columns (total 11 columns):
Change in rank                  153 non-null object
Country                         156 non-null object
Score                           156 non-null object
Change in score                 152 non-null float64
GDP per capita                  155 non-null float64
Social support                  155 non-null float64
Healthy life expectancy         155 non-null float64
Freedom to make life choices    155 non-null float64
Generosity                      155 non-null float64
Trust                           155 non-null float64
Residual                        155 non-null float64
dtypes: float64(8), object(3)
memory usage: 14.6+ KB


### Webscraping Country Fact URLs

Set URL and capture response. Check status of response.

In [15]:
wrld_fact_home_url = 'https://www.cia.gov/library/publications/resources/the-world-factbook/'
response = requests.get(wrld_fact_home_url)
response.status_code

200

Grab html and make a BeautifulSoup object with it.

In [16]:
page = response.text
soup = BeautifulSoup(page,'lxml')
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <!--<![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="css/publications.css" rel="stylesheet" type="text/css"/>
  <link href="css/publications-detail.css" rel="stylesheet" type="text/css"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title>
   The World Factbook â Central Intelligence Agency
  </title>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="Apr 01, 2016" name="LastModified"/>
  <link href="css/jobcart.css" rel="stylesheet"/>
  <link href="css/smallscreen.css" rel="

Find html containing Country Name to url mappings and put them in a list.
This is in a dropdown menu on the page, with value equal to the relative url, and text equal to the Country Name.

In [17]:
cntry_select = soup.find(id='cntrySelect').find('select').find_all('option')
cntry_select

[<option value="">Please select a country to view</option>,
 <option value="geos/xx.html"> World </option>,
 <option value="geos/af.html"> Afghanistan </option>,
 <option value="geos/ax.html"> Akrotiri </option>,
 <option value="geos/al.html"> Albania </option>,
 <option value="geos/ag.html"> Algeria </option>,
 <option value="geos/aq.html"> American Samoa </option>,
 <option value="geos/an.html"> Andorra </option>,
 <option value="geos/ao.html"> Angola </option>,
 <option value="geos/av.html"> Anguilla </option>,
 <option value="geos/ay.html"> Antarctica </option>,
 <option value="geos/ac.html"> Antigua and Barbuda </option>,
 <option value="geos/xq.html"> Arctic Ocean </option>,
 <option value="geos/ar.html"> Argentina </option>,
 <option value="geos/am.html"> Armenia </option>,
 <option value="geos/aa.html"> Aruba </option>,
 <option value="geos/at.html"> Ashmore and Cartier Islands </option>,
 <option value="geos/zh.html"> Atlantic Ocean </option>,
 <option value="geos/as.html"> Au

Put these mappings into a dictionary, with the Country name as the key and the relative url as the value.

In [18]:
country_url={}
for option in cntry_select:
    country_url[option.text.strip()] = [option['value']]


We only want to include countries that are in the happiness_2017 dataframe.

In [19]:
country_url2 = pd.DataFrame.from_dict(country_url, orient='index')
country_url2 = country_url2[2:] # get rid of first two rows because they aren't countries
country_url2.index.name = 'Country'
country_url2.columns = ['url']
country_url2 = country_url2.reset_index()
country_url2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 2 columns):
Country    267 non-null object
url        267 non-null object
dtypes: object(2)
memory usage: 4.2+ KB


In [20]:
countries_merged = pd.merge(country_url2, happiness_2017, on='Country',how='outer')

We have some extra data!  

These observations can be split into three cases:

Case | Action | Description
-----| -------- |------------
1. | Keep | Observation has country name, url and features
2. | Need urls | Observation has country name, features but no url (See dataframe `need_urls`)
3. |Merge with another observation | Observation has country name, but no features or url, HOWEVER (upon inspection) the country name can be matched with one of the observations in `need_urls`. Merge the two.
4. | Delete | Observation does not have url or features, and cannot be matched with an observation in `need_urls`

#### Case 1: 
Put all observations that are in the first case in `countries`

In [21]:
countries_full = countries_merged[countries_merged['url'].notnull() & countries_merged['Change in score'].notnull()]
happiness_2017.shape, countries_full.shape

((156, 11), (144, 12))

So: 144 countries are in Case 1.

12 countries are split between Cases 2-4.

#### Case 2:
Find countries where an observation has country name and features, but no urls and assign these to `need_urls`.

In [22]:
need_urls = countries_merged[countries_merged['url'].isnull()]
need_urls

Unnamed: 0,Country,url,Change in rank,Score,Change in score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Trust,Residual
267,Czech Republic,,4,6.609,0.013,1.353,1.434,0.754,0.491,0.088,0.037,2.452
268,Europe[Note 1],,–,6.08,,,,,,,,
269,South Korea,,2,5.838,0.003,1.402,1.128,0.9,0.258,0.207,0.063,1.88
270,North Cyprus,,1,5.81,0.039,1.347,1.186,0.835,0.471,0.267,0.155,1.549
271,Palestinian Territories,,5,4.775,0.021,0.716,1.156,0.566,0.255,0.114,0.089,1.879
272,Myanmar,,5,4.545,0.15,0.367,1.123,0.398,0.514,0.838,0.189,1.115
273,Congo (Brazzaville),,1,4.291,0.019,0.809,0.832,0.29,0.435,0.121,0.08,1.724
274,Congo (Kinshasa),,1,4.28,0.044,0.092,1.229,0.191,0.236,0.246,0.06,2.225
275,Ivory Coast,,11,4.18,0.264,0.603,0.905,0.049,0.448,0.201,0.13,1.845


#### Case 3:
Find countries that don't have any values for features, and see if they're in `need_urls` under some other Country name.

In [23]:
no_feature_data = countries_merged[countries_merged['Change in score'].isnull()]

Upon inspection,  most of the Countries in `need_urls` match up with some in `no_feature_data`!

Country name in `need_urls` | Index in `need_urls`| Country name in `no_feature_data` | Index in `no_feature_data`
---------------|----------------
Czech Republic | 267 | Czechia | 66
South Korea    | 269 |  Korea, South | 131
Congo (Kinshasa) | 274 | Congo, Democratic Republic of the | 56
Congo (Brazzaville) | 273 | Congo, Repulic of the | 57
Myanmar | 272 | Burma | 40
Ivory Coast | 275 | Cote d'Ivoire | 61
Palestinian Territories | 271 | Gaza Strip, West Bank | 88, 261

* North Cyprus is not in `no_feature_data` because it is not recognized by cia.gov as a separate country.  (It became independent of Cyprus in 1960.]  Reading cia.gov's entry for Cyprus, 'individual Turkish Cypriots [residents in North Cyprus] able to document their eligibility for Republic of Cyprus citizenship legally enjoy the same rights accorded to other citizens [in Cyprus].' So we'll try to combine the data for North Cyprus and Cyprus where possible.


Next step: Merge the valid records from `need_urls` and `no_feature_data` and place them in `countries_full`.

In [24]:
need_urls_indexes = [267, 269, 274, 273, 272, 275]
no_feature_data_indexes = [66, 131, 56, 57, 40, 61]

for i in range(len(need_urls_indexes)):
    need_urls['url'][need_urls_indexes[i]]=no_feature_data['url'][no_feature_data_indexes[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Add those rows to `countries_full`.

In [25]:
countries_full = countries_full.append(need_urls.loc[need_urls_indexes])

#### Case 4: 
Reserve North Cyprus and Palestinian Territories to be treated separately later

In [26]:
special_case = need_urls.loc[[270,271]]
special_case

Unnamed: 0,Country,url,Change in rank,Score,Change in score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Trust,Residual
270,North Cyprus,,1,5.81,0.039,1.347,1.186,0.835,0.471,0.267,0.155,1.549
271,Palestinian Territories,,5,4.775,0.021,0.716,1.156,0.566,0.255,0.114,0.089,1.879


**So** `countries_full` contains only those that are in `happiness_2017` along with urls to get some more data!

Next step, go to each url in `countries_full` and gather more features.

### Webscraping Country Facts

#### Build functions to scrape features for each country

In [27]:
def url_to_soup(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,'lxml')
    return soup 

In [28]:
def get_population(soup):
    #Scrape country's total population
    #Returns int
    
    rgx = re.compile(r'\.\.\/fields\/2119\.html#..')
    
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').strip())
    else:
        return None

In [29]:
def convert(string):
    try:
        return float(string)
    except:
        return string

In [30]:
get_population(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

34124811.0

In [31]:
def get_land_area(soup):
    #Scrape country's total land area
    #Returns int
    
    rgx = re.compile(r'\.\.\/fields\/2147\.html#..')    
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().findNextSibling().text.split('(')[0].replace(',','').split()[1])
    else:
        return None

In [32]:
get_land_area(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

652230.0

In [33]:
def get_support_ratio(soup):
    # Scrape country's support ratio.  (AKA Dependency Ratio) 
    # Definition: the number of working-age people (15-64) per one elderly person (65+)
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2261\.html#..') 
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().findNextSibling().findNextSibling().findNextSibling().text.split('(')[0].replace(',','').split()[3])
    else:
        return None

In [34]:
get_support_ratio(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

21.2

In [35]:
def get_urbanization(soup):
    # Scrape country's urban population.
    # Definition: the percentage of the total population living in urban areas, as defined by the country
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2212\.html#..')
    
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[2].replace('%',''))/100
    else:
        return None

In [36]:
get_urbanization(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

0.276

In [37]:
def get_median_mothers_age(soup):
    # Scrape country's median age of mothers at first birth
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2256\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0])
    else:
        return None

In [38]:
get_median_mothers_age(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

19.9

In [39]:
def get_median_age(soup):
    # Scrape country's median age
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2177\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[1])
    else:
        return None

In [40]:
get_median_age(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

18.8

In [41]:
def get_life_expectancy_at_birth(soup):
    # Scrape country's current life expectancy given to newborns
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2102\.html#..')
    if soup.find(href = rgx):
         return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[2])
    else:
        return None

In [42]:
get_life_expectancy_at_birth(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

51.7

In [43]:
def get_fertility_rate(soup):
    # Scrape country's average number of children born per woman
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2127\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0])
    else:
        return None

In [44]:
get_fertility_rate(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

5.12

In [45]:
def get_hospital_bed_density(soup):
    # Scrape country's hospital bed density
    # Equal to number of beds per 1000 people
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2227\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0])
    else:
        return None

In [46]:
get_hospital_bed_density(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

0.5

In [47]:
def get_access_to_clean_water(soup):
    # Scrape country's percentage of population with access to 'improved' water source
    # Definition : Improved drinking water - use of any of the following sources: piped water into dwelling, yard, or plot; public tap or standpipe; tubewell or borehole; protected dug well; protected spring; or rainwater collection. Unimproved drinking water - use of any of the following sources: unprotected dug well; unprotected spring; cart with small tank or drum; tanker truck; surface water, which includes rivers, dams, lakes, ponds, streams, canals or irrigation channels; or bottled water.
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2216\.html#..')
    if soup.find(href = rgx):
        #return convert(soup.find(href = rgx).parent.find(text = 'total').text.split('(')[0].replace(',','').split()[1].replace('%',''))/100
        #return soup.find(href = rgx).parent
        total = 'total'
        
        tries = 0
        
        query = soup.find(href = rgx).parent.findNextSibling()
        while tries < 5 and total not in query.text:
            tries += 1
            query = query.findNextSibling()
        return convert(query.text.split()[1].replace('%',''))/100
    else:
        return None

In [48]:
get_access_to_clean_water(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/ag.html'))

0.836

In [49]:
def get_obesity_rate(soup):
    # Scrape country's percentage of adult population that is considered to be obese (BMI >= 30)
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2228\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0].replace('%',''))/100
    else:
        return None

In [50]:
get_obesity_rate(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

0.055

In [51]:
def get_school_life_expectancy(soup):
    # Scrape country's average number of years for students to attend school (between primary and tertiary)
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2205\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[1])
    else:
        return None

In [52]:
get_school_life_expectancy(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

11.0

In [53]:
def get_gdp(soup):
    # Scrape country's GDP
    # A nation's GDP at purchasing power parity (PPP) exchange rates is the sum value of all goods and services produced in the country valued at prices prevailing in the United States in the year noted.
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2001\.html#..')
    
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0].replace('$',''))*10**9
    else:
        return None

In [54]:
get_gdp(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

69510000000.0

In [55]:
def get_gdp_per_capita(soup):
    # Scrape a country's GDP per capita
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2004\.html#..')
    if soup.find(href = rgx):
        return convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0].replace('$',''))
    else:
        return None

In [56]:
get_gdp_per_capita(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

1900.0

In [57]:
def get_unemployment_rate(soup):
    # Scrape country's unemployment rate
    # Returns float
    
    rgx = re.compile(r'\.\.\/fields\/2129\.html#..')
    if soup.find(href = rgx):
        ur = convert(soup.find(href = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0].replace('%',''))
        if isinstance(ur, str) == False:
            return ur/100
        else: return ur
    else:
        return None

In [58]:
get_unemployment_rate(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/ao.html'))

'NA'

In [59]:
def get_electrification_rate(soup):
    # Scrape percentage of country's population with electricity
    # Returns float
    
    rgx = re.compile(r'electrification - total population:')
    if soup.find(text = rgx):
        return convert(soup.find(text = rgx).parent.findNextSibling().text.split('(')[0].replace(',','').split()[0].replace('%',''))/100
    else:
        return None

In [60]:
get_electrification_rate(url_to_soup('https://www.cia.gov/library/publications/the-world-factbook/geos/af.html'))

0.43

In [61]:
def get_country_features(soup):
    features = {}
    features['Population'] = get_population(soup)
    features['Land_Area'] = get_land_area(soup)
    features['Support_Ratio'] = get_support_ratio(soup)
    features['Urbanization'] = get_urbanization(soup)
    features['Median_Mothers_Age'] = get_median_mothers_age(soup)
    features['Get_Median_Age'] = get_median_age(soup)
    features['Life_Expectancy'] = get_life_expectancy_at_birth(soup)
    features['Fertility_Rate'] = get_fertility_rate(soup)
    features['Hospital_Bed_Density'] = get_hospital_bed_density(soup)
    features['Access_To_Clean_Water'] = get_access_to_clean_water(soup)
    features['Obesity_Rate'] = get_obesity_rate(soup)
    features['School_Life_Expectancy'] = get_school_life_expectancy(soup)
    features['GDP'] = get_gdp(soup)
    features['GDP_per_capita'] = get_gdp_per_capita(soup)
    features['Unemployment_Rate'] = get_unemployment_rate(soup)
    features['Electrification_Rate'] = get_electrification_rate(soup)
    return features

In [62]:
world_factbook_url = 'https://www.cia.gov/library/publications/the-world-factbook/'

In [63]:
countries_full['Population'] = np.nan
countries_full['Land_Area'] = np.nan
countries_full['Support_Ratio'] = np.nan
countries_full['Urbanization'] = np.nan
countries_full['Median_Mothers_Age'] = np.nan
countries_full['Get_Median_Age'] = np.nan
countries_full['Life_Expectancy'] = np.nan
countries_full['Fertility_Rate'] = np.nan
countries_full['Hospital_Bed_Density'] = np.nan
countries_full['Access_To_Clean_Water'] = np.nan
countries_full['Obesity_Rate'] = np.nan
countries_full['School_Life_Expectancy'] = np.nan
countries_full['GDP'] = np.nan
countries_full['GDP_per_capita'] = np.nan
countries_full['Unemployment_Rate'] = np.nan
countries_full['Electrification_Rate'] = np.nan

#### Add additional features to `countries_full`

In [64]:
for index, row in countries_full.iterrows():
    url = world_factbook_url + row['url']
    soup = url_to_soup(url)
    features = get_country_features(soup)
    for feature in features:
        countries_full[feature][index] = features[feature]
    print(index, 'done')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0 done
2 done
3 done


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


6 done
11 done
12 done
16 done
17 done
18 done
20 done
22 done
24 done
25 done
26 done
27 done
29 done
30 done
31 done
32 done
34 done
38 done
39 done
41 done
43 done
44 done
45 done
48 done
49 done
50 done
54 done
60 done
62 done
65 done
67 done
71 done
72 done
73 done
74 done
77 done
78 done
82 done
83 done
86 done
89 done
90 done
91 done
93 done
97 done
99 done
102 done
105 done
106 done
108 done
109 done
110 done
112 done
113 done
114 done
115 done
117 done
118 done
119 done
121 done
125 done
126 done
127 done
132 done
133 done
134 done
136 done
137 done
139 done
140 done
142 done
143 done
145 done
146 done
147 done
148 done
150 done
151 done
153 done
154 done
155 done
158 done
160 done
161 done
163 done
165 done
168 done
169 done
171 done
172 done
173 done
174 done
178 done
181 done
184 done
187 done
188 done
189 done
191 done
192 done
194 done
195 done
196 done
197 done
208 done
209 done
210 done
212 done
213 done
215 done
216 done
218 done
219 done
222 done
223 done
225 done
226

In [65]:
countries_full.reset_index(drop=True)

Unnamed: 0,Country,url,Change in rank,Score,Change in score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,...,Life_Expectancy,Fertility_Rate,Hospital_Bed_Density,Access_To_Clean_Water,Obesity_Rate,School_Life_Expectancy,GDP,GDP_per_capita,Unemployment_Rate,Electrification_Rate
0,Afghanistan,geos/af.html,13,3.794,0.434,0.401,0.582,0.181,0.106,0.312,...,51.7,5.12,0.50,0.553,0.055,11.0,6.951000e+10,1900,0.35,0.430
1,Albania,geos/al.html,0,4.644,-0.011,0.996,0.804,0.731,0.381,0.201,...,78.5,1.51,2.60,0.836,0.217,16.0,3.587000e+10,12500,0.14,1.000
2,Algeria,geos/ag.html,-15,5.872,-0.483,1.092,1.146,0.618,0.233,0.069,...,77.0,2.70,,0.836,0.274,14.0,6.293000e+11,15100,0.117,0.990
3,Angola,geos/ao.html,1,3.795,-0.071,0.858,1.104,0.050,0.000,0.098,...,60.2,6.16,,0.490,0.082,10.0,1.920000e+11,6800,,0.300
4,Argentina,geos/ar.html,2,6.599,-0.051,1.185,1.440,0.695,0.495,0.109,...,77.3,2.26,4.70,0.991,0.283,17.0,9.115000e+11,20700,0.081,0.964
5,Armenia,geos/am.html,0,4.376,0.016,0.901,1.007,0.638,0.198,0.083,...,74.9,1.64,3.90,1.000,0.202,13.0,2.721000e+10,9100,0.189,1.000
6,Australia,geos/as.html,0,7.284,-0.029,1.484,1.510,0.844,0.602,0.478,...,82.3,1.77,3.90,1.000,0.290,20.0,1.235000e+09,49900,0.056,1.000
7,Austria,geos/au.html,-1,7.006,-0.113,1.487,1.460,0.815,0.568,0.316,...,81.6,1.47,7.60,1.000,0.201,16.0,4.341000e+11,49200,0.054,1.000
8,Azerbaijan,geos/aj.html,-4,5.234,-0.057,1.154,1.152,0.541,0.398,0.045,...,72.8,1.89,4.70,0.870,0.199,13.0,1.668000e+11,17400,0.06,1.000
9,Bahrain,geos/ba.html,1,6.087,-0.131,1.488,1.323,0.653,0.537,0.173,...,79.0,1.75,2.10,1.000,0.298,,6.977000e+10,51800,0.038,0.980


In [66]:
countries_full.describe()

Unnamed: 0,Change in score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Trust,Residual,Support_Ratio,Urbanization,Median_Mothers_Age,Get_Median_Age,Life_Expectancy,Fertility_Rate,Hospital_Bed_Density,Access_To_Clean_Water,Obesity_Rate,School_Life_Expectancy,GDP,Electrification_Rate
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,149.0,148.0,114.0,150.0,147.0,148.0,137.0,147.0,147.0,131.0,150.0,149.0
mean,0.006373,0.998753,1.199147,0.55954,0.409927,0.247507,0.123207,1.850013,11.999329,0.604615,24.223684,30.388,72.37619,2.598243,3.089781,0.883374,0.18451,13.572519,163302300000.0,0.794074
std,0.178811,0.411978,0.273863,0.228461,0.151343,0.136247,0.103005,0.504602,10.491306,0.225135,3.906661,9.116243,8.26009,1.330635,2.546369,0.152262,0.090407,3.13057,220497600000.0,0.317087
min,-0.834,0.023,0.396,0.006,0.0,0.0,0.0,0.378,2.3,0.083,17.9,15.4,50.6,0.83,0.1,0.317,0.021,5.0,1056000000.0,0.01
25%,-0.07625,0.7275,1.04925,0.399,0.3075,0.1535,0.057,1.6015,4.8,0.42925,20.575,22.55,66.65,1.635,1.1,0.821,0.0875,11.0,19927500000.0,0.66
50%,0.0,1.07,1.259,0.615,0.4385,0.232,0.089,1.83,10.2,0.6095,23.4,29.3,74.9,2.07,2.3,0.956,0.205,14.0,64300000000.0,0.99
75%,0.08775,1.3195,1.419,0.725,0.51975,0.32575,0.1525,2.148,16.8,0.7905,27.875,39.275,78.4,3.0625,4.6,0.9975,0.248,16.0,215225000000.0,1.0
max,0.497,1.871,1.611,0.949,0.658,0.838,0.464,3.117,83.4,1.0,31.0,47.3,85.3,6.49,13.7,1.0,0.379,20.0,926100000000.0,1.0


In [75]:
countries = countries_full.drop(["Change in score",'Change in rank','url', 'GDP per capita','Residual'], axis = 1)

ValueError: labels ['Change in score' 'GDP per capita' 'Residual'] not contained in axis