# Part 2: Data Munging

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

  from pandas.core import datetools


## Open Webscraped Data

In [12]:
# Deserialize the object stored in countries_data_scraped.pkl into `countries_data`.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles from a trusted source.
with open("countries_data_scraped.pkl", mode="rb") as scraped_fh:
    countries_data = pickle.load(scraped_fh)

In [13]:
# Preview the first rows of the loaded table as a quick sanity check.
countries_data.head()

Unnamed: 0,Country,url,Change in rank,Score,Change in score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,...,Life_Expectancy,Fertility_Rate,Hospital_Bed_Density,Access_To_Clean_Water,Obesity_Rate,School_Life_Expectancy,GDP,GDP_per_capita,Unemployment_Rate,Electrification_Rate
0,Afghanistan,geos/af.html,13,3.794,0.434,0.401,0.582,0.181,0.106,0.312,...,51.7,5.12,0.5,0.553,0.055,11.0,69510000000.0,1900.0,0.35,0.43
2,Albania,geos/al.html,0,4.644,-0.011,0.996,0.804,0.731,0.381,0.201,...,78.5,1.51,2.6,0.836,0.217,16.0,35870000000.0,12500.0,0.14,1.0
3,Algeria,geos/ag.html,-15,5.872,-0.483,1.092,1.146,0.618,0.233,0.069,...,77.0,2.7,,0.836,0.274,14.0,629300000000.0,15100.0,0.117,0.99
6,Angola,geos/ao.html,1,3.795,-0.071,0.858,1.104,0.05,0.0,0.098,...,60.2,6.16,,0.49,0.082,10.0,192000000000.0,6800.0,,0.3
11,Argentina,geos/ar.html,2,6.599,-0.051,1.185,1.44,0.695,0.495,0.109,...,77.3,2.26,4.7,0.991,0.283,17.0,911500000000.0,20700.0,0.081,0.964


## Data Munging

In [14]:
# Replace the existing row labels with a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a new column.
countries_data = countries_data.reset_index(drop=True)

In [15]:
# Print each column's non-null count and dtype to spot missing values and
# columns stored as generic objects.
countries_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 28 columns):
Country                         150 non-null object
url                             150 non-null object
Change in rank                  150 non-null object
Score                           150 non-null object
Change in score                 150 non-null float64
GDP per capita                  150 non-null float64
Social support                  150 non-null float64
Healthy life expectancy         150 non-null float64
Freedom to make life choices    150 non-null float64
Generosity                      150 non-null float64
Trust                           150 non-null float64
Residual                        150 non-null float64
Population                      149 non-null float64
Land_Area                       148 non-null float64
Support_Ratio                   149 non-null float64
Urbanization                    148 non-null float64
Median_Mothers_Age              114 non-null float6

Drop a few of the columns that we know we aren't interested in.  

We'll use data from the World Factbook in place of the World Happiness Report's `GDP per capita` and `Healthy life expectancy`.

`Change in score`, `Change in rank` and `Residual` were all calculated in the World Happiness Report and are themselves derived from `Score`, so we drop them too.




In [16]:
# Columns excluded from the working frame (see the notes above this cell).
unwanted_columns = [
    "Change in score",
    "Change in rank",
    "url",
    "GDP per capita",
    "Residual",
    "Healthy life expectancy",
]
# drop returns a new frame, so `countries_data` itself is left untouched.
countries = countries_data.drop(unwanted_columns, axis="columns")

Make sure all the entries in `Score` are numeric.

In [17]:
# Convert the whole `Score` column in a single vectorized call instead of
# invoking pd.to_numeric once per element through .apply. With the default
# errors="raise", an entry that cannot be parsed as a number still raises.
countries['Score'] = pd.to_numeric(countries['Score'])

Create `Population Density` from `Population` and `Land_Area`.

In [18]:
# Element-wise Population / Land_Area; a row missing either input yields NaN.
countries['Population Density'] = countries['Population'].div(countries['Land_Area'])

Fill each `NaN` with the mean of its column.

In [19]:
# Impute each missing value with the mean of its (numeric) column.
# numeric_only=True keeps text columns such as `Country` out of the mean:
# older pandas skipped them silently, but pandas >= 2.0 raises TypeError
# when asked to average an object column.
countries = countries.fillna(countries.mean(numeric_only=True))

In [20]:
# Serialize the cleaned `countries` frame to countries_data_munged.pkl
# ('wb' opens the file for binary writing).
with open('countries_data_munged.pkl', mode='wb') as munged_fh:
    pickle.dump(countries, munged_fh)