In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Image, display

# Make the graphs a bit prettier: use matplotlib's "ggplot" style sheet for all later figures
plt.style.use('ggplot')

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load the autoreload extension; mode 2 reloads all imported modules before each cell runs
%load_ext autoreload
%autoreload 2

In [None]:
# Data: one happiness table per year, read in a single pass over the file paths
df2015, df2016, df2017, df2018, df2019 = map(
    pd.read_csv,
    (
        "data/happiness/2015.csv",
        "data/happiness/2016.csv",
        "data/happiness/2017.csv",
        "data/happiness/2018.csv",
        "data/happiness/2019.csv",
    ),
)

# Slicing and extracting data

In [None]:
# Load the section's header picture from disk, then render it in the output
slicing_banner = Image(filename='data/images/cute_panda_slicing.jpg')
display(slicing_banner)

## Isolating one column

In [None]:
# Bracket access by column name returns that single column as a Series
df2019["Score"]

In [None]:
# Attribute access to the same column; only possible when the name is a valid Python identifier
df2019.Score

In [None]:
# Turn the column's values into a plain Python list
list(df2019.Score)

In [None]:
# This column name contains spaces, so it has to be accessed with brackets
list(df2019["Country or region"])

## Isolating multiple columns

In [None]:
# A list of names inside the brackets selects several columns and returns a DataFrame
df2019[["Score", "Social support"]]

## Isolating one row

In [None]:
# The row labels (index) of the frame
df2019.index

In [None]:
# Element-wise comparison: a boolean array that is True where the row label is 0
df2019.index == 0

In [None]:
# Boolean indexing keeps only the rows where the comparison is True
df2019[df2019.index == 0]

## Isolating multiple rows

In [None]:
# isin() is True for every row whose label appears in the given collection (here 0 and 1)
df2019[df2019.index.isin((0, 1))]

## Isolating rows using loc ("location") and iloc ("integer location")

In [None]:
# loc selects by label (row/column names); iloc selects by integer position (0-based).
# Look at the current row labels before changing them.
df2019.index

In [None]:
# Relabel the rows 1..N so that labels (loc) and positions (iloc) differ in the
# examples that follow. N comes from the frame itself rather than a hard-coded
# 157, so the cell does not raise a length-mismatch error if the CSV row count changes.
df2019.index = range(1, len(df2019) + 1)

In [None]:
# Check the row labels again after the reassignment
df2019.index

In [None]:
# Label-based: the row whose index label is 1
df2019.loc[1]

In [None]:
# Position-based: the row at integer position 1, i.e. the second row
df2019.iloc[1]

In [None]:
# Label slice: both end labels are included (rows labelled 1 through 10)
df2019.loc[1:10]

In [None]:
# Position slice: the end is excluded (positions 1 through 9)
df2019.iloc[1:10]

### Selecting specific columns

Basic syntax: `df.loc[row_label, column_label]`

In [None]:
# Rows labelled 1-10, one column label: the result is a Series
df2019.loc[1:10, "Score"]

In [None]:
# Rows labelled 1-10, a list of column labels: the result is a DataFrame
df2019.loc[1:10, ["Country or region", "Score"]]

In [None]:
# Purely positional: row positions 1-9 and column positions 1-2 (slice ends are excluded)
df2019.iloc[1:10, 1:3]

## Conditional slicing

In [None]:
# Boolean array that is True only where the row label equals 10
df2019.index == 10

In [None]:
# Keep the boolean array in a variable so it can be reused as a row filter
mask = df2019.index == 10

In [None]:
# Keep only the rows where mask is True
df2019[mask]

In [None]:
# Name the condition first, then use it as the row filter
is_germany = df2019["Country or region"] == "Germany"
df2019[is_germany]

In [None]:
# Name the condition first, then use it as the row filter
rank_is_100 = df2019["Overall rank"] == 100
df2019[rank_is_100]

In [None]:
# Names of the countries whose "Healthy life expectancy" value is above 0.9.
# A single .loc[rows, column] call selects both at once instead of chaining
# two [] lookups (chained indexing), which is the form pandas recommends.
df2019.loc[df2019["Healthy life expectancy"] > 0.9, "Country or region"]

### More than one condition

In [None]:
# Combine conditions with & (element-wise AND). Naming each condition avoids
# the easy-to-forget parentheses and keeps the line readable.
high_life_expectancy = df2019["Healthy life expectancy"] > 0.9
low_social_support = df2019["Social support"] < 1.2

# One .loc[rows, column] call instead of chained [] lookups
df2019.loc[high_life_expectancy & low_social_support, "Country or region"]

### Using the sum() method

In [None]:
# True counts as 1 when summed, so this is the number of rows with a value above 0.9
df2019["Healthy life expectancy"].gt(0.9).sum()

# Cleaning data

In [None]:
# Load the section's header picture from disk, then render it in the output
cleaning_banner = Image(filename='data/images/cute_pandas_cleaning.jpg')
display(cleaning_banner)

[Pandas cleaning video](https://www.youtube.com/watch?v=R2uiy27Xv_U)

In [None]:
# `df2019_withNulls` is not created anywhere earlier in the notebook, so this
# cell raised NameError on a fresh kernel (Restart & Run All). Build a demo
# copy of df2019 here and blank out every 10th "Score" so the cleaning
# examples have missing values to work with.
# NOTE(review): if the frame originally came from a file with real gaps,
# load that file here instead.
df2019_withNulls = df2019.copy()
df2019_withNulls.loc[df2019_withNulls.index[::10], "Score"] = np.nan

# Number of missing values in each column
df2019_withNulls.isnull().sum()

## Dropping missing values

In [None]:
# (rows, columns) before anything is dropped
df2019_withNulls.shape

In [None]:
# Dropping all rows that have a null value.
# Store the cleaned frame itself under this name (previously it held only the
# shape tuple and the cell displayed nothing), then show its shape so it can
# be compared with the shape before dropping.
df2019_withoutNulls1 = df2019_withNulls.dropna()
df2019_withoutNulls1.shape

In [None]:
# Dropping all columns that have a null value (axis=1 means "columns").
# Store the cleaned frame itself under this name (previously it held only the
# shape tuple and the cell displayed nothing), then show its shape so it can
# be compared with the shape before dropping.
df2019_withoutNulls2 = df2019_withNulls.dropna(axis=1)
df2019_withoutNulls2.shape

## Replacing missing values

In [None]:
# Mean of the "Score" column (pandas skips missing values by default when averaging)
meanScore = df2019_withNulls["Score"].mean()

In [None]:
# Display the computed mean
meanScore

In [None]:
# Using fillna() method.
# Fill only the "Score" column: calling fillna(meanScore) on the whole frame
# would also write the Score mean into the gaps of every other column.
# This returns a new Series; df2019_withNulls itself is not modified.
df2019_withNulls["Score"].fillna(meanScore)

In [None]:
# Alternatively with loc (this modifies df2019_withNulls in place).
# Select the rows where Score is missing AND the "Score" column. Without the
# column label the assignment overwrites every column of those rows
# (country name, rank, ...) with meanScore.
score_is_missing = df2019_withNulls["Score"].isna()
df2019_withNulls.loc[score_is_missing, "Score"] = meanScore

In [None]:
# Inspect the Score column after the replacement
df2019_withNulls["Score"]

## Dealing with duplicate data

In [None]:
# Stack the frame on top of itself so that every row appears twice
df2019Dupl = pd.concat([df2019] * 2)

In [None]:
# (rows, columns) of the concatenated frame
df2019Dupl.shape

In [None]:
# Count how many rows carry the country name "Germany"
df2019Dupl["Country or region"].eq("Germany").sum()

In [None]:
# Remove rows that repeat an earlier row in every column, then check the size
deduplicated = df2019Dupl.drop_duplicates()
deduplicated.shape

## Renaming columns

In [None]:
# Read the 2015 file again (it is also loaded at the top of the notebook) —
# presumably so this section can be re-run starting from the original column names
df2015 = pd.read_csv("data/happiness/2015.csv")

### Renaming individual columns

In [None]:
# Column names as they come from the CSV file
df2015.columns

#### What is the Dystopia Residual?

So in that report, this year the awesomest country is Finland - and then, as a baseline (and maybe to avoid dumping on Afghanistan all the time) they make up a hypothetical worst-country-in-the-world, which has the Central African Republic's life expectancy, Somalia's corruption, and so on. They call that country Dystopia.
If you draw a line connecting those two countries, that line gives you a rough idea how fast happiness increases as you improve the various criteria.

If you plug your country's life expectancy, corruption, etc into the line equation, it'll predict what your happiness rating should be. BUT...most countries don't fit the model perfectly. They're usually a little higher or lower than the prediction. The difference between the prediction and the actual value is called the residual. A positive residual means you're, for some reason, happier than the model would predict, and a negative residual means you're less happy for some reason. And a zero residual means you're exactly where the model predicted.

In the data spreadsheet, instead of giving raw residuals, they give "residual + Dystopia", and I'm not sure why they do that. Dystopia's happiness rating is 1.97, so if you take all those values and subtract 1.97 you'll see the actual residuals. Singapore and Hong Kong, for example, have conspicuously low ones; something's going on in Hong Kong that their model doesn't normally look at, which is making the citizens less happy. Finland's residual is substantial and positive, so the model says they should be quite happy, and in fact they're even happier than that.

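Quoted from [this r/explainlikeimfive answer](https://www.reddit.com/r/explainlikeimfive/comments/jdai5y/eli5_what_does_dystopian_residual_mean/). The specific figures it mentions (for example 1.97) refer to the report edition discussed there, not necessarily to the 2015 file loaded above.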

In [None]:
# Map old column name -> new column name; columns not listed keep their name
shorter_names = {"Happiness Rank": "Rank", "Happiness Score": "Score"}
df2015 = df2015.rename(columns=shorter_names)

In [None]:
# Check the column names after the rename
df2015.columns

In [None]:
# Attribute access to the "Region" column
df2015.Region

### Renaming all columns

In [None]:
# Placeholder names, one per column, used to demonstrate replacing all names at once
newColumnNames = [
    "These",
    "are",
    "just",
    "some",
    "really",
    "random",
    "column",
    "names",
    "that",
    "dont",
    "mean",
    "anything",
]

In [None]:
# Assigning a list to .columns renames every column at once, by position;
# the list must contain exactly one name per existing column
df2015.columns = newColumnNames

In [None]:
# Display the frame with its new column names
df2015

### Replacing values

In [None]:
# Distinct values in the column, in order of first appearance
df2019["Country or region"].unique()

In [None]:
# Old value -> new value, applied only inside the "Country or region" column
country_nicknames = {"Germany": "Schland", "United States": "Murica", "Australia": "Oz"}
df2019 = df2019.replace({"Country or region": country_nicknames})

In [None]:
# Sorted list of the distinct names, to check that the replacements show up
sorted(df2019["Country or region"].unique())