In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Image, display

In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Data
# One CSV per year (2015-2019), each loaded into its own DataFrame.
# Later cells address the 2016 frame with "Happiness Score" / "Region" /
# "Country" and the 2019 frame with "Score" / "GDP per capita", so do not
# assume the column names are the same across years.
df2015 = pd.read_csv("data/happiness/2015.csv")
df2016 = pd.read_csv("data/happiness/2016.csv")
df2017 = pd.read_csv("data/happiness/2017.csv")
df2018 = pd.read_csv("data/happiness/2018.csv")
df2019 = pd.read_csv("data/happiness/2019.csv")

# Data analysis

In [None]:
# Embed the illustration for this section from the local images folder.
display(Image(filename='data/images/cute_panda_fun.jpg'))

## Summary operators (mean, mode, median)

In [None]:
# Column-wise mean of the 2019 data.
# numeric_only=True restricts the calculation to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame holds a text column
# (the 2019 file presumably has a country-name column -- confirm), while older
# pandas silently dropped such columns with a deprecation warning.
df2019.mean(numeric_only=True)

In [None]:
# Mean of the three indicators of interest only.
columnsOfInterest = ["Score", "GDP per capita", "Social support"]
df2019.loc[:, columnsOfInterest].mean()

In [None]:
# Median of the three indicators of interest (computed down each column).
columnsOfInterest = ["Score", "GDP per capita", "Social support"]
df2019.loc[:, columnsOfInterest].median(axis="index")

In [None]:
# Most frequent value(s) per indicator; the result has one row per tied mode.
columnsOfInterest = ["Score", "GDP per capita", "Social support"]
df2019.loc[:, columnsOfInterest].mode()

"The mode of a set of values is the value that appears most often. It can be multiple values."

In [None]:
# Rows whose score is 5.208 (presumably the mode reported for "Score" --
# confirm against the output of the mode cell).
# np.isclose replaces `==` because the scores are floats parsed from CSV text:
# an exact equality test can miss a value whose binary representation differs
# from the literal in the last bit.
df2019[np.isclose(df2019["Score"], 5.208)]

##### Min, max

In [None]:
# Largest GDP-per-capita value, then every row that reaches it (ties included).
maxGDP = df2019["GDP per capita"].max()
df2019.loc[df2019["GDP per capita"].eq(maxGDP)]

In [None]:
# Smallest GDP-per-capita value, then every row that reaches it (ties included).
minGDP = df2019["GDP per capita"].min()
df2019.loc[df2019["GDP per capita"].eq(minGDP)]

## Create new columns based on existing columns

In [None]:
# Derive a new column: generosity scaled by a factor of 100.
# Note that this adds the column to df2019 itself.
df2019["Generosity new"] = df2019["Generosity"].mul(100)

In [None]:
# Inspect the derived column (only present after the assignment cell has run).
df2019["Generosity new"]

## Counting

In [None]:
# Rebinds df2016 to a fresh read of the same 2016 CSV that the data-loading
# cell at the top of the notebook already loaded.
df2016 = pd.read_csv("data/happiness/2016.csv")

In [None]:
# Number of rows (countries) per region, most frequent first.
df2016["Region"].value_counts()

##### Compute proportions

In [None]:
# Same counts expressed as fractions of the total number of rows.
df2016["Region"].value_counts(normalize=True)

##### Select multiple columns

In [None]:
# Count each distinct (Region, Country) combination.
df2016[["Region", "Country"]].value_counts()

## Aggregating data

Data aggregation is any process whereby data is gathered and expressed in a summary form.

In [None]:
# Mean of every numeric column per region.
# numeric_only=True is required on pandas >= 2.0: df2016 also holds the text
# column "Country", and averaging it raises a TypeError otherwise.
df2016.groupby("Region").mean(numeric_only=True)

##### Sort by happiness score

In [None]:
# Per-region means of the numeric columns, happiest region first.
# numeric_only=True keeps the text column "Country" out of the aggregation,
# which would otherwise raise a TypeError on pandas >= 2.0.
(
    df2016.groupby("Region")
    .mean(numeric_only=True)
    .sort_values(by='Happiness Score', ascending=False)
)

Calculate the mean only for a specific column.

In [None]:
# Average happiness score per region (a Series indexed by region).
df2016["Happiness Score"].groupby(df2016["Region"]).mean()

Sort values.

In [None]:
# Average happiness score per region, lowest first.
(
    df2016
    .groupby("Region")["Happiness Score"]
    .mean()
    .sort_values(ascending=True)
)

##### Alternative way

In [None]:
# Same per-region means, expressed through the generic .agg() interface.
# The numeric_only keyword is forwarded to the underlying mean(); it skips the
# text column "Country", which would otherwise raise a TypeError on
# pandas >= 2.0.
df2016.groupby("Region").agg('mean', numeric_only=True)

In [None]:
# Several aggregations at once: sum and mean of the happiness score per region.
# The result has two-level columns: ("Happiness Score", "sum" / "mean").
df2016.groupby("Region")[["Happiness Score"]].agg(["sum", "mean"])

# Data visualization

In [None]:
# Embed the illustration for this section from the local images folder.
display(Image(filename='data/images/cute_panda_painting.jpg'))

In [None]:
# Make the graphs a bit prettier: switch matplotlib to its "ggplot" style
# sheet. This applies to every figure created after this cell runs.
plt.style.use('ggplot')

## Using pandas' plotting functions

### Bar plots

In [None]:
# List the column labels of the 2019 frame.
df2019.columns

##### Number of countries per region

In [None]:
# Bar chart with one bar per region; bar height is the number of countries.
df2016["Region"].value_counts().plot(kind="bar")

##### Happiness scores per region (sorted)

In [None]:
# Bar chart of the mean happiness score per region, lowest region first.
(
    df2016
    .groupby("Region")["Happiness Score"]
    .mean()
    .sort_values()
    .plot(kind="bar")
)

##### Adjust image size

In [None]:
# Create a single 9x9 inch Axes first, then draw the bar chart onto it; the
# explicit Axes is what lets us control the figure size.
_, ax = plt.subplots(figsize=(9, 9))

(
    df2016
    .groupby("Region")["Happiness Score"]
    .mean()
    .sort_values()
    .plot(kind="bar", ax=ax)
)

### Histograms

In [None]:
# Distribution of the 2016 happiness scores as a histogram (default binning).
df2016["Happiness Score"].plot(kind="hist")

### Boxplots

In [None]:
# Embed the illustration for this section from the local images folder.
display(Image(filename='data/images/cute_panda_box.jpg'))

In descriptive statistics, a box plot or boxplot (also known as a box-and-whisker plot) is a type of chart often used in exploratory data analysis. Box plots visually show the distribution and skewness of numerical data by displaying the data quartiles (or percentiles) and the median.

Box plots show the five-number summary of a set of data: the minimum score, first (lower) quartile, median, third (upper) quartile, and maximum score.

In [None]:
# Embed the box-and-whisker diagram from the local images folder.
display(Image(filename='data/images/box-whisker-plot.jpg'))

In [None]:
def rotateXaxesLabels(ax, rotation=90):
    """Rotate the x-axis tick labels of a matplotlib Axes.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose x tick labels should be rotated.
    rotation : float or {"vertical", "horizontal"}, default 90
        Rotation angle in degrees. The two keywords are accepted as aliases
        for 90 and 0 degrees respectively.

    Returns
    -------
    None

    Notes
    -----
    The rotation is applied through ``Axes.tick_params`` instead of
    ``ax.set_xticklabels(ax.get_xticklabels(), ...)``. Re-setting the labels
    pins the current label texts with a fixed formatter, which makes
    matplotlib warn when the axis does not use a fixed locator and can freeze
    or blank the labels of an axis whose ticks have not been laid out yet.
    ``tick_params`` only changes the label rotation and leaves the tick
    locations and label texts untouched.
    """
    # tick_params expects a numeric angle, so translate the keyword aliases.
    angle = {"vertical": 90, "horizontal": 0}.get(rotation, rotation)
    ax.tick_params(axis="x", labelrotation=angle)

##### Happiness score per region

In [None]:
# Single 7x7 inch Axes to draw on.
_, ax = plt.subplots(1, 1, figsize=(7, 7))

# One box per region, showing the distribution of the happiness score.
df2016.boxplot(column=["Happiness Score"], by="Region", ax=ax)
# Rotate the region names so they do not overlap along the x axis.
rotateXaxesLabels(ax)

# Re-fit the layout so the rotated labels stay inside the figure.
plt.tight_layout()

##### Sorted boxplot

In [None]:
# Build a wide frame with one column of happiness scores per region. Rows keep
# the original index, so a cell is NaN wherever the country belongs to a
# different region.
grouped = df2016.groupby("Region")
df2016Boxplot = pd.DataFrame(
    {region: scores for region, scores in grouped["Happiness Score"]}
)

# Median score per region, ordered from lowest to highest.
medians = df2016Boxplot.median().sort_values()

In [None]:
# Region names in ascending order of their median happiness score.
medians.index

In [None]:
# Put the region columns in ascending-median order, then draw one box per
# column and rotate the region names for readability.
df2016Boxplot = df2016Boxplot.loc[:, medians.index]
ax = df2016Boxplot.boxplot()
rotateXaxesLabels(ax)

### Scatterplot

##### Happiness - freedom

In [None]:
# Each point is one row of the 2016 data: freedom (x) against happiness (y).
df2016.plot(kind="scatter", x="Freedom", y="Happiness Score")

##### Happiness - economy

In [None]:
# Each point is one row of the 2016 data: GDP per capita (x) against
# happiness (y).
df2016.plot(kind="scatter", x="Economy (GDP per Capita)", y="Happiness Score")

##### Happiness - Trust (Government Corruption)

In [None]:
# Each point is one row of the 2016 data: the trust / government-corruption
# indicator (x) against happiness (y).
df2016.plot(kind="scatter", x="Trust (Government Corruption)", y="Happiness Score")

##### Corrupt but happy countries

In [None]:
# Keep the rows that combine a trust indicator below 0.05 with a happiness
# score above 6.
dfCorruptButHappy = df2016.loc[
    df2016["Trust (Government Corruption)"].lt(0.05)
    & df2016["Happiness Score"].gt(6)
]

# Country names of those rows, as a set for fast membership tests.
corruptButHappyCountries = set(dfCorruptButHappy["Country"])

In [None]:
# Display the set of selected country names.
corruptButHappyCountries

In [None]:
# One colour per row of df2016, in row order: red for the selected countries,
# blue for all others.
colors = (
    df2016["Country"]
    .isin(corruptButHappyCountries)
    .map({True: "red", False: "blue"})
    .tolist()
)

In [None]:
# Same trust-vs-happiness scatter plot, with the per-row colour list applied
# so the selected countries stand out.
df2016.plot(
    kind="scatter",
    x="Trust (Government Corruption)",
    y="Happiness Score",
    color=colors,
)

## Using seaborn

### Bar plots

##### Number of countries per region

In [None]:
# seaborn counts the rows per region itself; no value_counts() is needed.
ax = sns.countplot(data=df2016, x="Region")
# Rotate the region names so they do not overlap along the x axis.
rotateXaxesLabels(ax)

##### Happiness scores per region (sorted)

In [None]:
# Number of rows (countries) per region, for reference next to the plots.
df2016["Region"].value_counts()

In [None]:
# Region names ordered by ascending mean happiness score; passed to seaborn's
# `order=` argument in the plots below.
orderRegionsByHappiness = (
    df2016
    .groupby("Region")["Happiness Score"]
    .mean()
    .sort_values()
    .index
)

In [None]:
# Happiness score per region, with the bars arranged in the precomputed
# ascending-mean order.
ax = sns.barplot(data=df2016, x="Region", y="Happiness Score", order=orderRegionsByHappiness)
# Rotate the region names so they do not overlap along the x axis.
rotateXaxesLabels(ax)

### Histograms

In [None]:
# Histogram of the 2016 happiness scores, drawn with seaborn.
sns.histplot(data=df2016, x="Happiness Score")

### Boxplots

##### Happiness scores per region

In [None]:
# One box per region, arranged in the precomputed ascending-mean order.
ax = sns.boxplot(data=df2016, x="Region", y="Happiness Score", order=orderRegionsByHappiness)
# Rotate the region names so they do not overlap along the x axis.
rotateXaxesLabels(ax)

### Scatterplots

In [None]:
# List the column labels of the 2016 frame (useful for picking plot axes).
df2016.columns

In [None]:
# Each point is one row of the 2016 data: generosity (x) against happiness (y).
sns.scatterplot(data=df2016, x="Generosity", y="Happiness Score")

In [None]:
# Rows whose generosity value exceeds 0.8.
df2016.query("Generosity > 0.8")

##### Regression plot

In [None]:
# Scatter plot of GDP per capita against happiness, with seaborn's fitted
# regression line drawn on top.
sns.regplot(data=df2016, x="Economy (GDP per Capita)", y="Happiness Score")