In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

import matplotlib.pyplot as plt
import matplotlib as mlp
%matplotlib inline

Most lab tasks deal with classification problems. Since we chose a data set with a numerical output (i.e. a regression problem), we needed to discretize our output variable into categories/classes.

In [2]:
# Load the raw dataset from the CSV file sitting next to this notebook.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,GDP per Capita,Social support,Life Expectancy,Freedom to make life choices,Perceptions of corruption,Generosity
0,0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,3,2015,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


In [4]:
# Work on a new frame without the first column ("Unnamed: 0.1" in the preview above);
# `df` itself is left untouched because drop() returns a new DataFrame.
df_copy = df.drop(columns=df.columns[0])

In [5]:
# Confirm the first column is gone: show the first five rows.
df_copy.head(n=5)

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,GDP per Capita,Social support,Life Expectancy,Freedom to make life choices,Perceptions of corruption,Generosity
0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


# Add Category Column to pandas DataFrame with cut

## 3-class categorization

For now, the simplest way to categorize the countries is by happiness rank, as follows:
- 1st tier: 1-52
- 2nd tier: 53-105
- 3rd tier: 106-158

In [18]:
# Label every row with one of three tiers based on its happiness rank:
#   First  -> ranks 1-52
#   Second -> ranks 53-105
#   Third  -> ranks 106 and above
# pd.cut builds right-closed intervals by default, i.e. (0, 52], (52, 105],
# (105, inf], so each inner edge must be the LAST rank of its tier. The
# previous edges (53 and 106) put rank 53 in "First" and rank 106 in "Second".
df_copy["tier"] = pd.cut(
    x=df_copy["Happiness Rank"],
    bins=[0, 52, 105, np.inf],
    labels=["First", "Second", "Third"])

In [19]:
# Check the new "tier" column on the first five rows.
df_copy.head(n=5)

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,GDP per Capita,Social support,Life Expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,tier
0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,First
1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,First
2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,First
3,2015,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,First
4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,First


In [20]:
# Check the new "tier" column on the last five rows.
df_copy.tail(n=5)

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,GDP per Capita,Social support,Life Expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,tier
926,2021,Lesotho,145,3.512,0.451,0.731,0.007,0.405,0.015,0.103,Third
927,2021,Botswana,146,3.467,1.099,0.724,0.34,0.539,0.088,0.027,Third
928,2021,Rwanda,147,3.415,0.364,0.202,0.407,0.627,0.493,0.227,Third
929,2021,Zimbabwe,148,3.145,0.457,0.649,0.243,0.359,0.075,0.157,Third
930,2021,Afghanistan,149,2.523,0.37,0.0,0.126,0.0,0.01,0.122,Third


In [21]:
# Persist the 3-class frame to disk.
# NOTE(review): to_csv writes the row index by default, which adds an unnamed
# leading column to the file - confirm downstream readers expect that.
df_copy.to_csv(path_or_buf='data_cat1.csv')

## 2-class categorization

In [22]:
# Start the 2-class variant from the raw frame. This binds a second name to the
# same DataFrame object; it does not make a copy.
df2 = df

In [23]:
# Rebind df2 to a new frame without the first column of the raw data;
# drop() returns a new DataFrame, so `df` keeps all of its columns.
df2 = df2.drop(columns=df.columns[0])

In [24]:
# Preview the first five rows of the 2-class working frame.
df2.head(n=5)

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,GDP per Capita,Social support,Life Expectancy,Freedom to make life choices,Perceptions of corruption,Generosity
0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


In [26]:
# Two-way split on happiness rank. With pd.cut's default right-closed bins,
# ranks up to and including 78 are labelled "First" and everything above 78
# is labelled "Second".
df2["tier"] = pd.cut(
    df["Happiness Rank"],
    [0, 78, np.inf],
    labels=["First", "Second"],
)

In [27]:
# Check the new "tier" column on the first five rows.
df2.head(n=5)

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,GDP per Capita,Social support,Life Expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,tier
0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,First
1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,First
2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,First
3,2015,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,First
4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,First


In [28]:
# Persist the 2-class frame to disk.
# NOTE(review): to_csv writes the row index by default, which adds an unnamed
# leading column to the file - confirm downstream readers expect that.
df2.to_csv(path_or_buf='data_2_cat.csv')