## World Happiness Report

## Basic cleaning & consistency checks for files 2015 to 2019

### 01 Importing Libraries

In [4]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [5]:
# Project root folder, stored as a raw string so the Windows backslashes are kept literally.
# Every read_csv / to_csv call below joins its sub-folders onto this value.
# NOTE(review): this is an absolute path on one machine; it must be edited before the
# notebook can run anywhere else.
path = r'C:\Users\melan\OneDrive\Career Foundry\World Happiness Report'

In [6]:
path

'C:\\Users\\melan\\OneDrive\\Career Foundry\\World Happiness Report'

## Import 2015 data

In [8]:
# Import dataframe of 2015 from Original Data
df_2015 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2015.csv'))

In [9]:
#first 5 rows
df_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [10]:
#check for missing values
df_2015.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [11]:
#there are no missing values
df_2015.shape

(158, 12)

In [12]:
#158 rows and 12 columns, we can see all 12 columns, no need to list them
#All column names are consistent and well labeled
#checking for duplicates
duplicates = df_2015.duplicated()
print("Number of duplicate rows: ", duplicates.sum())

Number of duplicate rows:  0


In [13]:
#check for datatypes
df_2015.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Standard Error                   float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

In [14]:
# Mixed-type check: report every column whose cells are not all the same Python type
for col in df_2015.columns:
    cell_types = df_2015[col].map(type)
    # the type found in the first row is the reference for the whole column
    if (cell_types != cell_types.iloc[0]).any():
        print(f"Column {col} has mixed types")
    

In [15]:
#There are no mixed column types
#checking to see if there are outliers 

In [16]:
# Function to detect outliers
# NOTE: this cell only defines the function; call detect_outliers(df_2015) to run the check.
def detect_outliers(df_2015):
    """Find the rows that hold an IQR outlier in any numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column. Every column
    that has outliers is reported with ``print``.

    Parameters
    ----------
    df_2015 : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The flagged rows, stacked column by column, so a row that is an
        outlier in several columns appears once per column. When nothing is
        flagged, an empty frame with the input's columns is returned.
    """
    df = df_2015  # the loop below works on the argument, not on a global `df`
    flagged = []
    # "number" selects every numeric dtype; text columns are skipped
    for col in df.select_dtypes(include="number").columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not col_outliers.empty:
            flagged.append(col_outliers)
            print(f"Outliers detected in column {col}:")
            print(col_outliers)
    if not flagged:
        # pd.concat raises on an empty list, so return an empty slice instead
        return df.iloc[0:0]
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end
    return pd.concat(flagged)

In [17]:
#descriptive statistics
df_2015.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [18]:
#rename columns to match 2018 df and 2019 df
# one mapping, applied in place, instead of four separate rename calls
df_2015.rename(
    columns = {
        'Economy (GDP per Capita)': 'GDP per capita',
        'Health (Life Expectancy)': 'Healthy life expectancy',
        'Freedom': 'Freedom to make life choices',
        'Trust (Government Corruption)': 'Perceptions of corruption',
    },
    inplace = True,
)

In [19]:
#check to see the columns now
df_2015.columns

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Standard Error', 'GDP per capita', 'Family', 'Healthy life expectancy',
       'Freedom to make life choices', 'Perceptions of corruption',
       'Generosity', 'Dystopia Residual'],
      dtype='object')

In [20]:
#Export to Prepared Data
# NOTE(review): to_csv is called without index=False, so pandas also writes the row
# index as a leading unnamed column; confirm that whatever reads this file expects it.
df_2015.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'df_2015_checked.csv'))

## Import 2016 data

In [22]:
# Import dataframe of 2016 from Original Data
df_2016 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2016.csv'))

In [23]:
#first 5 rows
df_2016.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [24]:
df_2016.shape

(157, 13)

In [25]:
#157 rows, 13 columns
#check for missing values
df_2016.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Lower Confidence Interval        0
Upper Confidence Interval        0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [26]:
#157 rows and 13 columns, we can see all 13 columns, no need to list them
#All column names are consistent and well labeled
#checking for duplicates
duplicates = df_2016.duplicated()
print("Number of duplicate rows: ", duplicates.sum())

Number of duplicate rows:  0


In [27]:
#check for datatypes
df_2016.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

In [28]:
# Mixed-type check: report every column whose cells are not all the same Python type
for col in df_2016.columns:
    cell_types = df_2016[col].map(type)
    # the type found in the first row is the reference for the whole column
    if (cell_types != cell_types.iloc[0]).any():
        print(f"Column {col} has mixed types")
    

In [29]:
#There are no mixed data types
#checking to see if there are outliers 
# Function to detect outliers
# NOTE: this cell only defines the function; call detect_outliers(df_2016) to run the check.
def detect_outliers(df_2016):
    """Print the rows that hold an IQR outlier in any numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column. Each column
    with outliers is printed together with the offending rows; if no column
    has any, "No outliers detected." is printed instead.

    Parameters
    ----------
    df_2016 : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The result is reported through ``print`` only.
    """
    df = df_2016  # the loop below works on the argument, not on a global `df`
    found_any = False
    # "number" selects every numeric dtype; text columns are skipped
    for col in df.select_dtypes(include="number").columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not col_outliers.empty:
            # a flag replaces the accumulated frame: DataFrame.append no longer
            # exists in pandas 2.0+ and the frame was only tested for emptiness
            found_any = True
            print(f"Outliers detected in column {col}:")
            print(col_outliers)

    if not found_any:
        print("No outliers detected.")

In [30]:
#descriptive statistics
df_2016.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0
mean,78.980892,5.382185,5.282395,5.481975,0.95388,0.793621,0.557619,0.370994,0.137624,0.242635,2.325807
std,45.46603,1.141674,1.148043,1.136493,0.412595,0.266706,0.229349,0.145507,0.111038,0.133756,0.54222
min,1.0,2.905,2.732,3.078,0.0,0.0,0.0,0.0,0.0,0.0,0.81789
25%,40.0,4.404,4.327,4.465,0.67024,0.64184,0.38291,0.25748,0.06126,0.15457,2.03171
50%,79.0,5.314,5.237,5.419,1.0278,0.84142,0.59659,0.39747,0.10547,0.22245,2.29074
75%,118.0,6.269,6.154,6.434,1.27964,1.02152,0.72993,0.48453,0.17554,0.31185,2.66465
max,157.0,7.526,7.46,7.669,1.82427,1.18326,0.95277,0.60848,0.50521,0.81971,3.83772


In [31]:
#rename columns to match 2018 df and 2019 df
# one mapping, applied in place, instead of four separate rename calls
df_2016.rename(
    columns = {
        'Economy (GDP per Capita)': 'GDP per capita',
        'Health (Life Expectancy)': 'Healthy life expectancy',
        'Freedom': 'Freedom to make life choices',
        'Trust (Government Corruption)': 'Perceptions of corruption',
    },
    inplace = True,
)

In [32]:
#check to see the columns now
df_2016.columns

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Lower Confidence Interval', 'Upper Confidence Interval',
       'GDP per capita', 'Family', 'Healthy life expectancy',
       'Freedom to make life choices', 'Perceptions of corruption',
       'Generosity', 'Dystopia Residual'],
      dtype='object')

In [33]:
#Export to Prepared Data
# NOTE(review): to_csv is called without index=False, so pandas also writes the row
# index as a leading unnamed column; confirm that whatever reads this file expects it.
df_2016.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'df_2016_checked.csv'))

## Import 2017 data

In [35]:
# Import dataframe of 2017 from Original Data
df_2017 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2017.csv'))

In [36]:
#first five rows
df_2017.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [37]:
df_2017.shape

(155, 12)

In [38]:
#155 rows, 12 columns
#check for missing values
df_2017.isnull().sum()

Country                          0
Happiness.Rank                   0
Happiness.Score                  0
Whisker.high                     0
Whisker.low                      0
Economy..GDP.per.Capita.         0
Family                           0
Health..Life.Expectancy.         0
Freedom                          0
Generosity                       0
Trust..Government.Corruption.    0
Dystopia.Residual                0
dtype: int64

In [39]:
#155 rows and 12 columns, we can see all 12 columns, no need to list them
#not all column names are consistent and well labeled, we will fix this at the end of the 2017 section
#checking for duplicates
# duplicated() marks a row True when it repeats an earlier row in full, so the sum is the number of repeats
duplicates = df_2017.duplicated()
print("Number of duplicate rows: ", duplicates.sum())

Number of duplicate rows:  0


In [40]:
#check for datatypes
df_2017.dtypes

Country                           object
Happiness.Rank                     int64
Happiness.Score                  float64
Whisker.high                     float64
Whisker.low                      float64
Economy..GDP.per.Capita.         float64
Family                           float64
Health..Life.Expectancy.         float64
Freedom                          float64
Generosity                       float64
Trust..Government.Corruption.    float64
Dystopia.Residual                float64
dtype: object

In [41]:
# Mixed-type check: report every column whose cells are not all the same Python type
for col in df_2017.columns:
    cell_types = df_2017[col].map(type)
    # the type found in the first row is the reference for the whole column
    if (cell_types != cell_types.iloc[0]).any():
        print(f"Column {col} has mixed types")

In [42]:
#There are no mixed data types
#checking to see if there are outliers 
# Function to detect outliers
# NOTE: this cell only defines the function; call detect_outliers(df_2017) to run the check.
def detect_outliers(df_2017):
    """Print the rows that hold an IQR outlier in any numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column. Each column
    with outliers is printed together with the offending rows; if no column
    has any, "No outliers detected." is printed instead.

    Parameters
    ----------
    df_2017 : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The result is reported through ``print`` only.
    """
    df = df_2017  # the loop below works on the argument, not on a global `df`
    found_any = False
    # "number" selects every numeric dtype; text columns are skipped
    for col in df.select_dtypes(include="number").columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not col_outliers.empty:
            # a flag replaces the accumulated frame: DataFrame.append no longer
            # exists in pandas 2.0+ and the frame was only tested for emptiness
            found_any = True
            print(f"Outliers detected in column {col}:")
            print(col_outliers)

    if not found_any:
        print("No outliers detected.")

In [43]:
#descriptive statistics
df_2017.describe()

Unnamed: 0,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,78.0,5.354019,5.452326,5.255713,0.984718,1.188898,0.551341,0.408786,0.246883,0.12312,1.850238
std,44.888751,1.13123,1.118542,1.14503,0.420793,0.287263,0.237073,0.149997,0.13478,0.101661,0.500028
min,1.0,2.693,2.864884,2.521116,0.0,0.0,0.0,0.0,0.0,0.0,0.377914
25%,39.5,4.5055,4.608172,4.374955,0.663371,1.042635,0.369866,0.303677,0.154106,0.057271,1.591291
50%,78.0,5.279,5.370032,5.193152,1.064578,1.253918,0.606042,0.437454,0.231538,0.089848,1.83291
75%,116.5,6.1015,6.1946,6.006527,1.318027,1.414316,0.723008,0.516561,0.323762,0.153296,2.144654
max,155.0,7.537,7.62203,7.479556,1.870766,1.610574,0.949492,0.658249,0.838075,0.464308,3.117485


In [44]:
#rename columns to match 2018 df and 2019 df
# one mapping, applied in place, instead of nine separate rename calls
# NOTE(review): 'Dystopia residual' is written with a lower-case r here, while the 2015
# and 2016 frames keep 'Dystopia Residual'; confirm which spelling later steps expect.
df_2017.rename(
    columns = {
        'Happiness.Rank': 'Happiness Rank',
        'Happiness.Score': 'Happiness Score',
        'Whisker.high': 'Whisker high',
        'Whisker.low': 'Whisker low',
        'Economy..GDP.per.Capita.': 'GDP per capita',
        'Health..Life.Expectancy.': 'Healthy life expectancy',
        'Freedom': 'Freedom to make life choices',
        'Trust..Government.Corruption.': 'Perceptions of corruption',
        'Dystopia.Residual': 'Dystopia residual',
    },
    inplace = True,
)

In [45]:
#check to see the columns now
df_2017.columns

Index(['Country', 'Happiness Rank', 'Happiness Score', 'Whisker high',
       'Whisker low', 'GDP per capita', 'Family', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Dystopia residual'],
      dtype='object')

In [46]:
#Export to Prepared Data
# NOTE(review): to_csv is called without index=False, so pandas also writes the row
# index as a leading unnamed column; confirm that whatever reads this file expects it.
df_2017.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'df_2017_checked.csv'))

## Import 2018 data

In [48]:
# Import dataframe of 2018 from Original Data
df_2018 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2018.csv'))

In [49]:
df_2018.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [50]:
df_2018.shape

(156, 9)

In [51]:
#156 rows, 9 columns
#check for missing values
df_2018.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       1
dtype: int64

In [52]:
#1 missing value! To be addressed at the end of section 2018 data
#all column names are consistent and well labeled
#checking for duplicates
duplicates = df_2018.duplicated()
print("Number of duplicate rows: ", duplicates.sum())

Number of duplicate rows:  0


In [53]:
#check for datatypes
df_2018.dtypes

Overall rank                      int64
Country or region                object
Score                           float64
GDP per capita                  float64
Social support                  float64
Healthy life expectancy         float64
Freedom to make life choices    float64
Generosity                      float64
Perceptions of corruption       float64
dtype: object

In [54]:
# Mixed-type check: report every column whose cells are not all the same Python type
for col in df_2018.columns:
    cell_types = df_2018[col].map(type)
    # the type found in the first row is the reference for the whole column
    if (cell_types != cell_types.iloc[0]).any():
        print(f"Column {col} has mixed types")

In [55]:
#There are no mixed data types
#checking to see if there are outliers 
# Function to detect outliers
# NOTE: this cell only defines the function; call detect_outliers(df_2018) to run the check.
def detect_outliers(df_2018):
    """Print the rows that hold an IQR outlier in any numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column. Each column
    with outliers is printed together with the offending rows; if no column
    has any, "No outliers detected." is printed instead.

    Parameters
    ----------
    df_2018 : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The result is reported through ``print`` only.
    """
    df = df_2018  # the loop below works on the argument, not on a global `df`
    found_any = False
    # "number" selects every numeric dtype; text columns are skipped
    for col in df.select_dtypes(include="number").columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not col_outliers.empty:
            # a flag replaces the accumulated frame: DataFrame.append no longer
            # exists in pandas 2.0+ and the frame was only tested for emptiness
            found_any = True
            print(f"Outliers detected in column {col}:")
            print(col_outliers)

    if not found_any:
        print("No outliers detected.")

In [56]:
#descriptive statistics
df_2018.describe()

Unnamed: 0,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,155.0
mean,78.5,5.375917,0.891449,1.213237,0.597346,0.454506,0.181006,0.112
std,45.177428,1.119506,0.391921,0.302372,0.247579,0.162424,0.098471,0.096492
min,1.0,2.905,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,4.45375,0.61625,1.06675,0.42225,0.356,0.1095,0.051
50%,78.5,5.378,0.9495,1.255,0.644,0.487,0.174,0.082
75%,117.25,6.1685,1.19775,1.463,0.77725,0.5785,0.239,0.137
max,156.0,7.632,2.096,1.644,1.03,0.724,0.598,0.457


In [57]:
# Get the value counts for the 'Perceptions of corruption' column
perception_corruption = df_2018['Perceptions of corruption'].value_counts(dropna = False)

# Display the counts
print(perception_corruption)

Perceptions of corruption
0.082    5
0.074    4
0.052    4
0.106    3
0.064    3
        ..
0.259    1
0.123    1
0.014    1
0.019    1
0.038    1
Name: count, Length: 111, dtype: int64


In [58]:
# Impute with mean: the one missing 'Perceptions of corruption' value gets the column average
corruption_col = 'Perceptions of corruption'
df_2018[corruption_col] = df_2018[corruption_col].fillna(df_2018[corruption_col].mean())

In [59]:
#let's check again for missing values
df_2018.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [60]:
#Export to Prepared Data
# NOTE(review): to_csv is called without index=False, so pandas also writes the row
# index as a leading unnamed column; confirm that whatever reads this file expects it.
df_2018.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'df_2018_checked.csv'))

## Import 2019 data

In [62]:
# Import dataframe of 2019 from Original Data
df_2019 = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', '2019.csv'))

In [63]:
df_2019.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [64]:
df_2019.shape

(156, 9)

In [65]:
#156 rows, 9 columns
#check for missing values
df_2019.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [66]:
#No missing values
#all column names are consistent and well labeled
#checking for duplicates
duplicates = df_2019.duplicated()
print("Number of duplicate rows: ", duplicates.sum())

Number of duplicate rows:  0


In [67]:
#check for datatypes
df_2019.dtypes

Overall rank                      int64
Country or region                object
Score                           float64
GDP per capita                  float64
Social support                  float64
Healthy life expectancy         float64
Freedom to make life choices    float64
Generosity                      float64
Perceptions of corruption       float64
dtype: object

In [68]:
# Mixed-type check: report every column whose cells are not all the same Python type
for col in df_2019.columns:
    cell_types = df_2019[col].map(type)
    # the type found in the first row is the reference for the whole column
    if (cell_types != cell_types.iloc[0]).any():
        print(f"Column {col} has mixed types")

In [69]:
#There are no mixed data types
#checking to see if there are outliers 
# Function to detect outliers
# NOTE: this cell only defines the function; call detect_outliers(df_2019) to run the check.
def detect_outliers(df_2019):
    """Print the rows that hold an IQR outlier in any numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column. Each column
    with outliers is printed together with the offending rows; if no column
    has any, "No outliers detected." is printed instead.

    Parameters
    ----------
    df_2019 : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The result is reported through ``print`` only.
    """
    df = df_2019  # the loop below works on the argument, not on a global `df`
    found_any = False
    # "number" selects every numeric dtype; text columns are skipped
    for col in df.select_dtypes(include="number").columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not col_outliers.empty:
            # a flag replaces the accumulated frame: DataFrame.append no longer
            # exists in pandas 2.0+ and the frame was only tested for emptiness
            found_any = True
            print(f"Outliers detected in column {col}:")
            print(col_outliers)

    if not found_any:
        print("No outliers detected.")

In [70]:
#descriptive statistics
df_2019.describe()

Unnamed: 0,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,5.407096,0.905147,1.208814,0.725244,0.392571,0.184846,0.110603
std,45.177428,1.11312,0.398389,0.299191,0.242124,0.143289,0.095254,0.094538
min,1.0,2.853,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,4.5445,0.60275,1.05575,0.54775,0.308,0.10875,0.047
50%,78.5,5.3795,0.96,1.2715,0.789,0.417,0.1775,0.0855
75%,117.25,6.1845,1.2325,1.4525,0.88175,0.50725,0.24825,0.14125
max,156.0,7.769,1.684,1.624,1.141,0.631,0.566,0.453


In [71]:
#Export to Prepared Data
# NOTE(review): to_csv is called without index=False, so pandas also writes the row
# index as a leading unnamed column; confirm that whatever reads this file expects it.
df_2019.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'df_2019_checked.csv'))