In [1]:
# World Happiness 2015–2017: exploratory data analysis

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import scipy.stats as stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score
import plotly.express as px
import warnings  

# Ignore all warnings  
warnings.filterwarnings('ignore')

# Apply seaborn's "whitegrid" style to the plots drawn after this call.
sns.set(style="whitegrid")
%matplotlib inline

In [3]:
# Read the combined 2015–2017 World Happiness CSV into a DataFrame.
world_df = pd.read_csv(filepath_or_buffer="Dataset/World_Happiness_2015_2017_.csv")

# Preview the first five rows to sanity-check the load.
world_df.head(n=5)

Unnamed: 0,Country,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015


In [22]:
# Number of rows (observations) in the dataset.
print(world_df.shape[0])

470


In [17]:
# Structural overview, printed one item per line:
#   1. the column labels,
#   2. the number of distinct values in each column,
#   3. the (rows, columns) shape.
print(world_df.columns, world_df.nunique(), world_df.shape, sep="\n")

Index(['Country', 'Happiness Rank', 'Happiness Score',
       'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
       'Freedom', 'Trust (Government Corruption)', 'Generosity',
       'Dystopia Residual', 'Year'],
      dtype='object')
Country                          166
Happiness Rank                   158
Happiness Score                  449
Economy (GDP per Capita)         467
Family                           468
Health (Life Expectancy)         466
Freedom                          462
Trust (Government Corruption)    466
Generosity                       468
Dystopia Residual                470
Year                               3
dtype: int64
(470, 11)


In [18]:
# Count of missing entries in each column.
world_df.isna().sum()

Country                          0
Happiness Rank                   0
Happiness Score                  0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Year                             0
dtype: int64

In [20]:
# Data types: print each column's dtype and non-null count, plus memory usage.

world_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        470 non-null    object 
 1   Happiness Rank                 470 non-null    int64  
 2   Happiness Score                470 non-null    float64
 3   Economy (GDP per Capita)       470 non-null    float64
 4   Family                         470 non-null    float64
 5   Health (Life Expectancy)       470 non-null    float64
 6   Freedom                        470 non-null    float64
 7   Trust (Government Corruption)  470 non-null    float64
 8   Generosity                     470 non-null    float64
 9   Dystopia Residual              470 non-null    float64
 10  Year                           470 non-null    int64  
dtypes: float64(8), int64(2), object(1)
memory usage: 40.5+ KB


In [23]:
# Basic distributions: count, mean, std, min, quartiles and max of each numeric column.
world_df.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
count,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0
mean,78.829787,5.370728,0.92783,0.990347,0.579968,0.402828,0.175605,0.201426,2.092717,2015.993617
std,45.281408,1.136998,0.415584,0.318707,0.240161,0.150356,0.131909,0.133211,0.565772,0.816907
min,1.0,2.693,0.0,0.0,0.0,0.0,0.0,0.0,0.32858,2015.0
25%,40.0,4.509,0.605292,0.793,0.402301,0.297615,0.075792,0.098303,1.737975,2015.0
50%,79.0,5.2825,0.995439,1.025665,0.630053,0.418347,0.139081,0.181624,2.09464,2016.0
75%,118.0,6.23375,1.252443,1.228745,0.768298,0.51685,0.249839,0.275505,2.455575,2017.0
max,158.0,7.587,1.870766,1.610574,1.02525,0.66973,0.838075,0.81971,3.83772,2017.0


## 🔍 Step 2: Data Cleaning

- Standardize column names (strip whitespace, convert to lowercase, and rename any that are inconsistent across years).
- Check for and handle missing values (`world_df.isnull().sum()`).
- Convert the `Year` column to an integer type if necessary.
- Ensure country names are consistent across years.

In [27]:
# How many distinct country names appear in the data, followed by the names themselves.
print(world_df["Country"].nunique(dropna=False))
print(world_df["Country"].unique())

166
['Switzerland' 'Iceland' 'Denmark' 'Norway' 'Canada' 'Finland'
 'Netherlands' 'Sweden' 'New Zealand' 'Australia' 'Israel' 'Costa Rica'
 'Austria' 'Mexico' 'United States' 'Brazil' 'Luxembourg' 'Ireland'
 'Belgium' 'United Arab Emirates' 'United Kingdom' 'Oman' 'Venezuela'
 'Singapore' 'Panama' 'Germany' 'Chile' 'Qatar' 'France' 'Argentina'
 'Czech Republic' 'Uruguay' 'Colombia' 'Thailand' 'Saudi Arabia' 'Spain'
 'Malta' 'Taiwan' 'Kuwait' 'Suriname' 'Trinidad and Tobago' 'El Salvador'
 'Guatemala' 'Uzbekistan' 'Slovakia' 'Japan' 'South Korea' 'Ecuador'
 'Bahrain' 'Italy' 'Bolivia' 'Moldova' 'Paraguay' 'Kazakhstan' 'Slovenia'
 'Lithuania' 'Nicaragua' 'Peru' 'Belarus' 'Poland' 'Malaysia' 'Croatia'
 'Libya' 'Russia' 'Jamaica' 'North Cyprus' 'Cyprus' 'Algeria' 'Kosovo'
 'Turkmenistan' 'Mauritius' 'Hong Kong' 'Estonia' 'Indonesia' 'Vietnam'
 'Turkey' 'Kyrgyzstan' 'Nigeria' 'Bhutan' 'Azerbaijan' 'Pakistan' 'Jordan'
 'Montenegro' 'China' 'Zambia' 'Romania' 'Serbia' 'Portugal' 'Latvia'
 'Ph

## 📊 Step 3: Exploratory Data Analysis (EDA)

Start with:

- Overall distribution of happiness scores
- Average happiness score per year
- Happiness score by region (if available)
- Correlation heatmap among variables (e.g., Economy, Family, Health, Freedom, Generosity, Trust)

Use visuals:

- Histograms and boxplots for score distributions
- Bar plots for the 10 happiest and 10 least happy countries per year
- Line plots for country-wise trends across 2015–2017
- Pair plots or scatter plots for variable relationships

## 📈 Step 4: Key Questions to Answer

- Which factors are most strongly correlated with happiness?
- Which countries improved or declined in happiness the most over time?
- Is GDP strongly linked to happiness?
- Do people in freer countries report higher happiness?

## 🧠 Step 5: Insights and Reporting

Summarize the key findings:

- What contributes most to happiness?
- How does happiness shift geographically or over time?

Create a visualization to support each insight.

## 📦 Step 6: Optional – Modeling

- Try linear regression to predict the happiness score from the other features.
- Evaluate model performance using R², MAE, etc.

## 📚 Step 7: Wrap-Up

Create a Jupyter Notebook or Python script with:

- Cleaned code
- Visuals
- Markdown cells explaining each section

Prepare a concise README covering the project purpose, methodology, key findings, and next steps.