# 2016 Dataset

In [1]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0272EN-SkillsNetwork/labs/dataset/2016.csv')

In [5]:
df.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


# Data Preparation

# Data Cleaning: Checking for correct data type

In [6]:
df.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval         object
Economy (GDP per Capita)          object
Family                           float64
Health (Life Expectancy)          object
Freedom                           object
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      153 non-null    float64
 5   Upper Confidence Interval      154 non-null    float64
 6   Economy (GDP per Capita)       155 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       154 non-null    float64
 9   Freedom                        156 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

# Data Cleaning: Change the data types

In [8]:
# Convert columns to numeric
cols_to_convert = ['Upper Confidence Interval', 'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom']

for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Check the data types again
df.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

In [10]:
###1. Remove leading and trailing whitespaces from the values in a column.
df1 = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [12]:
###2. Clean a column in a DataFrame by replacing empty strings with NaN values.
df1 = df1.replace(r'^\s*$', np.nan, regex=True)

In [14]:
###3. Change the data type of the columns to appropriate type as per the latest version of pandas.
cols_to_convert = ['Upper Confidence Interval', 'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom']

for col in cols_to_convert:
    df1[col] = pd.to_numeric(df1[col], errors='coerce')

# Check the data types again
df1.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

# Data Cleaning - Checking for missing values

In [18]:
##1. Identify the columns of a data frame with missing values
missing_cols = df1.columns[df1.isnull().any()].tolist()
print(missing_cols)

['Lower Confidence Interval', 'Upper Confidence Interval', 'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom']


In [20]:
##2. Replace the missing values thus identified with mean values of the column.
for col in missing_cols:
    df1[col].fillna(df1[col].mean(), inplace=True)

In [23]:
df1.isnull().sum()

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Lower Confidence Interval        0
Upper Confidence Interval        0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [27]:
##1.Generate Python code to Count the number of rows for each country
df1.Country.value_counts()

Denmark       1
Albania       1
Laos          1
Nigeria       1
Honduras      1
             ..
Kazakhstan    1
Moldova       1
Russia        1
Poland        1
Burundi       1
Name: Country, Length: 157, dtype: int64

In [33]:
##2.Generate Python code to Calculate AVG Happiness score, Region wise for comparison
avg_happiness_score = df1.pivot_table(values= 'Happiness Score', index = 'Region',aggfunc= 'mean')
avg_happiness_score

##avg_happiness_score = df1.groupby('Region')['Happiness Score'].mean()

Unnamed: 0_level_0,Happiness Score
Region,Unnamed: 1_level_1
Australia and New Zealand,7.3235
Central and Eastern Europe,5.37069
Eastern Asia,5.624167
Latin America and Caribbean,6.10175
Middle East and Northern Africa,5.386053
North America,7.254
Southeastern Asia,5.338889
Southern Asia,4.563286
Sub-Saharan Africa,4.136421
Western Europe,6.685667


# Data Insights and Visualization

# Bar Chart

In [34]:
## 1. Identify the GDP per capita and Healthy Life Expectancy of the top 10 countries.
import plotly.graph_objects as go

# Sort the dataframe by Happiness Score and select the top 10
sorted_df = df1.sort_values('Happiness Score', ascending=False).head(10)

In [35]:
sorted_df

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596
5,Canada,North America,6,7.404,7.335,7.473,1.44015,1.0961,0.8276,0.5737,0.31329,0.44834,2.70485
6,Netherlands,Western Europe,7,7.339,7.284,7.394,1.46468,1.02912,0.81231,0.55211,0.29927,0.47416,2.70749
7,New Zealand,Australia and New Zealand,8,7.334,7.264,7.404,1.36066,1.17278,0.83096,0.58147,0.41904,0.49401,2.47553
8,Australia,Australia and New Zealand,9,7.313,7.241,7.385,1.44443,1.10476,0.8512,0.56837,0.32331,0.47407,2.5465
9,Sweden,Western Europe,10,7.291,7.227,5.472753,1.45181,1.08764,0.83121,0.58218,0.40867,0.38254,2.54734


In [40]:
#Create a bar chart
fig1 = go.Figure()
fig1.add_trace(go.Bar(x=sorted_df['Country'], y=sorted_df['Economy (GDP per Capita)'], name = 'GDP per Capita'))
fig1.add_trace(go.Bar(x=sorted_df['Country'], y=sorted_df['Health (Life Expectancy)'], name = 'Healthy Life Expectancy'))

#Update the layout
fig1.update_layout(title= 'GDP per Capita and Health Life Expectancy of Top 10 countries', xaxis_title = 'Country', yaxis_title = 'Value', barmode = 'group')

#displaying the fig
fig1.show()

In [44]:
##1. Create a sub-dataset including Economy (GDP per Capita), Family, Health (Life Expectancy), Freedom, Trust (Government Corruption), Generosity, and Happiness Score attributes from the dataframe (df).

##Creating sub table from original table
sub_df = df1[['Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Happiness Score']]

##Save this Sub table into csv format
sub_df.to_csv('C:/Users/Kamakshi/Desktop/Alabs/Python/Python 2016 dataset/sub_dataset-1a268a.csv', index=False)

# Correlatio Matrix (Heatmap)

In [79]:
##2. Find the correlation between the attributes in the subdataset as a heatmap named fig2 using Plotly of width 800 and height 600.
import plotly.figure_factory as ff
###!pip install numpy==1.24.1

# Calculate the correlation matrix
corr = sub_df.corr()

# Create a heatmap
fig2 = ff.create_annotated_heatmap(z=corr.values, x=list(corr.columns), y=list(corr.index), colorscale='Viridis')

# Update the layout
fig2.update_layout(width=800, height=600, title='Correlation Heatmap')

# Display the figure
fig2.show()

# Scatter Plot

In [50]:
##Write a code that creates a scatter plot named fig3 between Happiness Score and GDP per Capita attributes of a dataframe using Plotly. Use Region to color the data points on the scatter plot.
import plotly.express as px

# Create a scatter plot
fig3 = px.scatter(df1, x='Happiness Score', y='Economy (GDP per Capita)', color='Region')

# Display the figure
fig3.show()

# Pie Chart

In [52]:
##Write a Plotly code that creates a pie chart named fig4 to present Happiness Score by Region attributes of dataframe df.
import plotly.express as px

# Group the data by Region and calculate the sum of Happiness Score
grouped_df = df1.groupby('Region')['Happiness Score'].sum().reset_index()

# Create a pie chart
fig4 = px.pie(grouped_df, values='Happiness Score', names='Region', title='Happiness Score by Region')

# Display the figure
fig4.show()

# MAP

In [53]:
##Write a Plotly code that creates a map named fig5 to display GDP per capita of countries and include Healthy Life Expectancy to be shown as a tooltip.
import plotly.express as px

# Create a map
fig5 = px.choropleth(df1, locations='Country', locationmode='country names', color='Economy (GDP per Capita)', hover_data=['Health (Life Expectancy)'], title='GDP per Capita of Countries')

# Display the figure
fig5.show()

In [71]:
##Write Python code to write any four of the Plotly figures (fig1, fig2, fig3, fig4, fig5) to a single HTML file named “dashboard.html”?
from plotly.subplots import make_subplots

# Create subplot with 3 rows and 2 columns
fig = make_subplots(rows=2, cols=2, subplot_titles=('Fig1', 'Fig2', 'Fig3', 'Fig4', 'Fig5'))

# Add figures to the subplot
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig3['data'][0], row=2, col=1)
##fig.add_trace(fig4['data'][0], row=2, col=2)
##fig.add_trace(fig5['data'][0], row=3, col=1)

# Write to HTML
fig.write_html('C:/Users/Kamakshi/Desktop/Alabs/Python/Python 2016 dataset/dashboard-7a5394.html')

## Plotly package is unable to save/add piechart and chooropleth to the dashboard. So, use panel package to save all types of chart in one dashboard.
## Even you can use Dash package but getting some errors and this dash dashboard is displayed in jupyter notebook.

In [75]:
import panel as pn

# Create a Panel Column with your figures
dashboard = pn.Column(fig1, fig2, fig3, fig4, fig5)

# Show the dashboard
dashboard.show()

dashboard.save('C:/Users/Kamakshi/Desktop/Alabs/Python/Python 2016 dataset/dashboard-7a5394.html')

Launching server at http://localhost:54057


In [73]:
##!pip install dash
import dash
import dash_core_components as dcc
import dash_html_components as html

app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(figure=fig1),
    dcc.Graph(figure=fig2),
    dcc.Graph(figure=fig3),
    html.Iframe(src='https://plotly.com/~your_username/your_map_id.embed', width='100%', height='600')
])

if __name__ == '__main__':
    app.run_server(debug=True)

Collecting dash
  Downloading dash-2.18.2-py3-none-any.whl (7.8 MB)
     ---------------------------------------- 7.8/7.8 MB 12.2 MB/s eta 0:00:00
Collecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, retrying, dash
Successfully installed dash-2.18.2 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 retrying-1.3.4




The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`



The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`



# Conclusion