<a href="https://colab.research.google.com/github/Jessica-Emereuwa/GoMyCode-DataScience/blob/main/data_visualization__with_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Instructions

Load the dataset into a data frame using Python.
Clean the data as needed.
Plot a line chart to show the average temperature fluctuations in Tunisia and Cameroon. Interpret the results.
Zoom in to only include data between 1980 and 2005, try to customize the axes labels.
Create Histograms to show temperature distribution in Senegal between [1980,2000] and [2000,2023] (in the same figure). Describe the obtained results.
Select the best chart to show the Average temperature per country.
Make your own questions about the dataset and try to answer them using the appropriate visuals.


How We'll Evaluate

Your checkpoint will be evaluated based on these criteria, each rated from 0 to 5.

What We're Looking For

Successfully install and import Plotly package

Successfully create visuals

Successfully customize visuals (labels, legends ...)
"""

In [43]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


**LOAD DATA SET**

In [44]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Africa_climate_change.csv')
df.head(10)

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY
0,19800101 000000,,54.0,61.0,43.0,Tunisia
1,19800101 000000,,49.0,55.0,41.0,Tunisia
2,19800101 000000,0.0,72.0,86.0,59.0,Cameroon
3,19800101 000000,,50.0,55.0,43.0,Tunisia
4,19800101 000000,,75.0,91.0,,Cameroon
5,19800101 000000,,52.0,61.0,45.0,Tunisia
6,19800101 000000,0.0,76.0,97.0,59.0,Senegal
7,19800101 000000,0.0,74.0,95.0,59.0,Senegal
8,19800101 000000,0.0,78.0,93.0,63.0,Senegal
9,19800101 000000,0.0,76.0,91.0,59.0,Senegal


**CLEAN DATA**

In [45]:
# Parse the DATE column to extract the year
df['DATE'] = pd.to_datetime(df['DATE'], format='%Y%m%d %H%M%S')
df['YEAR'] = df['DATE'].dt.year

df

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
0,1980-01-01,,54.0,61.0,43.0,Tunisia,1980
1,1980-01-01,,49.0,55.0,41.0,Tunisia,1980
2,1980-01-01,0.0,72.0,86.0,59.0,Cameroon,1980
3,1980-01-01,,50.0,55.0,43.0,Tunisia,1980
4,1980-01-01,,75.0,91.0,,Cameroon,1980
...,...,...,...,...,...,...,...
464810,2023-08-23,,85.0,89.0,,Senegal,2023
464811,2023-08-23,,80.0,90.0,72.0,Egypt,2023
464812,2023-08-23,,87.0,,73.0,Egypt,2023
464813,2023-08-23,0.0,82.0,94.0,,Senegal,2023


In [6]:
df.isnull()

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
0,False,True,False,False,False,False,False
1,False,True,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False
4,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...
464810,False,True,False,False,True,False,False
464811,False,True,False,False,False,False,False
464812,False,True,False,True,False,False,False
464813,False,False,False,False,True,False,False


In [46]:
#Checking for Missing values
df.isnull().sum()

DATE            0
PRCP       287240
TAVG         6376
TMAX       100914
TMIN       132058
COUNTRY         0
YEAR            0
dtype: int64

In [47]:
# Dropping rows with missing values for simplicity
df = df.dropna()
df

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
2,1980-01-01,0.00,72.0,86.0,59.0,Cameroon,1980
6,1980-01-01,0.00,76.0,97.0,59.0,Senegal,1980
7,1980-01-01,0.00,74.0,95.0,59.0,Senegal,1980
8,1980-01-01,0.00,78.0,93.0,63.0,Senegal,1980
9,1980-01-01,0.00,76.0,91.0,59.0,Senegal,1980
...,...,...,...,...,...,...,...
464778,2023-08-22,0.00,85.0,93.0,81.0,Senegal,2023
464786,2023-08-23,0.00,87.0,101.0,71.0,Tunisia,2023
464799,2023-08-23,0.00,90.0,102.0,80.0,Tunisia,2023
464803,2023-08-23,1.22,83.0,90.0,76.0,Senegal,2023


In [48]:
# Check the data types
df.dtypes

DATE       datetime64[ns]
PRCP              float64
TAVG              float64
TMAX              float64
TMIN              float64
COUNTRY            object
YEAR                int32
dtype: object

**PLOT LINE CHART  TO SHOW THE AVERAGE TEMPERATURE FLUCTUATION IN TUNISIA AND CAMEROON**

In [49]:
# Filter data for Tunisia and Cameroon
tunisia_cameroon_df = df[df['COUNTRY'].isin(['Tunisia', 'Cameroon'])]

tunisia_cameroon_df

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
2,1980-01-01,0.0,72.0,86.0,59.0,Cameroon,1980
31,1980-01-02,0.0,75.0,91.0,61.0,Cameroon,1980
41,1980-01-02,0.0,81.0,90.0,75.0,Cameroon,1980
43,1980-01-02,0.0,76.0,95.0,59.0,Cameroon,1980
66,1980-01-03,0.0,80.0,91.0,73.0,Cameroon,1980
...,...,...,...,...,...,...,...
464752,2023-08-21,0.0,86.0,98.0,74.0,Tunisia,2023
464775,2023-08-22,0.0,88.0,99.0,79.0,Tunisia,2023
464786,2023-08-23,0.0,87.0,101.0,71.0,Tunisia,2023
464799,2023-08-23,0.0,90.0,102.0,80.0,Tunisia,2023


In [50]:
# Calculate the average temperature per year for both countries
avg_temp_tunisia_cameroon = tunisia_cameroon_df.groupby(['YEAR', 'COUNTRY'])['TAVG'].mean().reset_index()

avg_temp_tunisia_cameroon

Unnamed: 0,YEAR,COUNTRY,TAVG
0,1980,Cameroon,77.912141
1,1981,Cameroon,78.091408
2,1982,Cameroon,76.366864
3,1982,Tunisia,68.802867
4,1983,Cameroon,76.247748
...,...,...,...
81,2021,Tunisia,72.301397
82,2022,Cameroon,73.734375
83,2022,Tunisia,71.764624
84,2023,Cameroon,78.178571


In [51]:
# Plot the line chart
fig = px.line(avg_temp_tunisia_cameroon, x='YEAR', y='TAVG', color='COUNTRY', title='Average Temperature Fluctuations in Tunisia and Cameroon')
fig.update_layout(xaxis_title='Year', yaxis_title='Average Temperature (°C)')
fig.show()

In [None]:
# descring the line chart

"""
In this line chart, Cameroon is represented by the blue line and Tunisia is represented with the red line.
Cameroon temperature has higher in the year 2004 and lower in the year 2022
Tunisia temperature has higher in the year 2021 and lower in the year 2001
"""

**ZOOM IN TO ONLY INCLUDE DATA BETWEEN 1980 AND 2005**

In [53]:
# Filter data between 1980 and 2005
zoomed_data = avg_temp_tunisia_cameroon[(avg_temp_tunisia_cameroon['YEAR'] >= 1980) & (avg_temp_tunisia_cameroon['YEAR'] <= 2005)]

# Plot the zoomed-in line chart
fig = px.line(zoomed_data, x='YEAR', y='TAVG', color='COUNTRY', title='Average Temperature Fluctuations in Tunisia and Cameroon (1980-2005)')
fig.update_layout(xaxis_title='Year (1980-2005)', yaxis_title='Average Temperature (°C)')
fig.show()


**Histograms to show temperature distribution in Senegal between [1980,2000] and [2000,2023]**

In [54]:
# Filter data for Tunisia and Cameroon
senegal_df = df[df['COUNTRY'].isin(['Senegal'])]

senegal_df

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
6,1980-01-01,0.00,76.0,97.0,59.0,Senegal,1980
7,1980-01-01,0.00,74.0,95.0,59.0,Senegal,1980
8,1980-01-01,0.00,78.0,93.0,63.0,Senegal,1980
9,1980-01-01,0.00,76.0,91.0,59.0,Senegal,1980
13,1980-01-01,0.00,74.0,81.0,66.0,Senegal,1980
...,...,...,...,...,...,...,...
464750,2023-08-21,1.38,83.0,94.0,77.0,Senegal,2023
464751,2023-08-21,0.00,85.0,96.0,78.0,Senegal,2023
464762,2023-08-22,0.00,84.0,89.0,82.0,Senegal,2023
464778,2023-08-22,0.00,85.0,93.0,81.0,Senegal,2023


In [55]:
# Split data into two periods
senegal_1980_2000 = senegal_df[(senegal_df['YEAR'] >= 1980) & (senegal_df['YEAR'] <= 2000)]

senegal_1980_2000

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
6,1980-01-01,0.00,76.0,97.0,59.0,Senegal,1980
7,1980-01-01,0.00,74.0,95.0,59.0,Senegal,1980
8,1980-01-01,0.00,78.0,93.0,63.0,Senegal,1980
9,1980-01-01,0.00,76.0,91.0,59.0,Senegal,1980
13,1980-01-01,0.00,74.0,81.0,66.0,Senegal,1980
...,...,...,...,...,...,...,...
206287,2000-10-26,0.24,81.0,89.0,74.0,Senegal,2000
206288,2000-10-26,0.00,86.0,96.0,70.0,Senegal,2000
206295,2000-10-26,0.00,85.0,96.0,73.0,Senegal,2000
206301,2000-10-27,0.00,83.0,92.0,75.0,Senegal,2000


In [56]:
senegal_2000_2023 = senegal_df[(senegal_df['YEAR'] > 2000) & (senegal_df['YEAR'] <= 2023)]

senegal_2000_2023

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR
211912,2001-05-24,0.03,94.0,105.0,83.0,Senegal,2001
211937,2001-05-25,0.00,95.0,108.0,81.0,Senegal,2001
212252,2001-06-09,1.26,78.0,90.0,71.0,Senegal,2001
212373,2001-06-14,0.02,84.0,97.0,74.0,Senegal,2001
212388,2001-06-14,2.44,86.0,98.0,73.0,Senegal,2001
...,...,...,...,...,...,...,...
464750,2023-08-21,1.38,83.0,94.0,77.0,Senegal,2023
464751,2023-08-21,0.00,85.0,96.0,78.0,Senegal,2023
464762,2023-08-22,0.00,84.0,89.0,82.0,Senegal,2023
464778,2023-08-22,0.00,85.0,93.0,81.0,Senegal,2023


In [74]:
# Create histograms
fig = go.Figure()
fig.add_trace(go.Histogram(x=senegal_1980_2000['TAVG'], name='1980-2000', opacity=0.75, nbinsx=20))
fig.add_trace(go.Histogram(x=senegal_2000_2023['TAVG'], name='2000-2023', opacity=0.75, nbinsx=20))

In [75]:
# Overlay both histograms
fig.update_layout(barmode='overlay', title='Temperature Distribution in Senegal (1980-2000 vs 2000-2023)')
fig.update_traces(opacity=0.75)
fig.update_layout(xaxis_title='Temperature (°C)', yaxis_title='Count')
fig.show()

In [None]:
# Describe the obtained results
"""
- The histogram for the period 1980-2000 is shown in blue color.
- The histogram for the period 2000-2023 is shown in red color.
- The histograms are overlaid to allow for comparison of temperature distributions between the two periods.
"""

**Select the best chart to show the Average temperature per country.**

In [24]:
# Calculate the average temperature per country
avg_temp_per_country = df.groupby('COUNTRY')['TAVG'].mean().reset_index()

avg_temp_per_country

Unnamed: 0,COUNTRY,TAVG
0,Angola,76.25
1,Cameroon,78.109973
2,Egypt,73.018479
3,Senegal,82.983125
4,Tunisia,68.266263


In [None]:
# Plot the bar chart
fig = px.bar(avg_temp_per_country, x='COUNTRY', y='TAVG', title='Average Temperature per Country')
fig.update_layout(xaxis_title='Country', yaxis_title='Average Temperature (°C)')
fig.show()

**Make your own questions about the dataset and try to answer them using the appropriate visuals.**

In [34]:
#Top 5 Countries with Highest Average Temperature

# Sort countries by average temperature
top_5_countries = avg_temp_per_country.sort_values(by='TAVG', ascending=False).head(5)

top_5_countries


Unnamed: 0,COUNTRY,TAVG
3,Senegal,82.983125
1,Cameroon,78.109973
0,Angola,76.25
2,Egypt,73.018479
4,Tunisia,68.266263


In [35]:
# Plot the bar chart
fig = px.bar(top_5_countries, x='COUNTRY', y='TAVG', title='Top 5 Countries with Highest Average Temperature')
fig.update_layout(xaxis_title='Country', yaxis_title='Average Temperature (°C)')
fig.show()