<a href="https://colab.research.google.com/github/Madihasafi/my-colab-assignment/blob/main/Tell_a_Story_with_Global_Temperature_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: upgrade Plotly through a shell escape, then import the
# libraries this notebook relies on.
# NOTE(review): the install is not version-pinned; in the run recorded below it
# replaced plotly 5.24.1 with 6.5.2, so a later re-run may pull a different release.
!pip install plotly --upgrade

import pandas as pd
import plotly.express as px
# NOTE(review): `go`, `make_subplots` and `np` are not referenced by any of the
# cells visible in this notebook - confirm whether they are still needed.
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Simple confirmation that the cell ran to completion.
print("✅ Libraries imported successfully!")

Collecting plotly
  Downloading plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Downloading plotly-6.5.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.24.1
    Uninstalling plotly-5.24.1:
      Successfully uninstalled plotly-5.24.1
Successfully installed plotly-6.5.2
✅ Libraries imported successfully!


In [4]:
# Location of the GISTEMP v4 global-mean table (CSV) on the NASA GISS server.
url = "https://data.giss.nasa.gov/gistemp/tabledata_v4/GLB.Ts+dSST.csv"

# First attempt at loading: pandas' default parsing options, no extra arguments.
# NOTE: the file begins with a title line, so this call yields a single-column
# frame (see the shape reported below); the table is re-read with the title
# line skipped in a later cell.
df = pd.read_csv(url)
row_count, col_count = df.shape
print("✅ Dataset loaded successfully!")
print(f"Shape: {row_count} rows, {col_count} columns")

✅ Dataset loaded successfully!
Shape: 148 rows, 1 columns


In [5]:
# Preview the first few rows as a quick sanity check on how the CSV was parsed
# (column names, index and sample values).
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Land-Ocean: Global Means
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
1880,-.19,-.25,-.10,-.17,-.11,-.22,-.19,-.11,-.15,-.24,-.23,-.18,-.18,***,***,-.13,-.17,-.21
1881,-.21,-.15,.02,.04,.05,-.20,-.01,-.04,-.16,-.22,-.19,-.08,-.10,-.10,-.18,.04,-.08,-.19
1882,.15,.13,.04,-.18,-.15,-.24,-.17,-.08,-.15,-.24,-.17,-.37,-.12,-.09,.07,-.10,-.16,-.19
1883,-.30,-.37,-.13,-.19,-.18,-.07,-.08,-.14,-.22,-.11,-.25,-.12,-.18,-.20,-.35,-.17,-.10,-.19


In [6]:
# Print the index type, per-column dtypes and non-null counts; this is where a
# parsing problem (e.g. everything read as `object`) becomes visible.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 148 entries, ('Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'J-D', 'D-N', 'DJF', 'MAM', 'JJA') to ('2026', '1.08', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***', '***')
Data columns (total 1 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Land-Ocean: Global Means  148 non-null    object
dtypes: object(1)
memory usage: 56.5+ KB


In [7]:
# Summary statistics. For a non-numeric (object) column, as shown in the output
# below, pandas reports count / unique / top / freq rather than mean / std / quantiles.
df.describe()

Unnamed: 0,Land-Ocean: Global Means
count,148.0
unique,88.0
top,-0.07
freq,5.0


In [12]:
# Re-load the NASA GISTEMP table with the correct parsing options and reshape it
# into a tidy, chronologically ordered frame with one row per (year, month).
#
# Why a re-load is needed: the earlier `pd.read_csv(url)` call treated the file's
# first line (the title "Land-Ocean: Global Means") as the header. The real header
# row ("Year,Jan,...,SON") and every data row were therefore pushed into a
# MultiIndex, leaving a single mis-named column. Skipping the title line lets
# pandas pick up the proper column names.
#
# Result: `df` with columns Year, MonthName (full month name), Mean (anomaly, °C),
# Month (1-12) and Date (first day of the month), sorted by Date.
import io
import requests

# Step 1: Fetch the raw CSV text.
# A timeout prevents the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being silently parsed as if it were data.
url = "https://data.giss.nasa.gov/gistemp/tabledata_v4/GLB.Ts+dSST.csv"
response = requests.get(url, timeout=30)
response.raise_for_status()
csv_content = response.text

# Skip the title line so that the second line of the file becomes the header.
df_wide = pd.read_csv(io.StringIO(csv_content), skiprows=1, header=0)

# Step 2: Clean the wide (one row per year) table.
# Any row whose 'Year' is not a number (e.g. trailing notes) is discarded.
df_wide['Year'] = pd.to_numeric(df_wide['Year'], errors='coerce')
df_wide = df_wide.dropna(subset=['Year']).astype({'Year': int})

# Monthly anomaly columns; the file marks missing values with '***', which
# errors='coerce' turns into NaN.
month_columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
df_wide[month_columns] = df_wide[month_columns].apply(pd.to_numeric, errors='coerce')

# Step 3: Melt the monthly columns into rows, keeping 'Year' as the identifier.
# The seasonal/annual aggregate columns (J-D, D-N, DJF, ...) are dropped here.
df_melted = df_wide.melt(
    id_vars=['Year'],
    value_vars=month_columns,
    var_name='MonthName',
    value_name='Mean',
)

# Step 4: Add the numeric month and a proper datetime (first day of each month).
month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}
df_melted['Month'] = df_melted['MonthName'].map(month_mapping)
df_melted['Date'] = pd.to_datetime(
    pd.DataFrame({'year': df_melted['Year'], 'month': df_melted['Month'], 'day': 1})
)

# Step 5: Report the gaps, then build the final frame.
print("Missing values after melting and date creation:\n", df_melted.isnull().sum())

# melt() returns the rows month by month (every January, then every February, ...).
# Sorting by 'Date' puts them in chronological order, which is what a time-series
# line chart needs because it joins points in row order.
df = (
    df_melted
    .dropna(subset=['Mean'])  # months without a published anomaly
    .sort_values('Date')
    .reset_index(drop=True)
)

# The charts below order their month axis by full month names ('January', ...),
# so replace the three-letter abbreviations inherited from the CSV header.
df['MonthName'] = df['Date'].dt.month_name()

print(f"Data now covers {df['Year'].min()} to {df['Year'].max()}")

Missing values after melting and date creation:
 Year          0
MonthName     0
Mean         11
Month         0
Date          0
dtype: int64
Data now covers 1880 to 2026


In [14]:
# Line chart of every monthly anomaly, using Plotly Express.
# px.line joins the points in the order the rows appear in the data frame, so the
# frame is sorted by 'Date' first; with rows out of chronological order the trace
# would jump back and forth across the whole time axis.
fig1 = px.line(
    df.sort_values('Date'),
    x='Date',
    y='Mean',
    title='Global Temperature Anomaly (1880–Present)',
    labels={'Mean': 'Temperature Anomaly (°C)', 'Date': 'Year'},
    template='plotly_white'
)

# Customize line color and add small markers so individual months can be hovered
fig1.update_traces(line=dict(color='crimson', width=2), mode='lines+markers', marker=dict(size=2))

# Add a horizontal line at zero (the baseline) for reference
fig1.add_hline(y=0, line_dash="dash", line_color="gray", opacity=0.5)

# Improve layout: unified hover label and a range slider for zooming in time
fig1.update_layout(
    hovermode='x unified',
    xaxis=dict(rangeslider=dict(visible=True), type='date'),
    yaxis=dict(title='Temperature Anomaly (°C)'),
    title_font_size=20
)

fig1.show()

In [15]:
# Bar chart of the monthly anomalies for the most recent *complete* year.
#
# Count the distinct months that actually have a value in each year and keep the
# latest year with all 12. Stepping back a single year from the newest one is not
# enough: nothing guarantees that the previous year is complete either.
months_per_year = df.dropna(subset=['Mean']).groupby('Year')['Month'].nunique()
complete_years = months_per_year[months_per_year == 12].index
if complete_years.empty:
    raise ValueError("No year with all 12 months of data is available")

latest_year = int(complete_years.max())
# Sort by month so the rows are in calendar order regardless of how df is ordered
df_latest = df[df['Year'] == latest_year].sort_values('Month')

print(f"Using data from {latest_year}")

# Create a bar chart
fig2 = px.bar(
    df_latest,
    x='MonthName',
    y='Mean',
    title=f'Monthly Temperature Anomaly in {latest_year}',
    labels={'Mean': 'Temperature Anomaly (°C)', 'MonthName': 'Month'},
    color='Mean',
    color_continuous_scale='RdYlBu_r',  # red for warmer, blue for cooler
    template='plotly_white'
)

# Customize layout: force calendar order on the x axis (January ... December).
# The month names are generated from a date range; the year used is irrelevant.
fig2.update_layout(
    xaxis=dict(
        categoryorder='array',
        categoryarray=pd.date_range('2020-01-01', periods=12, freq='MS').strftime('%B').tolist(),
    ),
    yaxis=dict(title='Temperature Anomaly (°C)'),
    title_font_size=20
)

fig2.show()

Using data from 2025


In [16]:
# Distribution of the anomalies for each calendar month, one box per month.
box_labels = {'Mean': 'Temperature Anomaly (°C)', 'MonthName': 'Month'}
fig3 = px.box(
    df,
    x='MonthName',
    y='Mean',
    color='MonthName',
    labels=box_labels,
    title='Distribution of Global Temperature Anomalies by Month (1880–Present)',
    template='plotly_white',
)

# Put the boxes in calendar order (January ... December) instead of data order.
# The names come from a 12-month date range; the year used is irrelevant.
calendar_months = pd.date_range('2020-01-01', periods=12, freq='MS').strftime('%B')
month_axis = dict(categoryorder='array', categoryarray=calendar_months)

# The legend would only repeat the x-axis labels, so it is hidden.
fig3.update_layout(xaxis=month_axis, showlegend=False, title_font_size=20)

fig3.show()

In [17]:
# Grouped bar chart: average anomaly per calendar month, one bar per decade.

# Create a 'Decade' column (e.g. 1987 -> 1980)
df['Decade'] = (df['Year'] // 10) * 10

# Calculate average anomaly per decade and month
decade_monthly = df.groupby(['Decade', 'MonthName'])['Mean'].mean().reset_index()

# Keep only decades from 1960 onwards for clarity
decade_monthly = decade_monthly[decade_monthly['Decade'] >= 1960].copy()

# Plotly Express treats a numeric color column as continuous and puts every bar in
# a single trace, so barmode='group' has nothing to group. A string label makes
# each decade its own trace, which yields side-by-side bars and a discrete legend.
decade_monthly['DecadeLabel'] = decade_monthly['Decade'].astype(str) + 's'

# Create a grouped bar chart (rows are already sorted by decade from the groupby,
# so the legend and the palette run from the oldest to the newest decade)
fig4 = px.bar(
    decade_monthly,
    x='MonthName',
    y='Mean',
    color='DecadeLabel',
    barmode='group',
    title='Average Temperature Anomaly by Month and Decade (1960s–2020s)',
    labels={'Mean': 'Temperature Anomaly (°C)', 'MonthName': 'Month', 'DecadeLabel': 'Decade'},
    color_discrete_sequence=px.colors.sequential.Viridis,
    template='plotly_white'
)

# Force calendar order on the x axis (January ... December); the groupby above
# sorts month names alphabetically. The year in the date range is irrelevant.
fig4.update_layout(
    xaxis=dict(
        categoryorder='array',
        categoryarray=pd.date_range('2020-01-01', periods=12, freq='MS').strftime('%B').tolist(),
    ),
    title_font_size=20
)

fig4.show()

Summary of Findings

Overall warming: The global temperature anomaly has increased by about 1.2°C since 1880, with the steepest rise after 1970.

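Seasonal patterns: All months are now above the baseline. Because the charts show anomalies (departures from a baseline average) rather than absolute temperatures, the seasonal cycle itself does not appear in them; in absolute terms it remains – for example, northern hemisphere summers are still the warmest months.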

Variability: Temperature variability has increased in recent decades, leading to more frequent extreme events.

Decadal trends: Every decade since the 1960s has been warmer than the previous one across all months, confirming the relentless pace of global warming.

These interactive visualizations allow us to explore the data in depth and communicate the urgency of climate change.