In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Replace this with your actual GitHub RAW URL
url = 'https://media.githubusercontent.com/media/LiamDuero03/DS-Society-Project/refs/heads/main/3-Data-Wrangling/messy_weather_data.csv'
df = pd.read_csv(url)

print("Dataset Loaded Successfully!")
df.head(10)

Dataset Loaded Successfully!


Unnamed: 0,Country,Temperature,Weather_Condition
0,Mexico,15.0,Rainy
1,USA,10.1,Cloudy
2,Mexico,30.4,Rainy
3,Mexico,27.7,
4,Canadâ,28.2,Rainy
5,USA,29.3,sunny
6,Mexico,-999.0,SNOW
7,Canada,19.0,Rainy
8,Mexico,12.9,Snowy
9,mexico,31.6,Sunny


In [10]:
# Check for Nulls
print("--- Missing Values ---")
print(df.isnull().sum())

# Check for Outliers (Look at min and max!)
print("\n--- Numerical Summary ---")
print(df.describe())

# Check for Naming Inconsistencies
print("\n--- Unique Countries ---")
print(df['Country'].unique())

--- Missing Values ---
Country               0
Temperature          10
Weather_Condition    10
dtype: int64

--- Numerical Summary ---
       Temperature
count    90.000000
mean     26.552222
std     158.285726
min    -999.000000
25%      15.750000
50%      22.000000
75%      28.725000
max     999.900000

--- Unique Countries ---
['Mexico' 'USA' 'Canadâ' 'Canada' 'mexico' ' CANADA ' ' USA ' 'MEXICO '
 'uSA']


In [11]:
# Clean Country names
df['Country'] = df['Country'].str.strip().str.upper()
df['Country'] = df['Country'].replace({'CANADÂ': 'CANADA', 'MEXICO': 'MEXICO'}) # Standardizing

# Clean Weather Condition
df['Weather_Condition'] = df['Weather_Condition'].str.strip().str.capitalize()
df['Weather_Condition'] = df['Weather_Condition'].replace({'Snow': 'Snowy'})

print("Fixed Naming Inconsistencies!")

Fixed Naming Inconsistencies!


In [12]:
# We define a 'logical' range for weather on Earth
valid_range = (df['Temperature'] > -60) & (df['Temperature'] < 60)
df = df[valid_range]

print(f"Removed outliers. New Max Temp: {df['Temperature'].max()}")

Removed outliers. New Max Temp: 34.3


In [13]:
# --- STEP 1: Fill Temperature based on Country Median ---
# transform('median') keeps the index the same so it fits back into the dataframe perfectly
df['Temperature'] = df['Temperature'].fillna(
    df.groupby('Country')['Temperature'].transform('median')
)

# --- STEP 2: Remove rows where Weather_Condition is Null ---
# We use the 'subset' parameter to ONLY target rows missing weather data
df_clean = df.dropna(subset=['Weather_Condition']).copy()

print("1. Temperatures filled using Country-specific medians.")
print("2. Rows with missing Weather_Condition have been removed.")

1. Temperatures filled using Country-specific medians.
2. Rows with missing Weather_Condition have been removed.


In [14]:
print("--- Final Cleaned Data Statistics ---")
print(df.describe())
print("\n--- Final Unique Values ---")
print(df['Country'].unique())
print(df['Weather_Condition'].unique())

# Show the first few rows of the beautiful, clean data
df.head()

--- Final Cleaned Data Statistics ---
       Temperature
count    87.000000
mean     21.710345
std       7.220256
min      10.100000
25%      15.800000
50%      21.800000
75%      28.200000
max      34.300000

--- Final Unique Values ---
['MEXICO' 'USA' 'CANADA']
['Rainy' 'Cloudy' nan 'Sunny' 'Snowy']


Unnamed: 0,Country,Temperature,Weather_Condition
0,MEXICO,15.0,Rainy
1,USA,10.1,Cloudy
2,MEXICO,30.4,Rainy
3,MEXICO,27.7,
4,CANADA,28.2,Rainy
