<a href="https://colab.research.google.com/github/Melatwolde/Weather-Data-Processing-Pipeline/blob/main/wather_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Weather Data Processing Pipeline**

1.Data Ingestion

In [2]:
import pandas as pd
df = pd.read_csv('sample_data/weather_data.csv')
print(df.head())

         date      city  temperature_celsius  humidity_percent  \
0  2023-01-01  New York                  5.0              60.0   
1  01/02/2023  New York                  NaN              65.0   
2  03-01-2023  New York                  7.0               NaN   
3         NaN    London                  8.0              70.0   
4  2023-01-02    London                  6.0              75.0   

   wind_speed_kph weather_condition  
0            10.0             Sunny  
1            12.0            Cloudy  
2             8.0             Rainy  
3            15.0           Unknown  
4            20.0             Snowy  


2.Data Cleaning and Transformation

In [4]:
df = df.dropna(subset=['date'])
df['temperature_celsius'] = df['temperature_celsius'].fillna(df.groupby('city')['temperature_celsius'].transform('mean'))
df.head()


Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,01/02/2023,New York,7.886667,65.0,12.0,Cloudy
2,03-01-2023,New York,7.0,,8.0,Rainy
4,2023-01-02,London,6.0,75.0,20.0,Snowy
5,01/03/2023,London,10.185714,80.0,18.0,Cloudy


In [5]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df.head()

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,,New York,7.886667,65.0,12.0,Cloudy
2,,New York,7.0,,8.0,Rainy
4,2023-01-02,London,6.0,75.0,20.0,Snowy
5,,London,10.185714,80.0,18.0,Cloudy


In [6]:
df['temperature_fahrenheit'] = df['temperature_celsius'] * 9/5 + 32
df = df[df['weather_condition'].notna()]
df = df[df['weather_condition'] != "Unknown"]
df.head()

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition,temperature_fahrenheit
0,2023-01-01,New York,5.0,60.0,10.0,Sunny,41.0
1,,New York,7.886667,65.0,12.0,Cloudy,46.196
2,,New York,7.0,,8.0,Rainy,44.6
4,2023-01-02,London,6.0,75.0,20.0,Snowy,42.8
5,,London,10.185714,80.0,18.0,Cloudy,50.334286


In [8]:
import csv

def filter(filepath):
    filtered = []
    with open(filepath, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for i in reader:
            condition = i.get('weather_condition')
            if condition and condition != "Unknown":
                filtered.append(i)
    return filtered[:5]
ans = filter("sample_data/weather_data.csv")
for i in ans:
    print(i)


{'date': '2023-01-01', 'city': 'New York', 'temperature_celsius': '5.0', 'humidity_percent': '60.0', 'wind_speed_kph': '10.0', 'weather_condition': 'Sunny'}
{'date': '01/02/2023', 'city': 'New York', 'temperature_celsius': '', 'humidity_percent': '65.0', 'wind_speed_kph': '12.0', 'weather_condition': 'Cloudy'}
{'date': '03-01-2023', 'city': 'New York', 'temperature_celsius': '7.0', 'humidity_percent': '', 'wind_speed_kph': '8.0', 'weather_condition': 'Rainy'}
{'date': '2023-01-02', 'city': 'London', 'temperature_celsius': '6.0', 'humidity_percent': '75.0', 'wind_speed_kph': '20.0', 'weather_condition': 'Snowy'}
{'date': '01/03/2023', 'city': 'London', 'temperature_celsius': '', 'humidity_percent': '80.0', 'wind_speed_kph': '18.0', 'weather_condition': 'Cloudy'}


3. Data Output

In [10]:
import pandas as pd
import numpy as np
import os


df = pd.read_csv("sample_data/weather_data.csv")


df = df.dropna(subset=['date'])

df['temperature_celsius'] = df['temperature_celsius'].fillna(
    df.groupby('city')['temperature_celsius'].transform('mean')
)


df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])
df['date'] = df['date'].dt.strftime('%Y-%m-%d')


df['temperature_fahrenheit'] = df['temperature_celsius'] * 9/5 + 32
df = df[df['weather_condition'].notna()]
df = df[df['weather_condition'] != "Unknown"]

os.makedirs("outputs", exist_ok=True)
df.to_csv("outputs/tranformed_weather_data.csv", index=False)

print(df.head(5).to_dict(orient='records'))


[{'date': '2023-01-01', 'city': 'New York', 'temperature_celsius': 5.0, 'humidity_percent': 60.0, 'wind_speed_kph': 10.0, 'weather_condition': 'Sunny', 'temperature_fahrenheit': 41.0}, {'date': '2023-01-02', 'city': 'London', 'temperature_celsius': 6.0, 'humidity_percent': 75.0, 'wind_speed_kph': 20.0, 'weather_condition': 'Snowy', 'temperature_fahrenheit': 42.8}, {'date': '2023-01-01', 'city': 'Tokyo', 'temperature_celsius': 10.0, 'humidity_percent': 50.0, 'wind_speed_kph': 5.0, 'weather_condition': 'Sunny', 'temperature_fahrenheit': 50.0}, {'date': '2023-01-12', 'city': 'Tokyo', 'temperature_celsius': 6.914285714285714, 'humidity_percent': 61.0, 'wind_speed_kph': nan, 'weather_condition': 'RAINY', 'temperature_fahrenheit': 44.44571428571429}, {'date': '2023-01-05', 'city': 'London', 'temperature_celsius': 10.185714285714285, 'humidity_percent': 60.0, 'wind_speed_kph': nan, 'weather_condition': 'Rainy', 'temperature_fahrenheit': 50.33428571428571}]


In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

os.makedirs("outputs", exist_ok=True)
avg_temp = df.groupby('city')['temperature_celsius'].mean().sort_values(ascending=False)


sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.barplot(x=avg_temp.values, y=avg_temp.index, color='skyblue')

plt.xlabel('Average Temperature (°C)')
plt.ylabel('City')
plt.title('Average Temperature per City')
plt.tight_layout()


plt.savefig("outputs/average_temperature_per_city.png")
plt.close()
