In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 1. Data Ingestion

In [2]:
df =pd.read_csv('weather_data.csv')
df

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition
0,2023-01-01,New York,5.0,60.0,10.0,Sunny
1,01/02/2023,New York,,65.0,12.0,Cloudy
2,03-01-2023,New York,7.0,,8.0,Rainy
3,,London,8.0,70.0,15.0,Unknown
4,2023-01-02,London,6.0,75.0,20.0,Snowy
...,...,...,...,...,...,...
95,01-01-2023,London,,,,Rainy
96,09-01-2023,London,,,,Rainy
97,2023-01-11,Tokyo,,,,Sunny
98,15/01/2023,New York,,41.0,24.3,


In [3]:
df.isnull().sum()

date                   20
city                    0
temperature_celsius    58
humidity_percent       53
wind_speed_kph         45
weather_condition      16
dtype: int64

# 2.Data Cleaning and Trnsformation

In [19]:
from sklearn.impute import SimpleImputer

In [20]:
numerical_cols = df.select_dtypes(include = ['number']).columns
print('Numerical columns:', numerical_cols.tolist())

Numerical columns: ['temperature_celsius', 'humidity_percent', 'wind_speed_kph', 'temperature_fahrenheit']


In [21]:
catagorical_cols = df.select_dtypes(include=['object']).columns
print('Catagorical Columns:', catagorical_cols.tolist())

Catagorical Columns: ['date', 'city', 'weather_condition']


In [22]:
imputer_1 = SimpleImputer(strategy ='mean')
df[numerical_cols] = imputer_1.fit_transform(df[numerical_cols])
imputer_2 = SimpleImputer(strategy = 'most_frequent')
df[catagorical_cols] = imputer_2.fit_transform(df[catagorical_cols])
df

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition,temperature_fahrenheit
0,01-01-2023,New York,5.0,60.000000,10.000000,sunny,41.00
1,01-02-2023,New York,8.4,65.000000,12.000000,cloudy,47.12
2,03-01-2023,New York,7.0,58.978723,8.000000,rainy,44.60
3,14-01-2023,London,8.0,70.000000,15.000000,unknown,46.40
4,02-01-2023,London,6.0,75.000000,20.000000,snowy,42.80
...,...,...,...,...,...,...,...
95,01-01-2023,London,8.4,58.978723,14.352727,rainy,47.12
96,09-01-2023,London,8.4,58.978723,14.352727,rainy,47.12
97,11-01-2023,Tokyo,8.4,58.978723,14.352727,sunny,47.12
98,15-01-2023,New York,8.4,41.000000,24.300000,unknown,47.12


In [23]:
df['temperature_fahrenheit'] = df['temperature_celsius'] * 9 / 5 + 32
df

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition,temperature_fahrenheit
0,01-01-2023,New York,5.0,60.000000,10.000000,sunny,41.00
1,01-02-2023,New York,8.4,65.000000,12.000000,cloudy,47.12
2,03-01-2023,New York,7.0,58.978723,8.000000,rainy,44.60
3,14-01-2023,London,8.0,70.000000,15.000000,unknown,46.40
4,02-01-2023,London,6.0,75.000000,20.000000,snowy,42.80
...,...,...,...,...,...,...,...
95,01-01-2023,London,8.4,58.978723,14.352727,rainy,47.12
96,09-01-2023,London,8.4,58.978723,14.352727,rainy,47.12
97,11-01-2023,Tokyo,8.4,58.978723,14.352727,sunny,47.12
98,15-01-2023,New York,8.4,41.000000,24.300000,unknown,47.12


In [28]:
df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')
df['date'] = df['date'].dt.strftime('%y-%m-%d')
df

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition,temperature_fahrenheit
0,23-01-01,New York,5.0,60.000000,10.000000,sunny,41.00
1,23-02-01,New York,8.4,65.000000,12.000000,cloudy,47.12
2,23-01-03,New York,7.0,58.978723,8.000000,rainy,44.60
3,23-01-14,London,8.0,70.000000,15.000000,unknown,46.40
4,23-01-02,London,6.0,75.000000,20.000000,snowy,42.80
...,...,...,...,...,...,...,...
95,23-01-01,London,8.4,58.978723,14.352727,rainy,47.12
96,23-01-09,London,8.4,58.978723,14.352727,rainy,47.12
97,23-01-11,Tokyo,8.4,58.978723,14.352727,sunny,47.12
98,23-01-15,New York,8.4,41.000000,24.300000,unknown,47.12


In [33]:
df = df[(df['weather_condition'].str.lower() != 'unknown')]

df

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition,temperature_fahrenheit
0,23-01-01,New York,5.0,60.0,10.0,sunny,41.0
1,23-02-01,New York,8.4,65.0,12.0,cloudy,47.12
2,23-01-03,New York,7.0,58.978723,8.0,rainy,44.6
4,23-01-02,London,6.0,75.0,20.0,snowy,42.8
5,23-03-01,London,8.4,80.0,18.0,cloudy,47.12
6,23-01-01,Tokyo,10.0,50.0,5.0,sunny,50.0
7,23-02-01,Tokyo,12.0,55.0,6.0,cloudy,53.6
9,23-01-25,New York,8.4,51.0,20.3,sunny,47.12
12,23-07-01,New York,8.4,58.978723,17.9,snowy,47.12
14,23-01-15,London,8.4,56.0,14.352727,rainy,47.12


In [32]:
os.makedirs('outputs', exist_ok=True)
df.to_csv('outputs/transformed_weather_data.csv', index=False)

In [38]:
df.loc[:, 'temperature_celsius'] = pd.to_numeric(df['temperature_celsius'],  errors='coerce')
top5_highest = df.nlargest(5, 'temperature_celsius')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'temperature_celsius'] = pd.to_numeric(df['temperature_celsius'],  errors='coerce')


In [39]:
top5_highest

Unnamed: 0,date,city,temperature_celsius,humidity_percent,wind_speed_kph,weather_condition,temperature_fahrenheit
21,23-01-17,Tokyo,19.3,43.0,14.352727,rainy,66.74
64,23-01-27,London,18.1,59.0,1.0,snowy,64.58
55,23-01-22,New York,13.4,58.978723,14.352727,snowy,56.12
34,23-01-25,New York,12.9,58.978723,5.7,sunny,55.22
70,23-01-15,New York,12.2,58.978723,16.5,rainy,53.96


In [41]:
with open('outputs/top5_highest_cities.txt', 'w') as file:
    file.write(top5_highest.to_string(index=False))
    