In [165]:
import pandas as pd
import numpy as np
import time
import os

In [166]:
# Raw data as read from disk; kept unmodified so it can be compared with the
# cleaned version later on.
weather_df = pd.read_csv('weather_data.csv')

# Work on an independent copy. Plain assignment would only bind a second name
# to the same DataFrame, so every cleaning step applied to cleaned_weather_df
# would also rewrite weather_df.
cleaned_weather_df = weather_df.copy()

# Preview the first rows (weather_df.shape and weather_df.dtypes give the
# dimensions and column types if needed)
weather_df.head()


Unnamed: 0,date,high,low,precipitation,condition,real feel,real feel shade,max uv index,wind
0,7/24,21°,/19°,100%,"Cloudy with showers and thunderstorms, some he...",24°,21°,4 Moderate,WSW 11 km/h
1,7/25,21°,/19°,100%,Cloudy with showers and thunderstorms,24°,21°,4 Moderate,WSW 11 km/h
2,7/26,23°,/19°,96%,A thick cloud cover with a couple of showers a...,28°,24°,5 Moderate,SW 9 km/h
3,7/27,23°,/19°,97%,Considerable cloudiness and humid with a coupl...,28°,24°,4 Moderate,WSW 7 km/h
4,7/28,20°,/19°,95%,"Thunderstorms, some can be heavy late; cloudy",24°,22°,4 Moderate,SW 6 km/h


The data needs to be cleaned: the values mix numbers with units and symbols (e.g. `21°`, `100%`, `WSW 11 km/h`), so pandas reads every column as `object`.

In [167]:
# Count the missing values in each column
weather_df.isna().sum()

date               0
high               0
low                0
precipitation      0
condition          0
real feel          0
real feel shade    0
max uv index       0
wind               0
dtype: int64

### Convert each column to its proper format and remove unnecessary characters

In [168]:
# Convert appropriate columns to numeric.
# The patterns are raw strings so that `\d` reaches the regex engine as-is
# instead of being treated as an (invalid) string escape, which newer Python
# versions warn about.
SIGNED_INT_PATTERN = r'(-?\d+)'   # keeps a leading minus, e.g. '-5°' -> -5
UNSIGNED_INT_PATTERN = r'(\d+)'   # first run of digits, e.g. 'WSW 11 km/h' -> 11

# Temperatures can drop below zero, so only these columns use the signed pattern.
temperature_cols = ['high', 'low', 'real feel', 'real feel shade']
unsigned_cols = ['max uv index', 'wind']
to_num_cols = temperature_cols + unsigned_cols

for col in to_num_cols:
    pattern = SIGNED_INT_PATTERN if col in temperature_cols else UNSIGNED_INT_PATTERN
    cleaned_weather_df[col] = pd.to_numeric(
        cleaned_weather_df[col].str.extract(pattern, expand=False)
    )

cleaned_weather_df['condition'] = cleaned_weather_df['condition'].str.lower()

# Percent string -> fraction: '96%' -> 0.96. Rounding removes the float noise
# left by the multiplication.
precipitation_pct = pd.to_numeric(
    cleaned_weather_df['precipitation'].str.extract(UNSIGNED_INT_PATTERN, expand=False)
)
cleaned_weather_df['precipitation'] = (precipitation_pct * 0.01).round(2)

# Check the data types again
print(cleaned_weather_df.dtypes)
cleaned_weather_df.head()

date                object
high                 int64
low                  int64
precipitation      float64
condition           object
real feel            int64
real feel shade      int64
max uv index         int64
wind                 int64
dtype: object


Unnamed: 0,date,high,low,precipitation,condition,real feel,real feel shade,max uv index,wind
0,7/24,21,19,1.0,"cloudy with showers and thunderstorms, some he...",24,21,4,11
1,7/25,21,19,1.0,cloudy with showers and thunderstorms,24,21,4,11
2,7/26,23,19,0.96,a thick cloud cover with a couple of showers a...,28,24,5,9
3,7/27,23,19,0.97,considerable cloudiness and humid with a coupl...,28,24,4,7
4,7/28,20,19,0.95,"thunderstorms, some can be heavy late; cloudy",24,22,4,6


### Clean Date format

In [169]:
# Convert the 'month/day' strings to datetimes.
# The file stores no year, so the year in which the notebook is run is assumed.
# NOTE(review): every row gets the same year, so a forecast that crosses New
# Year would date its January rows a year too early - confirm the data never
# spans a year boundary.
year = time.strftime('%Y')

# Put the year into the string *before* parsing. Parsing '%m/%d' on its own
# falls back to the year 1900, which rejects '2/29' even when the target year
# is a leap year, and then needs a slow row-by-row replace(year=...) afterwards.
cleaned_weather_df['date'] = pd.to_datetime(
    year + '/' + cleaned_weather_df['date'], format='%Y/%m/%d'
)

cleaned_weather_df.head()

Unnamed: 0,date,high,low,precipitation,condition,real feel,real feel shade,max uv index,wind
0,2024-07-24,21,19,1.0,"cloudy with showers and thunderstorms, some he...",24,21,4,11
1,2024-07-25,21,19,1.0,cloudy with showers and thunderstorms,24,21,4,11
2,2024-07-26,23,19,0.96,a thick cloud cover with a couple of showers a...,28,24,5,9
3,2024-07-27,23,19,0.97,considerable cloudiness and humid with a coupl...,28,24,4,7
4,2024-07-28,20,19,0.95,"thunderstorms, some can be heavy late; cloudy",24,22,4,6


In [170]:
# Export cleaned df to CSV; the free-text 'condition' column is left out.
# Reassigning (rather than inplace=True) together with errors='ignore' lets
# this cell be re-run without a KeyError once the column is already gone.
cleaned_weather_df = cleaned_weather_df.drop(columns='condition', errors='ignore')
cleaned_weather_df.to_csv('weather_data_clean.csv', index=False)