In [21]:
import pandas as pd

In [22]:
# Load the raw hourly weather export for each city.
# File order must match the order of the names being unpacked below.
raw_files = ('Amaravati_raw.csv', 'Amritsar_raw.csv', 'Jaipur_raw.csv')
amarvati_raw, amritsar_raw, jaipur_raw = map(pd.read_csv, raw_files)

In [23]:
# List the column names of the raw Amritsar frame
amritsar_raw.columns

Index(['Unnamed: 0', 'date', 'temperature_2m', 'relative_humidity_2m',
       'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain',
       'snowfall', 'snow_depth', 'pressure_msl', 'surface_pressure',
       'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m',
       'wind_direction_100m', 'wind_gusts_10m'],
      dtype='object')

In [24]:
# Drop unnecessary columns (the same set is removed from every city)
columns_to_drop = [
    'Unnamed: 0',
    'apparent_temperature',
    'snowfall',
    'snow_depth',
    'pressure_msl',
    'surface_pressure',
    'cloud_cover_low',
    'cloud_cover_mid',
    'cloud_cover_high',
    'wind_direction_10m',
    'wind_direction_100m',
    'wind_gusts_10m',
]
amarvati_raw = amarvati_raw.drop(columns=columns_to_drop)
amritsar_raw = amritsar_raw.drop(columns=columns_to_drop)
jaipur_raw = jaipur_raw.drop(columns=columns_to_drop)

In [25]:
# Preview the first two rows of the Amaravati frame
amarvati_raw.head(2)

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,cloud_cover,wind_speed_10m,wind_speed_100m
0,2010-01-01 00:00:00+00:00,21.0325,94.6037,20.1325,0.0,0.0,24.9,5.804825,6.638072
1,2010-01-01 01:00:00+00:00,20.682499,95.47321,19.932499,0.0,0.0,6.3,5.904439,6.130579


In [26]:
# Check time column data type before conversion
print(type(amarvati_raw['date'][0]))

# Convert each city's OWN 'date' column to datetime.
# The Amritsar and Jaipur frames must not be assigned Amaravati's parsed
# dates: column assignment aligns on the row index, so any difference in
# time range or row count between the files would silently produce wrong
# timestamps or NaT values for those cities.
amarvati_raw['date'] = pd.to_datetime(amarvati_raw['date'])
amritsar_raw['date'] = pd.to_datetime(amritsar_raw['date'])
jaipur_raw['date'] = pd.to_datetime(jaipur_raw['date'])

# Confirm the conversion took effect
print(type(amarvati_raw['date'][0]))

<class 'str'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [27]:
# Give the measurement columns shorter, easier-to-read names.
# One mapping is shared so that all three cities end up with identical names.
readable_names = {
    'temperature_2m': 'temperature',
    'relative_humidity_2m': 'humidity',
    'dew_point_2m': 'dew_point',
}
for raw_frame in (amarvati_raw, amritsar_raw, jaipur_raw):
    raw_frame.rename(columns=readable_names, inplace=True)

The available weather data is hourly, which does not match the time resolution of the pollutant data.

Feature engineering is needed to aggregate each weather variable, using the sum, average, or min/max as appropriate.

Type:
- Temperature : Average + Max
- Humidity: Average + Max
- Dew Point: Average + Max
- Precipitation: Sum
- Rain: Sum
- Cloud Cover: Sum
- Wind Speed 10m: Average + Max
- Wind Speed 100m: Average + Max

In [28]:
# Use the timestamp column as the index of every city frame
for raw_frame in (amarvati_raw, amritsar_raw, jaipur_raw):
    raw_frame.set_index('date', inplace=True)

In [29]:
# Aggregation functions applied to each column when collapsing the hourly
# rows into one row per calendar day. The same spec is used for every city
# so the resulting daily frames share an identical column layout.
daily_aggregations = {
    'temperature': ['mean', 'max'],
    'humidity': ['mean', 'max'],
    'dew_point': ['mean', 'max'],
    'precipitation': 'sum',
    'rain': 'sum',
    'cloud_cover': 'sum',
    'wind_speed_10m': ['mean', 'max'],
    'wind_speed_100m': ['mean', 'max'],
}

# Resample on the datetime index at daily frequency ('D')
amarvati = amarvati_raw.resample('D').agg(daily_aggregations)
amritsar = amritsar_raw.resample('D').agg(daily_aggregations)
jaipur = jaipur_raw.resample('D').agg(daily_aggregations)

# Flatten the two-level (column, aggregation) headers into single names
# of the form '<column>_<aggregation>', e.g. 'temperature_mean'
for daily_frame in (amarvati, amritsar, jaipur):
    daily_frame.columns = [
        f'{column}_{statistic}'.strip()
        for column, statistic in daily_frame.columns.values
    ]


In [30]:
# Write each daily frame to CSV (date index included) for use in main
exports = {
    'amarvati.csv': amarvati,
    'amritsar.csv': amritsar,
    'jaipur.csv': jaipur,
}
for csv_name, daily_frame in exports.items():
    daily_frame.to_csv(csv_name, index=True)