In [2]:
import os
from typing import Sequence
from math import sqrt

import pandas as pd

In [None]:
# Creating a path to the csv file.
csv_weatherdata_path = os.path.join("assets", "weatherdata.csv")

# Storing the csv to a DF
df_weather = pd.read_csv(csv_weatherdata_path)
df_weather['Date'] = pd.to_datetime(df_weather['Date'])                  # converting the date column to a datetimeindex
df_weather['Year'] = pd.DatetimeIndex(df_weather['Date']).year           # adding a year column
df_weather['month'] = pd.DatetimeIndex(df_weather['Date']).month         # adding a month column
df_weather['Day of month'] = pd.DatetimeIndex(df_weather['Date']).day    # addign a day of month column

df_weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,Year,month,Day of month
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,2008,12,1
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,2008,12,2
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,2008,12,3
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,2008,12,4
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,2008,12,5


## #Grouping and Aggrigation

The df.groupby() function in the pandas library is a powerful tool for performing group-wise operations on data. It allows you to split your data into groups based on some criteria, apply a function to each group independently, and then combine the results.<br><br><br>

**Syntax**:<br>
``df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=NoDefault.no_default, observed=False, dropna=True)``

<br>

**Common Parameters**:
- **by**: Used to specify the column or columns to group by.
- **axis**: Split along rows (axis=0, default) or columns (axis=1).
- **level**: If the axis is a MultiIndex (hierarchical), group by a particular level or levels.
- **as_index**: If True, the group labels are used as the index.
- **sort**: Sort the group keys.
- **dropna**: If True, and if the grouping key is NA, remove the NA from group keys.

**Question)** Find the location which received the most amount of rain in the given data. In this place, certain promotional offers can be put in place to boost sales of tea, umbrella etc.<br><br>
**Note:** Pandas is not able to perform numerical aggrigation functions on string columns. Therefore, we'll need to pass `True` to the `'numeric_only'` kwarg inside the numeric aggrigator.

In [None]:
# Creating a groupby object where each group will be separated on the basis of its location value.
# i.e. in each group, the location value will be the same.

df_weather.groupby(by='Location')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015A36A61B50>

## Aggrigate function

An aggrigation function can be applied on a .groupby() object to combine the groups into a dataframe with aggrigate values<br>
This aggrigate function can me mathematical such as .mean(), .sum(), or a custom function using the .apply() funciton.

In [None]:
# Grouping weather data by location, and then aggrigating the groups by finding their mean.

df_weather.groupby(by='Location').mean(numeric_only=True).head(10)

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,month,Day of month
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adelaide,12.628368,22.945402,1.572185,5.824924,7.752002,36.530812,2012.52589,6.523948,15.740453
Albany,12.948461,20.072587,2.255073,4.207273,6.658765,,2012.708554,6.41313,15.680371
Albury,9.520899,22.630963,1.92571,,,32.953016,2012.733643,6.412488,15.745932
AliceSprings,13.125182,29.244191,0.869355,9.029929,9.581944,40.533714,2012.719565,6.407456,15.689211
BadgerysCreek,11.1369,24.023111,2.207925,,,33.60989,2012.790984,6.326161,15.769467
Ballarat,7.355302,18.274794,1.68883,,,44.978695,2012.732166,6.422061,15.727543
Bendigo,8.591065,21.616683,1.621452,3.85173,,38.849283,2012.723138,6.415953,15.723467
Brisbane,16.410998,26.441527,3.160536,5.408848,8.082782,28.21138,2012.496678,6.556786,15.719076
Cairns,21.199197,29.544344,5.765317,6.211976,7.575995,38.067991,2012.677376,6.363454,15.720214
Canberra,6.827688,20.980644,1.735038,4.404717,7.403241,40.082174,2012.164131,6.461966,15.736396


In [None]:
# Storing the aggrigated DF into df_weather_loc variable.

df_weather_loc = df_weather.groupby(by='Location').mean(numeric_only=True)

df_weather_loc.head()

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,month,Day of month
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adelaide,12.628368,22.945402,1.572185,5.824924,7.752002,36.530812,2012.52589,6.523948,15.740453
Albany,12.948461,20.072587,2.255073,4.207273,6.658765,,2012.708554,6.41313,15.680371
Albury,9.520899,22.630963,1.92571,,,32.953016,2012.733643,6.412488,15.745932
AliceSprings,13.125182,29.244191,0.869355,9.029929,9.581944,40.533714,2012.719565,6.407456,15.689211
BadgerysCreek,11.1369,24.023111,2.207925,,,33.60989,2012.790984,6.326161,15.769467


In [None]:
# Sorting values by values in a column.

df_weather_loc.sort_values('Rainfall', ascending=True).head()

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,month,Day of month
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Woomera,13.313898,26.542135,0.489946,10.116654,9.063005,44.080962,2012.773579,6.344816,15.692308
Uluru,14.406983,30.387442,0.707324,,,41.369231,2014.852728,6.447074,15.712689
AliceSprings,13.125182,29.244191,0.869355,9.029929,9.581944,40.533714,2012.719565,6.407456,15.689211
Nhil,8.992798,22.398407,0.932907,,,42.542438,2014.838751,6.37731,15.729127
Mildura,10.733954,24.841536,0.945025,5.935952,8.468104,37.170829,2012.778849,6.354506,15.70868


**Question)** Hot chocolate is the most sold product in the cold months. Find month which is the coldest so that the inventory team can keep the stock of hot chocolate ready well in advance.

In [None]:
# Grouping all rows by their month.
# Then finding the average temp for that month.

df_weather_month = df_weather.groupby(by='month').mean(numeric_only=True)
df_weather_month.head()

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,Day of month
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,17.520778,29.547362,2.719036,8.773171,9.208942,43.36173,2013.042721,15.986688
2,17.500239,28.877704,3.174075,7.651018,8.607494,41.457472,2013.054822,14.643515
3,15.904347,26.886744,2.801304,6.237989,7.646279,39.546399,2013.024778,15.995321
4,12.831979,23.611845,2.314764,4.547511,7.107208,36.460285,2013.279055,15.492659
5,9.618572,20.047202,1.978896,3.244134,6.337496,35.721056,2013.040214,15.991038


In [19]:
df_weather_month.sort_values('MinTemp')

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,Day of month
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7,6.951308,16.764242,2.179314,2.699269,6.06979,37.891458,2012.467867,16.001528
8,7.465145,18.25893,2.02961,3.616533,7.171661,40.245052,2012.473474,16.022275
6,7.815031,17.324778,2.781114,2.518705,5.660379,35.506375,2012.975381,15.257648
9,9.460189,20.77251,1.875851,4.917265,7.69877,42.213311,2012.461084,15.518378
5,9.618572,20.047202,1.978896,3.244134,6.337496,35.721056,2013.040214,15.991038
10,11.531145,23.540695,1.610734,6.379571,8.50008,42.716694,2012.462725,16.026771
4,12.831979,23.611845,2.314764,4.547511,7.107208,36.460285,2013.279055,15.492659
11,14.299624,26.165571,2.273758,7.465236,8.685394,42.582385,2012.435041,15.498211
12,15.771514,27.52639,2.476483,8.046298,8.975372,43.004769,2012.286401,15.969103
3,15.904347,26.886744,2.801304,6.237989,7.646279,39.546399,2013.024778,15.995321


**^** July is the coldest month<br><br>
<br><br>
**Question**:
Sometimes feeling cold is more than about low temperatures; a windy day can also make you cold.<br>A factor called the chill factor can be used to quantify the cold based on the wind speed and the temperature.<br><br>The formula for the chill factor is given by 

$ WCI = (10 * \sqrt{v} - v + 10.5) .(33 - T_{m}) $

v is the speed of the wind and $ T_{m} $ is the minimum temperature

Add a column for WCI and find the month with the lowest WCI. 

In [23]:
def wci(x:Sequence):      # x is a row in a DF
    velocity = x['WindGustSpeed']
    minTemp = x['MinTemp']
    
    return (10*sqrt(velocity)-velocity+10.5)*(33-minTemp)

In [None]:


df_weather_month['WCI'] = df_weather_month.apply(wci, axis=1)
df_weather_month.head()

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,Day of month,WCI
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,17.520778,29.547362,2.719036,8.773171,9.208942,43.36173,2013.042721,15.986688,510.626939
2,17.500239,28.877704,3.174075,7.651018,8.607494,41.457472,2013.054822,14.643515,518.157101
3,15.904347,26.886744,2.801304,6.237989,7.646279,39.546399,2013.024778,15.995321,578.508833
4,12.831979,23.611845,2.314764,4.547511,7.107208,36.460285,2013.279055,15.492659,694.224982
5,9.618572,20.047202,1.978896,3.244134,6.337496,35.721056,2013.040214,15.991038,807.735713


In [25]:
df_weather_month.sort_values('WCI', ascending=False)

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Year,Day of month,WCI
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7,6.951308,16.764242,2.179314,2.699269,6.06979,37.891458,2012.467867,16.001528,889.942624
6,7.815031,17.324778,2.781114,2.518705,5.660379,35.506375,2012.975381,15.257648,870.917665
8,7.465145,18.25893,2.02961,3.616533,7.171661,40.245052,2012.473474,16.022275,860.369753
5,9.618572,20.047202,1.978896,3.244134,6.337496,35.721056,2013.040214,15.991038,807.735713
9,9.460189,20.77251,1.875851,4.917265,7.69877,42.213311,2012.461084,15.518378,782.897866
10,11.531145,23.540695,1.610734,6.379571,8.50008,42.716694,2012.462725,16.026771,711.506091
4,12.831979,23.611845,2.314764,4.547511,7.107208,36.460285,2013.279055,15.492659,694.224982
11,14.299624,26.165571,2.273758,7.465236,8.685394,42.582385,2012.435041,15.498211,620.343747
3,15.904347,26.886744,2.801304,6.237989,7.646279,39.546399,2013.024778,15.995321,578.508833
12,15.771514,27.52639,2.476483,8.046298,8.975372,43.004769,2012.286401,15.969103,569.802085


**^** According to WCI July is the coldest month