In [None]:
from google.colab import files
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Open the Colab file-upload dialog and keep whatever it returns.
uploaded = files.upload()
# Use the first item yielded by iterating the upload result as the file name
# (presumably a dict keyed by file name — confirm against the google.colab docs).
# NOTE(review): next() raises StopIteration if the result is empty, e.g. when
# the upload dialog is cancelled.
filename = next(iter(uploaded))

Saving WeatherData.csv to WeatherData (1).csv


In [None]:
# Load the uploaded CSV into the DataFrame that every later cell reads from.
df = pd.read_csv(filename)

In [None]:
# Print column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          9 non-null      object 
 1   temperature  5 non-null      float64
 2   windspeed    5 non-null      float64
 3   event        7 non-null      object 
dtypes: float64(2), object(2)
memory usage: 420.0+ bytes


In [None]:
# Preview the first rows of the raw data (NaN cells show up as empty).
df.head()

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
1,01-04-2021,,9.0,Sunny
2,01-05-2021,28.0,,Snow
3,01-06-2021,,7.0,
4,01-07-2021,32.0,,Rain


### Now replace the NaN values with some other value.

In [None]:
# Display a copy with every NaN replaced by 0. The result is not assigned, so
# df itself is unchanged. Note the text column `event` also receives 0.
df.fillna(0)

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
1,01-04-2021,0.0,9.0,Sunny
2,01-05-2021,28.0,0.0,Snow
3,01-06-2021,0.0,7.0,0
4,01-07-2021,32.0,0.0,Rain
5,01-08-2021,0.0,0.0,Sunny
6,01-09-2021,0.0,0.0,0
7,01-10-2021,34.0,8.0,Cloudy
8,01-11-2021,40.0,12.0,Sunny


### Handling Missing Values

In [None]:
# Per-column replacements: numeric columns get 0, the text column gets a label.
fill_values = {
    'temperature': 0,
    'windspeed': 0,
    'event': 'No Event',
}
# Columns not listed in the mapping are left untouched.
newdf = df.fillna(value=fill_values)

In [None]:
# Display the frame produced by the per-column fill.
newdf

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
1,01-04-2021,0.0,9.0,Sunny
2,01-05-2021,28.0,0.0,Snow
3,01-06-2021,0.0,7.0,No Event
4,01-07-2021,32.0,0.0,Rain
5,01-08-2021,0.0,0.0,Sunny
6,01-09-2021,0.0,0.0,No Event
7,01-10-2021,34.0,8.0,Cloudy
8,01-11-2021,40.0,12.0,Sunny


### But something still looks odd: the temperature on 1st Jan is 32 degrees Fahrenheit, and then on the 4th it is suddenly 0.

### So, for better estimation we can just carry forward the previous values.

In [None]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill'), which has been
# deprecated since pandas 2.1 (the warning is easy to miss when warnings are
# globally silenced).
newdf = df.ffill()
newdf

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
1,01-04-2021,32.0,9.0,Sunny
2,01-05-2021,28.0,9.0,Snow
3,01-06-2021,28.0,7.0,Snow
4,01-07-2021,32.0,7.0,Rain
5,01-08-2021,32.0,7.0,Sunny
6,01-09-2021,32.0,7.0,Sunny
7,01-10-2021,34.0,8.0,Cloudy
8,01-11-2021,40.0,12.0,Sunny


### To copy the next valid value backward into a missing position, we can use backward fill (bfill)

In [None]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() is used instead of fillna(method='bfill'), which has been
# deprecated since pandas 2.1.
newdf = df.bfill()
newdf

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
1,01-04-2021,28.0,9.0,Sunny
2,01-05-2021,28.0,7.0,Snow
3,01-06-2021,32.0,7.0,Rain
4,01-07-2021,32.0,8.0,Rain
5,01-08-2021,34.0,8.0,Sunny
6,01-09-2021,34.0,8.0,Cloudy
7,01-10-2021,34.0,8.0,Cloudy
8,01-11-2021,40.0,12.0,Sunny


### Temp on 1st Jan is 32
### On 5th Jan it is 28

### So the temperature on 4th Jan most likely lies between 32 and 28, and we can estimate it by interpolation

In [None]:
# Interpolate only the numeric columns. Calling interpolate() on a frame that
# still contains object-dtype columns (day, event) is deprecated in pandas 2.1+,
# and those columns cannot be interpolated anyway, so they are copied unchanged.
numeric_cols = df.select_dtypes(include='number').columns
newdf = df.copy()
# The default method is 'linear', which treats rows as equally spaced and
# ignores the gaps between the dates in `day`.
newdf[numeric_cols] = df[numeric_cols].interpolate()
newdf

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
1,01-04-2021,30.0,9.0,Sunny
2,01-05-2021,28.0,8.0,Snow
3,01-06-2021,30.0,7.0,
4,01-07-2021,32.0,7.25,Rain
5,01-08-2021,32.666667,7.5,Sunny
6,01-09-2021,33.333333,7.75,
7,01-10-2021,34.0,8.0,Cloudy
8,01-11-2021,40.0,12.0,Sunny


### To drop the rows that contain NaN values

In [None]:
# Keep only the fully populated rows: a row is removed as soon as any one of
# its columns is NaN. Surviving rows keep their original index labels.
newdf = df.dropna(axis=0, how='any')
newdf

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2021,32.0,6.0,Rain
7,01-10-2021,34.0,8.0,Cloudy
8,01-11-2021,40.0,12.0,Sunny


## Outlier Analysis

### Z-Score for outlier Detection

In [None]:
import numpy as np

In [None]:
# Sample values for the z-score demo: mostly in the 10-25 range, plus a few
# values above 100 that the later cells flag as outliers.
data = [11,10,12,14,13,22,15,10,13,120,15,10,14,20,12,10,11,111,20,21,13,10,11,25,11,16,13,109,17,12,15,11,14,10]

In [None]:
# Centre and spread of the sample, used below to compute z-scores.
mean = np.mean(data)
# np.std is called with its default ddof (0), i.e. the population standard deviation.
std = np.std(data)

In [None]:
# Report the summary statistics (message typo "if" corrected to "is").
print('Mean of the dataset is: ', mean)
print('Standard Deviation of the dataset is: ', std)

Mean of the dataset if:  22.676470588235293
Standard Deviation of the dataset if:  28.48603433700688


In [None]:
# A point whose z-score magnitude exceeds this many standard deviations is
# treated as an outlier.
Z_THRESHOLD = 3

# One {value: z-score} dict per data point, in input order.
updatedData = []
# The subset of those entries whose z-score magnitude exceeds the threshold.
outlier = []

for value in data:
  # z-score: distance from the mean, measured in standard deviations.
  zscr = (value - mean) / std
  updatedData.append({value: zscr})
  # Compare the magnitude so unusually low values are flagged as well,
  # not only unusually high ones.
  if abs(zscr) > Z_THRESHOLD:
    outlier.append({value: zscr})

In [None]:
# Display every {value: z-score} pair computed above.
updatedData

[{11: -0.4099015837057429},
 {10: -0.4450065052321794},
 {12: -0.3747966621793065},
 {14: -0.30458681912643365},
 {13: -0.3396917406528701},
 {22: -0.023747446914942266},
 {15: -0.26948189759999724},
 {10: -0.4450065052321794},
 {13: -0.3396917406528701},
 {120: 3.4165348626758276},
 {15: -0.26948189759999724},
 {10: -0.4450065052321794},
 {14: -0.30458681912643365},
 {20: -0.09395728996781512},
 {12: -0.3747966621793065},
 {10: -0.4450065052321794},
 {11: -0.4099015837057429},
 {111: 3.1005905689378994},
 {20: -0.09395728996781512},
 {21: -0.05885236844137869},
 {13: -0.3396917406528701},
 {10: -0.4450065052321794},
 {11: -0.4099015837057429},
 {25: 0.081567317664367},
 {11: -0.4099015837057429},
 {16: -0.2343769760735608},
 {13: -0.3396917406528701},
 {109: 3.030380725885027},
 {17: -0.19927205454712438},
 {12: -0.3747966621793065},
 {15: -0.26948189759999724},
 {11: -0.4099015837057429},
 {14: -0.30458681912643365},
 {10: -0.4450065052321794}]

In [None]:
# Display only the entries that were flagged as outliers.
outlier

[{120: 3.4165348626758276},
 {111: 3.1005905689378994},
 {109: 3.030380725885027}]