In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../Dataset/weatherAUS.csv")

In [4]:
df = df.loc[:, ['Sunshine', 'Cloud9am', 'Cloud3pm']]

In [5]:
df.head()

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
0,,8.0,
1,,,
2,,,2.0
3,,,
4,,7.0,8.0


In [6]:
df_original = df.dropna()
df_original

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
5939,12.3,2.0,5.0
5940,13.0,1.0,1.0
5942,10.6,1.0,6.0
5943,12.2,1.0,5.0
5944,8.4,1.0,6.0
...,...,...,...
139108,11.0,0.0,1.0
139109,8.6,7.0,0.0
139110,11.0,0.0,0.0
139111,10.6,1.0,1.0


In [7]:
# Drop rows with atleast 2 NaN values

df = df.dropna(thresh=2)

In [8]:
# Extracting rows in which Sunshine is NaN

df_fill_sunshine = df[df['Sunshine'].isnull()]
df_fill_sunshine

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
4,,7.0,8.0
11,,8.0,8.0
12,,8.0,8.0
15,,8.0,1.0
16,,8.0,1.0
...,...,...,...
142162,,7.0,1.0
142163,,4.0,7.0
142166,,8.0,8.0
142167,,8.0,4.0


In [9]:
# Extracting rows in which Cloud9am is NaN

df_fill_cloud9am = df[df['Cloud9am'].isnull()]
df_fill_cloud9am

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
5941,13.3,,1.0
6011,11.5,,3.0
6099,10.0,,4.0
6434,10.0,,0.0
12851,2.8,,7.0
...,...,...,...
134650,0.0,,8.0
134754,2.0,,7.0
135368,10.4,,5.0
135395,12.3,,4.0


In [10]:
# Extracting rows in which Cloud3pm is NaN

df_fill_cloud3pm = df[df['Cloud3pm'].isnull()]
df_fill_cloud3pm

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
6118,7.1,5.0,
6190,11.3,0.0,
6428,10.3,1.0,
9057,9.0,3.0,
9432,9.1,1.0,
...,...,...,...
135332,12.9,0.0,
135404,12.2,7.0,
135448,12.0,0.0,
136259,9.2,7.0,


In [11]:
# Creating a Linear Regression model to predict Sunshine which will be used to fill NaN values

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = df_original.loc[:, ['Cloud9am', 'Cloud3pm']]
y = df_original.loc[:, 'Sunshine']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
model_sunshine = LinearRegression()

In [13]:
model_sunshine.fit(X_train, y_train)

In [14]:
# Using model to fill Nan values in Sunshine

df_fill_sunshine.loc[:, 'Sunshine'] = model_sunshine.predict(df_fill_sunshine.loc[:, ['Cloud9am', 'Cloud3pm']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fill_sunshine.loc[:, 'Sunshine'] = model_sunshine.predict(df_fill_sunshine.loc[:, ['Cloud9am', 'Cloud3pm']])


In [15]:
df_fill_sunshine

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
4,3.806472,7.0,8.0
11,3.277427,8.0,8.0
12,3.277427,8.0,8.0
15,7.861042,8.0,1.0
16,7.861042,8.0,1.0
...,...,...,...
142162,8.390088,7.0,1.0
142163,6.048411,4.0,7.0
142166,3.277427,8.0,8.0
142167,5.896636,8.0,4.0


In [16]:
# Replace the rows in df with rows in df_fill_sunshine only at indexes of df_fill_sunshine

df.loc[df_fill_sunshine.index, 'Sunshine'] = df_fill_sunshine.loc[:, 'Sunshine']
df

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
4,3.806472,7.0,8.0
11,3.277427,8.0,8.0
12,3.277427,8.0,8.0
15,7.861042,8.0,1.0
16,7.861042,8.0,1.0
...,...,...,...
142162,8.390088,7.0,1.0
142163,6.048411,4.0,7.0
142166,3.277427,8.0,8.0
142167,5.896636,8.0,4.0


In [17]:
# Creating a Linear Regression model to predict Cloud9am which will be used to fill NaN values

X = df_original.loc[:, ['Sunshine', 'Cloud3pm']]
y = df_original.loc[:, 'Cloud9am']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model_cloud9am = LinearRegression()

model_cloud9am.fit(X_train, y_train)

In [18]:
# Using model to fill Nan values in Cloud9am

df_fill_cloud9am.loc[:, 'Cloud9am'] = model_cloud9am.predict(df_fill_cloud9am.loc[:, ['Sunshine', 'Cloud3pm']])
df_fill_cloud9am

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fill_cloud9am.loc[:, 'Cloud9am'] = model_cloud9am.predict(df_fill_cloud9am.loc[:, ['Sunshine', 'Cloud3pm']])


Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
5941,13.3,1.265277,1.0
6011,11.5,2.486738,3.0
6099,10.0,3.317072,4.0
6434,10.0,2.191774,0.0
12851,2.8,6.796293,7.0
...,...,...,...
134650,0.0,8.102436,8.0
134754,2.0,7.089099,7.0
135368,10.4,3.451994,5.0
135395,12.3,2.475257,4.0


In [19]:
# Replace the rows in df with rows in df_fill_cloud9am only at indexes of df_fill_cloud9am

df.loc[df_fill_cloud9am.index, 'Cloud9am'] = df_fill_cloud9am.loc[:, 'Cloud9am']
df

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
4,3.806472,7.0,8.0
11,3.277427,8.0,8.0
12,3.277427,8.0,8.0
15,7.861042,8.0,1.0
16,7.861042,8.0,1.0
...,...,...,...
142162,8.390088,7.0,1.0
142163,6.048411,4.0,7.0
142166,3.277427,8.0,8.0
142167,5.896636,8.0,4.0


In [20]:
# Creating a Linear Regression model to predict Cloud3pm which will be used to fill NaN values

X = df_original.loc[:, ['Sunshine', 'Cloud9am']]
y = df_original.loc[:, 'Cloud3pm']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model_cloud3pm = LinearRegression()

model_cloud3pm.fit(X_train, y_train)

In [21]:
# Using model to fill Nan values in Cloud3pm

df_fill_cloud3pm.loc[:, 'Cloud3pm'] = model_cloud3pm.predict(df_fill_cloud3pm.loc[:, ['Sunshine', 'Cloud9am']])
df_fill_cloud3pm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fill_cloud3pm.loc[:, 'Cloud3pm'] = model_cloud3pm.predict(df_fill_cloud3pm.loc[:, ['Sunshine', 'Cloud9am']])


Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
6118,7.1,5.0,4.714002
6190,11.3,0.0,1.945972
6428,10.3,1.0,2.559865
9057,9.0,3.0,3.523896
9432,9.1,1.0,3.012014
...,...,...,...
135332,12.9,0.0,1.343105
135404,12.2,7.0,3.266567
135448,12.0,0.0,1.682218
136259,9.2,7.0,4.396942


In [22]:
# Replace the rows in df with rows in df_fill_cloud3pm only at indexes of df_fill_cloud3pm

df.loc[df_fill_cloud3pm.index, 'Cloud3pm'] = df_fill_cloud3pm.loc[:, 'Cloud3pm']
df

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
4,3.806472,7.0,8.0
11,3.277427,8.0,8.0
12,3.277427,8.0,8.0
15,7.861042,8.0,1.0
16,7.861042,8.0,1.0
...,...,...,...
142162,8.390088,7.0,1.0
142163,6.048411,4.0,7.0
142166,3.277427,8.0,8.0
142167,5.896636,8.0,4.0


In [23]:
# Checking for NaN values

df.isnull().sum()

Sunshine    0
Cloud9am    0
Cloud3pm    0
dtype: int64

In [24]:
df.describe()

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
count,83659.0,83659.0,83659.0
mean,7.465358,4.434595,4.494955
std,3.652684,2.873674,2.696974
min,0.0,0.0,0.0
25%,4.461275,1.0,2.0
50%,8.0,5.0,5.0
75%,10.5,7.0,7.0
max,14.5,9.0,9.0


In [25]:
# Saving df as a csv file along with the index

# df.to_csv("../Dataset/weatherAUS_cleaned.csv", index=True)

In [26]:
dat = pd.read_csv("../Dataset/weatherAUS.csv")
dat

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,...,27.0,1024.7,1021.2,,,9.4,20.9,No,0.0,No
142189,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,24.0,1024.6,1020.3,,,10.1,22.4,No,0.0,No
142190,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,21.0,1023.5,1019.1,,,10.9,24.5,No,0.0,No
142191,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,24.0,1021.0,1016.8,,,12.5,26.1,No,0.0,No


In [27]:
df

Unnamed: 0,Sunshine,Cloud9am,Cloud3pm
4,3.806472,7.0,8.0
11,3.277427,8.0,8.0
12,3.277427,8.0,8.0
15,7.861042,8.0,1.0
16,7.861042,8.0,1.0
...,...,...,...
142162,8.390088,7.0,1.0
142163,6.048411,4.0,7.0
142166,3.277427,8.0,8.0
142167,5.896636,8.0,4.0


In [30]:
# Replacing values of Cloud9am and Cloud3pm in dat with values in df at the same index

dat.loc[df.index, 'Cloud9am'] = df.loc[:, 'Cloud9am']
dat.loc[df.index, 'Cloud3pm'] = df.loc[:, 'Cloud3pm']
dat

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,3.806472,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,...,27.0,1024.7,1021.2,,,9.4,20.9,No,0.0,No
142189,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,24.0,1024.6,1020.3,,,10.1,22.4,No,0.0,No
142190,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,21.0,1023.5,1019.1,,,10.9,24.5,No,0.0,No
142191,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,24.0,1021.0,1016.8,,,12.5,26.1,No,0.0,No


In [28]:
# Replacing values of sunshine in dat with values in df at the same index

dat.loc[df.index, 'Sunshine'] = df.loc[:, 'Sunshine']
dat

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,3.806472,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,...,27.0,1024.7,1021.2,,,9.4,20.9,No,0.0,No
142189,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,24.0,1024.6,1020.3,,,10.1,22.4,No,0.0,No
142190,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,21.0,1023.5,1019.1,,,10.9,24.5,No,0.0,No
142191,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,24.0,1021.0,1016.8,,,12.5,26.1,No,0.0,No


In [31]:
dat.isnull().sum()

Date                 0
Location             0
MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         51113
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53180
Cloud3pm         55031
Temp9am            904
Temp3pm           2726
RainToday         1406
RISK_MM              0
RainTomorrow         0
dtype: int64

In [None]:
# Saving dat as a csv file replacing the original file

dat.to_csv("../Dataset/weatherAUS.csv", index=False)