# **DATA PREPROCESSING/DATA PREPARATION**

1. Data Cleaning - drop, fillna with mean, median, mode imputation, changing dtype
2. Data Transformation
    * If data is CONTINUOUS = Standard Scaler, MinMaxScaler, Robust Scaler
    * If data is DISCRETE   = Label Encoder, One hot Encoder

## **Importing Necessary Libraries**

In [36]:
import pandas as pd

## **Importing Data**

In [37]:
# Load the raw weather readings from the CSV in the working directory,
# then echo the frame so the notebook renders a preview.
Weather_Data = pd.read_csv(filepath_or_buffer='data_clean.csv')
Weather_Data

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,1,41.0,190.0,7.4,67,5,1,2010,67,S
1,2,36.0,118.0,8.0,72,5,2,2010,72,C
2,3,12.0,149.0,12.6,74,5,3,2010,74,PS
3,4,18.0,313.0,11.5,62,5,4,2010,62,S
4,5,,,14.3,56,5,5,2010,56,S
...,...,...,...,...,...,...,...,...,...,...
153,154,41.0,190.0,7.4,67,5,1,2010,67,C
154,155,30.0,193.0,6.9,70,9,26,2010,70,PS
155,156,,145.0,13.2,77,9,27,2010,77,S
156,157,14.0,191.0,14.3,75,9,28,2010,75,S


## **Initial Analysis**

In [38]:
Weather_Data.shape

(158, 10)

In [39]:
Weather_Data.dtypes

Unnamed: 0      int64
Ozone         float64
Solar.R       float64
Wind          float64
Temp C         object
Month          object
Day             int64
Year            int64
Temp            int64
Weather        object
dtype: object

In [40]:
Weather_Data.isna().sum()

Unnamed: 0     0
Ozone         38
Solar.R        7
Wind           0
Temp C         0
Month          0
Day            0
Year           0
Temp           0
Weather        3
dtype: int64

In [13]:
Weather_Data.head(50)

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,1,41.0,190.0,7.4,67,5,1,2010,67,S
1,2,36.0,118.0,8.0,72,5,2,2010,72,C
2,3,12.0,149.0,12.6,74,5,3,2010,74,PS
3,4,18.0,313.0,11.5,62,5,4,2010,62,S
4,5,,,14.3,56,5,5,2010,56,S
5,6,28.0,,14.9,66,5,6,2010,66,C
6,7,23.0,299.0,8.6,65,5,7,2010,65,PS
7,8,19.0,99.0,13.8,59,5,8,2010,59,C
8,9,8.0,19.0,20.1,61,5,9,2010,61,PS
9,10,,194.0,8.6,69,5,10,2010,69,S


In [14]:
Weather_Data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
count,158.0,120.0,151.0,158.0,158.0,158.0,158.0,158.0,158.0,155
unique,,,,,41.0,6.0,,,,3
top,,,,,81.0,9.0,,,,S
freq,,,,,11.0,34.0,,,,59
mean,79.5,41.583333,185.403974,9.957595,,,16.006329,2010.0,77.727848,
std,45.754781,32.620709,88.723103,3.511261,,,8.997166,0.0,9.377877,
min,1.0,1.0,7.0,1.7,,,1.0,2010.0,56.0,
25%,40.25,18.0,119.0,7.4,,,8.0,2010.0,72.0,
50%,79.5,30.5,197.0,9.7,,,16.0,2010.0,78.5,
75%,118.75,61.5,257.0,11.875,,,24.0,2010.0,84.0,


## **Data Preparation**

#### **A. Data Cleaning**

In [41]:


# Remove the 'Unnamed: 0' column from the frame in place.
Weather_Data.drop(columns=['Unnamed: 0'], inplace=True)

In [42]:
Weather_Data.head(20)

Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,41.0,190.0,7.4,67,5,1,2010,67,S
1,36.0,118.0,8.0,72,5,2,2010,72,C
2,12.0,149.0,12.6,74,5,3,2010,74,PS
3,18.0,313.0,11.5,62,5,4,2010,62,S
4,,,14.3,56,5,5,2010,56,S
5,28.0,,14.9,66,5,6,2010,66,C
6,23.0,299.0,8.6,65,5,7,2010,65,PS
7,19.0,99.0,13.8,59,5,8,2010,59,C
8,8.0,19.0,20.1,61,5,9,2010,61,PS
9,,194.0,8.6,69,5,10,2010,69,S


### The client approved dropping the Ozone feature since it has the largest number of NULL values.

In [43]:
# Remove the 'Ozone' column from the frame in place.
Weather_Data.drop(columns=['Ozone'], inplace=True)

In [44]:
Weather_Data.head(10)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67,5,1,2010,67,S
1,118.0,8.0,72,5,2,2010,72,C
2,149.0,12.6,74,5,3,2010,74,PS
3,313.0,11.5,62,5,4,2010,62,S
4,,14.3,56,5,5,2010,56,S
5,,14.9,66,5,6,2010,66,C
6,299.0,8.6,65,5,7,2010,65,PS
7,99.0,13.8,59,5,8,2010,59,C
8,19.0,20.1,61,5,9,2010,61,PS
9,194.0,8.6,69,5,10,2010,69,S


In [45]:
Weather_Data.isna().sum()

Solar.R    7
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

In [46]:
Weather_Data['Solar.R'].median()

197.0

The column Solar.R has 7 NaN values, so we will replace them with the column's median value.

In [47]:
# Impute missing Solar.R readings with the column median, computed from the
# data instead of being hard-coded. The result is assigned back to the column
# because an in-place fillna on a column selection is a chained assignment:
# recent pandas versions warn about it, and under Copy-on-Write it does not
# update the parent frame.
Weather_Data['Solar.R'] = Weather_Data['Solar.R'].fillna(Weather_Data['Solar.R'].median())

In [48]:
Weather_Data.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

In [49]:
Weather_Data.dtypes

Solar.R    float64
Wind       float64
Temp C      object
Month       object
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [50]:
# Convert the object-typed 'Temp C' and 'Month' columns to numeric dtypes.
# errors='coerce' turns any value that cannot be parsed into NaN.
Weather_Data['Temp C'] = pd.to_numeric(Weather_Data['Temp C'], errors='coerce')
Weather_Data['Month'] = pd.to_numeric(Weather_Data['Month'], errors='coerce')

In [51]:
Weather_Data.head(20)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5.0,1,2010,67,S
1,118.0,8.0,72.0,5.0,2,2010,72,C
2,149.0,12.6,74.0,5.0,3,2010,74,PS
3,313.0,11.5,62.0,5.0,4,2010,62,S
4,197.0,14.3,56.0,5.0,5,2010,56,S
5,197.0,14.9,66.0,5.0,6,2010,66,C
6,299.0,8.6,65.0,5.0,7,2010,65,PS
7,99.0,13.8,59.0,5.0,8,2010,59,C
8,19.0,20.1,61.0,5.0,9,2010,61,PS
9,194.0,8.6,69.0,5.0,10,2010,69,S


In [52]:
Weather_Data.dtypes

Solar.R    float64
Wind       float64
Temp C     float64
Month      float64
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

### The client approved mean imputation for Temp C.

In [53]:
Weather_Data['Temp C'].mean()

77.7515923566879

In [54]:
# Mean imputation for Temp C. The fill value is derived from the column and
# rounded to one decimal place, which yields the same 77.8 that was
# previously typed in by hand. The result is assigned back to the column
# because an in-place fillna on a column selection is a chained assignment
# that recent pandas versions warn about and that does not update the frame
# under Copy-on-Write.
Weather_Data['Temp C'] = Weather_Data['Temp C'].fillna(round(Weather_Data['Temp C'].mean(), 1))

In [55]:
# Fill missing Month values with 5 and assign the result back to the column
# (an in-place fillna on a column selection is a chained assignment that
# recent pandas versions warn about and that does not update the frame
# under Copy-on-Write).
# NOTE(review): the fill value 5 is hard-coded and the notebook does not
# state why it was chosen; the earlier describe() output lists 9 as the most
# frequent Month value - confirm that 5 is the intended replacement.
Weather_Data['Month'] = Weather_Data['Month'].fillna(5)

In [56]:
Weather_Data.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    3
dtype: int64

In [57]:
Weather_Data['Weather'].unique()

array(['S', 'C', 'PS', nan], dtype=object)

In [58]:
Weather_Data['Weather'].mode()

0    S
dtype: object

In [59]:
# Mode imputation for the categorical Weather column. mode() returns a
# Series of the most frequent value(s); its first entry is used as the fill
# value instead of a hard-coded label. The result is assigned back to the
# column because an in-place fillna on a column selection is a chained
# assignment that recent pandas versions warn about and that does not update
# the frame under Copy-on-Write.
Weather_Data['Weather'] = Weather_Data['Weather'].fillna(Weather_Data['Weather'].mode()[0])

In [60]:
Weather_Data.isna().sum()

Solar.R    0
Wind       0
Temp C     0
Month      0
Day        0
Year       0
Temp       0
Weather    0
dtype: int64

In [61]:
Weather_Data.dtypes

Solar.R    float64
Wind       float64
Temp C     float64
Month      float64
Day          int64
Year         int64
Temp         int64
Weather     object
dtype: object

In [62]:
Weather_Data.head(20)

Unnamed: 0,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,190.0,7.4,67.0,5.0,1,2010,67,S
1,118.0,8.0,72.0,5.0,2,2010,72,C
2,149.0,12.6,74.0,5.0,3,2010,74,PS
3,313.0,11.5,62.0,5.0,4,2010,62,S
4,197.0,14.3,56.0,5.0,5,2010,56,S
5,197.0,14.9,66.0,5.0,6,2010,66,C
6,299.0,8.6,65.0,5.0,7,2010,65,PS
7,99.0,13.8,59.0,5.0,8,2010,59,C
8,19.0,20.1,61.0,5.0,9,2010,61,PS
9,194.0,8.6,69.0,5.0,10,2010,69,S


In [63]:
# Remove the 'Temp C' column from the frame in place.
# NOTE(review): in the rows displayed above its values match 'Temp';
# presumably it is dropped as a duplicate - confirm.
Weather_Data.drop(columns=['Temp C'], inplace=True)

In [64]:
Weather_Data

Unnamed: 0,Solar.R,Wind,Month,Day,Year,Temp,Weather
0,190.0,7.4,5.0,1,2010,67,S
1,118.0,8.0,5.0,2,2010,72,C
2,149.0,12.6,5.0,3,2010,74,PS
3,313.0,11.5,5.0,4,2010,62,S
4,197.0,14.3,5.0,5,2010,56,S
...,...,...,...,...,...,...,...
153,190.0,7.4,5.0,1,2010,67,C
154,193.0,6.9,9.0,26,2010,70,PS
155,145.0,13.2,9.0,27,2010,77,S
156,191.0,14.3,9.0,28,2010,75,S



#### **B. Data Transformation**

 2 Data Transformation Techniques:
 
 * If data is CONTINUOUS = Standard Scaler, MinMaxScaler, Robust Scaler
 * If data is DISCRETE   = Label Encoder, One hot Encoder

#### Label Encoder

In [65]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct Weather labels, then store their integer codes in a new
# 'le_Weather' column alongside the original text column.
le = LabelEncoder()
le.fit(Weather_Data['Weather'])
Weather_Data['le_Weather'] = le.transform(Weather_Data['Weather'])
Weather_Data

Unnamed: 0,Solar.R,Wind,Month,Day,Year,Temp,Weather,le_Weather
0,190.0,7.4,5.0,1,2010,67,S,2
1,118.0,8.0,5.0,2,2010,72,C,0
2,149.0,12.6,5.0,3,2010,74,PS,1
3,313.0,11.5,5.0,4,2010,62,S,2
4,197.0,14.3,5.0,5,2010,56,S,2
...,...,...,...,...,...,...,...,...
153,190.0,7.4,5.0,1,2010,67,C,0
154,193.0,6.9,9.0,26,2010,70,PS,1
155,145.0,13.2,9.0,27,2010,77,S,2
156,191.0,14.3,9.0,28,2010,75,S,2


In [66]:
# Work on an independent (deep) copy so the one-hot encoding demo below does
# not modify Weather_Data itself.
Weather_Data_copy = Weather_Data.copy(deep=True)
Weather_Data_copy

Unnamed: 0,Solar.R,Wind,Month,Day,Year,Temp,Weather,le_Weather
0,190.0,7.4,5.0,1,2010,67,S,2
1,118.0,8.0,5.0,2,2010,72,C,0
2,149.0,12.6,5.0,3,2010,74,PS,1
3,313.0,11.5,5.0,4,2010,62,S,2
4,197.0,14.3,5.0,5,2010,56,S,2
...,...,...,...,...,...,...,...,...
153,190.0,7.4,5.0,1,2010,67,C,0
154,193.0,6.9,9.0,26,2010,70,PS,1
155,145.0,13.2,9.0,27,2010,77,S,2
156,191.0,14.3,9.0,28,2010,75,S,2


#### One Hot Encoding

* Using Pandas - pd.get_dummies()
* Using sklearn - OneHotEncoder

#### Pandas - One Hot Encoding

In [67]:
# One-hot encode the Weather column into Weather_<label> indicator columns.
# dtype=int pins the indicators to 0/1 integers; without it, pandas >= 2.0
# produces boolean True/False columns.
Weather_Data_copy = pd.get_dummies(data=Weather_Data_copy, columns=['Weather'], dtype=int)

In [68]:
Weather_Data_copy

Unnamed: 0,Solar.R,Wind,Month,Day,Year,Temp,le_Weather,Weather_C,Weather_PS,Weather_S
0,190.0,7.4,5.0,1,2010,67,2,0,0,1
1,118.0,8.0,5.0,2,2010,72,0,1,0,0
2,149.0,12.6,5.0,3,2010,74,1,0,1,0
3,313.0,11.5,5.0,4,2010,62,2,0,0,1
4,197.0,14.3,5.0,5,2010,56,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...
153,190.0,7.4,5.0,1,2010,67,0,1,0,0
154,193.0,6.9,9.0,26,2010,70,1,0,1,0
155,145.0,13.2,9.0,27,2010,77,2,0,0,1
156,191.0,14.3,9.0,28,2010,75,2,0,0,1


#### Using sklearn - OneHotEncoder

In [118]:
# Build the frame used for the sklearn OneHotEncoder demo: everything in
# Weather_Data except the label-encoded column. drop() returns a new frame,
# so Weather_Data itself is left unchanged.
Weather_Data_copy2 = Weather_Data.drop(columns=['le_Weather'])

In [119]:
Weather_Data_copy2

Unnamed: 0,Solar.R,Wind,Month,Day,Year,Temp,Weather
0,190.0,7.4,5.0,1,2010,67,S
1,118.0,8.0,5.0,2,2010,72,C
2,149.0,12.6,5.0,3,2010,74,PS
3,313.0,11.5,5.0,4,2010,62,S
4,197.0,14.3,5.0,5,2010,56,S
...,...,...,...,...,...,...,...
153,190.0,7.4,5.0,1,2010,67,C
154,193.0,6.9,9.0,26,2010,70,PS
155,145.0,13.2,9.0,27,2010,77,S
156,191.0,14.3,9.0,28,2010,75,S


In [120]:
# Separate the Weather column (kept as a one-column DataFrame, hence the
# double brackets) from the remaining columns.
y = Weather_Data_copy2[['Weather']]
x = Weather_Data_copy2.drop(columns='Weather')

In [124]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the Weather frame, then transform it and convert the
# result to a dense NumPy array with one indicator column per category.
ohe = OneHotEncoder()
ohe.fit(y)
Ohe_Y = ohe.transform(y).toarray()

In [125]:
Ohe_Y

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0