In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn 
# Read the weather sample; the 'day' column is parsed into datetimes on load.
weather_url = "https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv"
df = pd.read_csv(weather_url, parse_dates=['day'])

# Per-column placeholder values that are converted to NaN so pandas
# treats them as missing data.
placeholders = {'temperature': -99999, 'windspeed': -99999, 'event': '0'}
data = df.replace(placeholders, np.nan)
data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [2]:
## interpolate() --> fills each NaN by linear interpolation (the default method)
## between the nearest valid values above and below it in the same column.
# Only the numeric columns are interpolated: a text column such as 'event' cannot
# be interpolated, and handing object-dtype columns to interpolate() is deprecated
# in recent pandas releases. Non-numeric columns are carried over unchanged.
numeric_cols = data.select_dtypes(include='number').columns
int_data = data.copy()
int_data[numeric_cols] = data[numeric_cols].interpolate()
int_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,30.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,30.0,7.0,
4,2017-01-05,32.0,4.5,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


### dropna(): This function drops the rows/columns that contain missing values

In [3]:
# Defaults spelled out: drop every row (axis=0) in which any column is NaN.
drop_data = data.dropna(axis=0, how='any')
drop_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
5,2017-01-06,31.0,2.0,Sunny


In [4]:
# how='all' drops a row only when every one of its columns is NaN, so rows
# that are merely partially filled are kept.
# Note: this rebinds `drop_data`, replacing the result of the previous cell.
drop_data = data.dropna(axis=0, how='all')
drop_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


## Handling Missing Values from Scikit Learn:

In [5]:
## SimpleImputer from scikit-learn
from sklearn.impute import SimpleImputer

## missing_values --> the marker that identifies a missing entry (np.nan here)
## strategy       --> the statistic used to fill the gaps ('mean' = column mean)
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [6]:
## Imputation is a two-step transformation:
##   step 1 --> fit        (this cell)
##   step 2 --> transform  (applied separately)
# iloc[rows, columns] selects by position: all rows, and the columns at
# positions 1 and 2 (the end of the slice, 3, is excluded).
imputer.fit(X=data.iloc[:, 1:3])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [7]:
# Step 2: fill the NaNs in the columns at positions 1 and 2 using the fitted imputer.
imputed_values = imputer.transform(data.iloc[:, 1:3])

# The result is written back into `data` itself, so from here on `data`
# no longer contains the original NaNs in those two columns.
data.iloc[:, 1:3] = imputed_values
data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,31.4,7.0,Sunny
2,2017-01-03,28.0,5.4,Snow
3,2017-01-04,31.4,7.0,
4,2017-01-05,32.0,5.4,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


## Encoding Categorical Data
Categorical data types
    - Nominal
    - Ordinal
Using the pandas library --> get_dummies()

In [8]:
# Load the home-prices sample used for the categorical-encoding examples.
homeprices_url = "https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/homeprices.csv"
price = pd.read_csv(homeprices_url)
price

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [9]:
## One-hot encode the categorical 'town' column: each distinct town
## becomes its own indicator column.
dummy_set = pd.get_dummies(price['town'])
dummy_set

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [10]:
# Place the indicator columns next to the original frame (column-wise concat);
# the original 'town' column is kept alongside them.
merged_data = pd.concat(objs=[price, dummy_set], axis=1)
merged_data

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [11]:
# Same encoding in a single call: passing the whole frame with columns=[...]
# replaces 'town' by indicator columns prefixed with 'town_'.
pd.get_dummies(data=price, columns=['town'])

Unnamed: 0,area,price,town_monroe township,town_robinsville,town_west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0


In [12]:
# drop_first=True omits the indicator for the first category; that category
# is still recoverable as the rows where all remaining indicators are 0.
pd.get_dummies(data=price, columns=['town'], drop_first=True)

Unnamed: 0,area,price,town_robinsville,town_west windsor
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0
