# <font color = brown><h1 align= 'center'> Preprocessing of the Data</h1></font>

# <font color = "magenta"><h4 align = "left">Steps involved in data pre-processing : </h4></font>

## 1. Importing Libraries
## 2. Importing the dataset
## 3. Missing Values
## 4. Categorical Data
## 5. Splitting Data
## 6. Feature Scaling

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [41]:
## Importing the Dataset
data = pd.read_csv("http://bit.ly/movieusers")
data.head(2)

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067


In [4]:
## Using seperator as "|" to make the data  in a proper structure
data = pd.read_table("http://bit.ly/movieusers", sep= "|")
data.head(2)

Unnamed: 0,1,24,M,technician,85711
0,2,53,F,other,94043
1,3,23,M,writer,32067


In [5]:
# To remove the row data as a header
data = pd.read_table("http://bit.ly/movieusers", sep= "|", header=None)
data.head(2)

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [30]:
# Reading the dataset
df = pd.read_csv("https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [45]:
# Checking the datatype of the day column
df.day.dtype # Object(text kind of data)

dtype('O')

In [47]:
type(df.day[0])

str

In [8]:
# Checking the type of the day column's value
type(df.day[0])

str

In [9]:
df.columns.value_counts()

day            1
temperature    1
windspeed      1
event          1
dtype: int64

In [10]:
# Checking the occurances(or) frequency of the values in a particular column
df['temperature'].value_counts()  # Mode

-99999    2
 32       2
 31       1
 28       1
 34       1
Name: temperature, dtype: int64

In [48]:
# Checking for duplicate values
df[df.duplicated()]

Unnamed: 0,day,temperature,windspeed,event


In [12]:
# Checking for duplicates specific temperature column
df[df['temperature'].duplicated()]

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain


In [13]:
df[df['event'].duplicated()]

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [14]:
df['event'].value_counts()

Rain     2
Sunny    2
0        2
Snow     1
Name: event, dtype: int64

In [33]:
import pandas as pd
# Using parse_dates for the day column to convert the data type of the day column to datetime
df = pd.read_csv("https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv", 
                 parse_dates =['day'])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [34]:
df.temperature.dtype

dtype('int64')

In [35]:
df.day.dtype

dtype('<M8[ns]')

In [36]:
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

## Handling the Missing Values:
- Data can have missing values for a number of reasons such as observations that were not recorded and data corruption.
- Handling missing data is important as many machine learning algorithms do not support data with missing values.


In [37]:
# replacing the particular values of different columns with NaN
new_df = df.replace({'temperature': -99999, 'windspeed': -99999, 'event': '0'}, np.nan)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [38]:
# Checking the total number of missing values
new_df.isnull().sum()

day            0
temperature    2
windspeed      2
event          2
dtype: int64

In [39]:
# fillna is used to fill the missing values
data = new_df.fillna(0)
data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,0.0,7.0,Sunny
2,2017-01-03,28.0,0.0,Snow
3,2017-01-04,0.0,7.0,0
4,2017-01-05,32.0,0.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,0


In [40]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [41]:
#ffill --> forward fill; To fill the values in a column with the above row's value
f_data = new_df.fillna(method='ffill')
f_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,28.0,7.0,Snow
4,2017-01-05,32.0,7.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [42]:
#bfill --> backward fill; To fill the values in a column with the below row's value
b_data = new_df.fillna(method='bfill')
b_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,28.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,32.0,7.0,Rain
4,2017-01-05,32.0,2.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [43]:
new_df.mean()

temperature    31.4
windspeed       5.4
dtype: float64

##### Imputation: Imputation is the method in which the missing value of a certain variable is replaced by a certain value of the available cases.

In [44]:
## Mean Imputation
mean_data = new_df.fillna(new_df.mean()) # fill the missing values in a column with the column's mean value
mean_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,31.4,7.0,Sunny
2,2017-01-03,28.0,5.4,Snow
3,2017-01-04,31.4,7.0,
4,2017-01-05,32.0,5.4,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [45]:
## Median Imputation: 

In [46]:
new_df.median()

temperature    32.0
windspeed       6.0
dtype: float64

In [47]:
## Median Imputation: filling the missing values in a column with the corresponding median value of a column
median_data = new_df.fillna(new_df.median())
median_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,6.0,Snow
3,2017-01-04,32.0,7.0,
4,2017-01-05,32.0,6.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [48]:
# Calculating the Mode for Event Column
new_df.event.mode().iloc[0]

'Rain'

In [49]:
# Filling the missing values in the event column with Mode of that particular column
new_df['event'] = new_df['event'].fillna(new_df.event.mode().iloc[0])
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Rain
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain


In [50]:
event_data = new_df.fillna(new_df.event.mode().iloc[0])
print(event_data)

         day temperature windspeed  event
0 2017-01-01          32         6   Rain
1 2017-01-02        Rain         7  Sunny
2 2017-01-03          28      Rain   Snow
3 2017-01-04        Rain         7   Rain
4 2017-01-05          32      Rain   Rain
5 2017-01-06          31         2  Sunny
6 2017-01-06          34         5   Rain


### interpolate(): Based on the column values, interpolate function will come up with a guess value to be filled in NaN

In [51]:
int_data = new_df.interpolate()
int_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,30.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,30.0,7.0,Rain
4,2017-01-05,32.0,4.5,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain


### dropna(): Drops the columns or rows which have missing values

In [52]:
drop_data = new_df.dropna()
drop_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain


In [53]:
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Rain
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain


In [54]:
## drop the columns having any value
drop_data = new_df.dropna(how='any', axis=1)
drop_data

Unnamed: 0,day,event
0,2017-01-01,Rain
1,2017-01-02,Sunny
2,2017-01-03,Snow
3,2017-01-04,Rain
4,2017-01-05,Rain
5,2017-01-06,Sunny
6,2017-01-06,Rain


In [55]:
# how = all, drops the rows or columns which has all the missing values
drop_all_data = new_df.dropna(how='all')
drop_all_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Rain
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain


### As Imputer object works only on the numerical values, for the categorical data we deal with mode 

### Handling Missing Values using Scikit-Learn

In [56]:
### SimpleImputer: All occurrences of missing_values will be imputed. 
## It is a two-way transformation--> fit and transform.
### First we need to fit it to the data and then transform the data for the columns

In [57]:
## creating an object for the SimpleImputer class
import numpy as np
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

In [58]:
## It is a 2 step transformation, applying the fit method on the data
imputer.fit(new_df.iloc[:,1:3])  # indices of column1 and column 2

SimpleImputer()

In [59]:
new_df.iloc[:, 1:3] = imputer.transform(new_df.iloc[:, 1:3])
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,31.4,7.0,Sunny
2,2017-01-03,28.0,5.4,Snow
3,2017-01-04,31.4,7.0,Rain
4,2017-01-05,32.0,5.4,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain
