# Data Pipeline to Transform data for ML model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
import os

# NOTE(review): machine-specific absolute path; adjust it to wherever the datasets folder lives.
data_root_dir = "C:\\Users\\sachin\\OneDrive - Grras Solution Pvt. Ltd\\ML_BATCH_2021\\MachineLearning\\Notebooks\\datasets"
# Hand each path component to os.path.join separately so it inserts the separator itself;
# embedding a backslash in the literal ("\h") is an invalid escape sequence.
data_path = os.path.join(data_root_dir, 'housing', 'housing.csv')
# Read the raw housing table into a DataFrame.
housing = pd.read_csv(data_path)

### Prepare Data For ML Algorithm

    1. Scaling
    2. Impute Missing Values / Fix NA values
    3. Feature Engineering
    4. Dropping Unnecessary Features
    5. Categorical Data to Numerical Data
    6. Text -> vector
    

#### How to fix NA values in continuous data

In [11]:
import os

# NOTE(review): machine-specific absolute path; adjust it to wherever the datasets folder lives.
data_root_dir = "C:\\Users\\sachin\\OneDrive - Grras Solution Pvt. Ltd\\ML_BATCH_2021\\MachineLearning\\Notebooks\\datasets"
# Hand each path component to os.path.join separately so it inserts the separator itself;
# embedding a backslash in the literal ("\h") is an invalid escape sequence.
data_path = os.path.join(data_root_dir, 'housing', 'housing.csv')
housing = pd.read_csv(data_path)

# Features are every column except the target; the target is 'median_house_value'.
X = housing.drop('median_house_value', axis=1).copy()
y = housing['median_house_value'].copy()

In [12]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


In [13]:
X.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)

#### How to deal with missing values in a continuous column

    1. Drop all rows which contain missing values -> it's not a good idea
    
    2. Fix NA values with Central Tendency of Data

In [45]:
# Separate the one text column ('ocean_proximity') from the remaining columns,
# doing exactly the same for the train and the test features.
X_train_category = X_train[['ocean_proximity']]
X_train_numerical = X_train.drop(columns=['ocean_proximity'])

X_test_category = X_test[['ocean_proximity']]
X_test_numerical = X_test.drop(columns=['ocean_proximity'])

In [46]:
X_train_numerical.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
9950,-122.33,38.38,28.0,1020.0,169.0,504.0,164.0,4.5694
3547,-118.6,34.26,18.0,6154.0,1070.0,3010.0,1034.0,5.6392
4448,-118.21,34.07,47.0,1346.0,383.0,1452.0,371.0,1.7292
6984,-118.02,33.96,36.0,2071.0,398.0,988.0,404.0,4.6226
4432,-118.2,34.08,49.0,1320.0,309.0,1405.0,328.0,2.4375


In [47]:
X_train_category.head()

Unnamed: 0,ocean_proximity
9950,INLAND
3547,<1H OCEAN
4448,<1H OCEAN
6984,<1H OCEAN
4432,<1H OCEAN


In [48]:
from sklearn.impute import SimpleImputer

In [49]:
imputer = SimpleImputer(strategy='median')

In [50]:
imputer.fit(X_train_numerical) # Learn Parameter , it will calculate median of each column you will pass

SimpleImputer(strategy='median')

In [51]:
imputer.statistics_

array([-118.49  ,   34.25  ,   29.    , 2138.5   ,  438.    , 1170.    ,
        412.    ,    3.5294])

In [135]:
# Fill the missing entries using the already-fitted imputer, then wrap the result in a
# DataFrame that reuses the original column labels (no index is passed, so rows get a fresh default index).
X_train_numerical_tr = pd.DataFrame(
    imputer.transform(X_train_numerical),
    columns=X_train_numerical.columns,
)

In [136]:
X_train_numerical.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        172
population              0
households              0
median_income           0
dtype: int64

In [137]:
X_train_numerical_tr.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

In [138]:
# Learn the per-column medians and fill the missing values in a single fit_transform call,
# then restore the column labels by wrapping the result in a DataFrame.
imputer = SimpleImputer(strategy='median')
X_train_numerical_tr = pd.DataFrame(
    imputer.fit_transform(X_train_numerical),
    columns=X_train_numerical.columns,
)

In [139]:
X_train_numerical_tr.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

In [140]:
#cat_transform = SimpleImputer(strategy='most_frequent').fit_transform(X_train_category)

In [64]:
# Scaling
# Impute


#### How to Deal with Categorical Column


    how to convert categorical data into numerical data
    
    
        Ordinal Categories    -> LabelEncoder / OrdinalEncoder
        
        Nominal Categories    -> One Hot Encoding  / Vectorization

In [98]:
# Toy frame with two categorical columns used to demonstrate ordinal encoding.
# A locally seeded generator makes the ten sampled rows identical on every run
# without touching NumPy's global random state.
_ord_rng = np.random.default_rng(98)
ord_cat = pd.DataFrame({
    'performance': _ord_rng.choice(['good', 'bad', 'avg'], 10),
    'review': _ord_rng.choice(["excellent", "ok ok", "boring"], 10)
})

In [99]:
# Relabel so that alphabetical order matches the quality order (a=bad < b=avg < c=good).
# `replace` leaves values that are not dictionary keys untouched, so re-running this
# cell is a no-op; `map` would turn the already-relabelled values into NaN.
ord_cat['performance'] = ord_cat['performance'].replace({'good': 'c', 'avg': 'b', 'bad': 'a'})

In [100]:
from sklearn.preprocessing import OrdinalEncoder

In [101]:
ordinal_encoder = OrdinalEncoder()

In [102]:
ord_cat

Unnamed: 0,performance,review
0,c,boring
1,b,ok ok
2,a,boring
3,b,ok ok
4,a,boring
5,c,excellent
6,a,ok ok
7,c,excellent
8,c,boring
9,c,boring


In [103]:
ordinal_encoder.fit(ord_cat)

OrdinalEncoder()

In [104]:
ordinal_encoder.categories_

[array(['a', 'b', 'c'], dtype=object),
 array(['boring', 'excellent', 'ok ok'], dtype=object)]

In [105]:
ordinal_encoder.transform(ord_cat)

array([[2., 0.],
       [1., 2.],
       [0., 0.],
       [1., 2.],
       [0., 0.],
       [2., 1.],
       [0., 2.],
       [2., 1.],
       [2., 0.],
       [2., 0.]])

In [78]:
ord_cat

Unnamed: 0,performance,review
0,avg,ok ok
1,bad,boring
2,good,boring
3,good,boring
4,good,boring
5,good,ok ok
6,avg,boring
7,avg,excellent
8,bad,ok ok
9,good,ok ok


In [80]:
ordinal_encoder = OrdinalEncoder()

In [81]:
# Fit the ordinal encoder on the categorical training column and encode it in one call,
# keeping the original column label on the resulting frame.
X_train_category_tr = pd.DataFrame(
    ordinal_encoder.fit_transform(X_train_category),
    columns=X_train_category.columns,
)

In [83]:
X_train_category_tr.head()

Unnamed: 0,ocean_proximity
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0


In [84]:
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

#### Categorical Nominal Data

In [107]:
# Toy frame with two nominal (unordered) columns used to demonstrate one-hot encoding.
# A locally seeded generator makes the ten sampled rows identical on every run
# without touching NumPy's global random state.
_nom_rng = np.random.default_rng(107)
no_cat = pd.DataFrame({
    'sex': _nom_rng.choice(['male', 'female'], 10),
    'food': _nom_rng.choice(['south', 'fast', 'north', 'chinese'], 10)
})

In [108]:
no_cat

Unnamed: 0,sex,food
0,female,chinese
1,male,fast
2,male,fast
3,male,south
4,female,chinese
5,male,chinese
6,female,fast
7,male,north
8,female,chinese
9,male,south


In [109]:
from sklearn.preprocessing import OneHotEncoder

In [110]:
encoder = OneHotEncoder()

In [111]:
encoder.fit(no_cat)

OneHotEncoder()

In [112]:
encoder.categories_

# female   male   chinese   fast   north  south

[array(['female', 'male'], dtype=object),
 array(['chinese', 'fast', 'north', 'south'], dtype=object)]

In [122]:
# One-hot encode the toy frame and densify the sparse result.
# Column names are taken from the fitted encoder (one array of categories per input
# column, concatenated in feature order) instead of being typed by hand, so the frame
# still builds when the random sample happens to miss a category.
no_cat_tr = pd.DataFrame(
    encoder.transform(no_cat).toarray(),
    columns=np.concatenate(encoder.categories_),
)

In [124]:
no_cat

Unnamed: 0,sex,food
0,female,chinese
1,male,fast
2,male,fast
3,male,south
4,female,chinese
5,male,chinese
6,female,fast
7,male,north
8,female,chinese
9,male,south


In [123]:
no_cat_tr

Unnamed: 0,female,male,chinese,fast,north,south
0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,0.0
5,0.0,1.0,1.0,0.0,0.0,0.0
6,1.0,0.0,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,1.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,1.0


#### Now we know how to deal with categorical data

In [126]:
# Fresh one-hot encoder for the real categorical column: learn its categories,
# then encode the same training column.
encoder = OneHotEncoder()
X_train_category_tr = encoder.fit(X_train_category).transform(X_train_category)

In [127]:
X_train_category_tr
#1 -> 3, 5, 7
#2 -> 2, 1, 5
# ...
#16512

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [128]:
encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [130]:
# Build the dense one-hot frame directly from the fitted encoder so this cell can be
# re-run safely (calling .toarray() on the previous result fails once it is a DataFrame).
# Column labels come from encoder.categories_ so they always match the encoded order.
X_train_category_tr = pd.DataFrame(
    encoder.transform(X_train_category).toarray(),
    columns=encoder.categories_[0],
)

In [131]:
X_train_category_tr.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [133]:
from sklearn.preprocessing import StandardScaler

    X -> T1 -> T2 -> T3 -> X_tr

    A pipeline contains sklearn Estimator or Transformer classes

In [141]:
# Standardise the imputed numeric features and restore the column labels,
# then preview the first rows.
scale = StandardScaler()
X_train_numerical_tr = pd.DataFrame(
    scale.fit_transform(X_train_numerical_tr),
    columns=X_train_numerical.columns,
)
X_train_numerical_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-1.381195,1.289422,-0.045376,-0.737644,-0.872454,-0.82291,-0.873927,0.367714
1,0.483377,-0.640792,-0.840169,1.587216,1.246544,1.394248,1.371062,0.930745
2,0.678333,-0.729807,1.46473,-0.590019,-0.369162,0.015823,-0.339775,-1.127069
3,0.773311,-0.781342,0.590458,-0.261713,-0.333885,-0.394696,-0.25462,0.395713
4,0.683331,-0.725122,1.623688,-0.601793,-0.543198,-0.02576,-0.450734,-0.754294


In [142]:
X_train_numerical_tr.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

    Numerical -> Impute -> Scale -> X_num_tr

    Category  -> OneHotEncoding  -> X_cat_tr

In [145]:
# Put the scaled numeric columns and the one-hot columns side by side.
# NOTE(review): concat aligns on the index; both frames appear to have been rebuilt from
# arrays without an explicit index, so rows should line up by position — confirm.
X_train_final = pd.concat(
    objs=[X_train_numerical_tr, X_train_category_tr],
    axis='columns',
)

In [146]:
X_train_final.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-1.381195,1.289422,-0.045376,-0.737644,-0.872454,-0.82291,-0.873927,0.367714,0.0,1.0,0.0,0.0,0.0
1,0.483377,-0.640792,-0.840169,1.587216,1.246544,1.394248,1.371062,0.930745,1.0,0.0,0.0,0.0,0.0
2,0.678333,-0.729807,1.46473,-0.590019,-0.369162,0.015823,-0.339775,-1.127069,1.0,0.0,0.0,0.0,0.0
3,0.773311,-0.781342,0.590458,-0.261713,-0.333885,-0.394696,-0.25462,0.395713,1.0,0.0,0.0,0.0,0.0
4,0.683331,-0.725122,1.623688,-0.601793,-0.543198,-0.02576,-0.450734,-0.754294,1.0,0.0,0.0,0.0,0.0


### Custom Transformer 

    Column Adder
    
    
### How to Tune Models

### How to Deploy Models

    text -> NLP -> number

##### Final Pipeline 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Load Data

In [2]:
import os

# NOTE(review): machine-specific absolute path; adjust it to wherever the datasets folder lives.
data_root_dir = "C:\\Users\\sachin\\OneDrive - Grras Solution Pvt. Ltd\\ML_BATCH_2021\\MachineLearning\\Notebooks\\datasets"
# Hand each path component to os.path.join separately so it inserts the separator itself;
# embedding a backslash in the literal ("\h") is an invalid escape sequence.
data_path = os.path.join(data_root_dir, 'housing', 'housing.csv')
housing = pd.read_csv(data_path)

# Features are every column except the target; the target is 'median_house_value'.
X = housing.drop('median_house_value', axis=1).copy()
y = housing['median_house_value'].copy()