In [1]:
# train file path
train_file_path = r"C:\Users\Amit\Documents\projects\ml project\ML_project\housing\artifact\data_ingestion\2022-08-23-18-22-28\ingested_data\train\housing.csv"

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv(train_file_path)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [None]:
df.shape

In [None]:
# target column
df['median_house_value'].head()

In [5]:
# isolating features and target
features,target =df.drop(['median_house_value'],axis=1),df[['median_house_value']]

In [None]:
features.shape

In [None]:
target.shape

In [6]:
features.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN


In [7]:
target.head()

Unnamed: 0,median_house_value
0,72100.0
1,279600.0
2,82700.0
3,112500.0
4,238300.0


In [None]:
# null check
features.isna().sum()

In [None]:
plt.boxplot(df['total_bedrooms'].value_counts())
plt.show()

In [None]:
df['total_bedrooms'].value_counts()

In [10]:
# creating custom transformers

class my_transformer:
    
    def __init__(self, strategy="median"):
        self.strategy = strategy
    
    def fit(self,X):
        self.features_ = X.columns
        self.statistics_ = []
        for column in X.columns:
            self.statistics_.append(X[column].median())
            
    def transform(self,X:pd.DataFrame):
        for idx,column in enumerate(features.columns):
            X[column].fillna(self.statistics_[idx])
        return X
    
    def fit_transform(self,X):
        self.fit(X)
        return self.transform(X)
    

In [11]:
for idx,columns in enumerate(features.columns):
    print(idx,columns)

0 longitude
1 latitude
2 housing_median_age
3 total_rooms
4 total_bedrooms
5 population
6 households
7 median_income
8 ocean_proximity


In [None]:
# libraries to create custom transformers
# check sklearn-github/sklearn/base.py to see details

from sklearn.base import BaseEstimator, TransformerMixin

{class custom_transformer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy = "median")
    
    ...........
    .........
    .......
    ....
    ..
    }

In [12]:
################### STANDARD WAY OF CREATING FEATURE ENGINEERING PIPELINE ###################

# since we apply different transformations on numerical and categorical features, so we create different pipelines for both type of features and after all transformations are applied,
# we then combine both pipelines.
# to create transformation pipeline we use sklearn.pipeline.Pipeline
# to combine transformations we use sklearn.compose.ColumnTransformer

from xml.etree.ElementTree import PI
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

numerical_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="median")),
    ('feature_generator',my_transformer()),
    ('scaling',StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="most_frequent")),
    ('onehotencoding',OneHotEncoder()),
    ('scaling',StandardScaler(with_mean=False))
])

# creating column name list for num and cat features

numerical_col_name = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
'total_bedrooms', 'population', 'households', 'median_income']


categorical_col_name = ['ocean_proximity']

# now creating preprocessing object, ColumnTransformer requires 3 inputs -> 'name',pipeline_name,feature_column_names

preprocessing = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_col_name),
    ('categorical_pipeline',categorical_pipeline,categorical_col_name)])

# now this preprocessing object will be pickled.

In [13]:
preprocessing

ColumnTransformer(transformers=[('numerical_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('feature_generator',
                                                  <__main__.my_transformer object at 0x0000025567AD6F60>),
                                                 ('scaling',
                                                  StandardScaler())]),
                                 ['longitude', 'latitude', 'housing_median_age',
                                  'total_rooms', 'total_bedrooms', 'population',
                                  'households', 'median_income']),
                                ('categorical_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 (

In [None]:
# preprocessing.fit_transform(DataFrame) to get complete transformations
# just 1 line of code to do complete transformations.

####################### preprocessing.fit_transform(df) not working here because i didnt update the my_transformer class. #########################    


In [14]:
# to save preprocessing object in a pickle file
import dill

with open('preprocessing.pkl','wb') as prep_file:
    dill.dump(preprocessing,prep_file)

In [None]:
# loading saved pickle file

with open('preprocessing.pkl','rb') as file_obj:
    preprocessing_loaded_obj = dill.load(file_obj)

# then preprocessing_loaded_obj.transform(DataFrame) to do transforamations

In [53]:
schema_file_path=r"C:\Users\Amit\Documents\projects\ml project\ML_project\config\schema.yaml"
train_file_path=r"C:\Users\Amit\Documents\projects\ml project\ML_project\housing\artifact\data_ingestion\2022-08-25-19-46-46\ingested_data\train\housing.csv"
test_file_path=r"C:\Users\Amit\Documents\projects\ml project\ML_project\housing\artifact\data_ingestion\2022-08-25-19-46-46\ingested_data\test\housing.csv"

In [54]:
from housing.util.util import load_data,read_yaml_file
from housing.constants import *

train_df = load_data(file_path=train_file_path,schema_file_path=schema_file_path)
train_df.shape

(16512, 10)

In [55]:
test_df = load_data(file_path=test_file_path,schema_file_path=schema_file_path)
test_df.shape

(4128, 10)

In [56]:
schema = read_yaml_file(file_path=schema_file_path)
schema

{'columns': {'longitude': 'float',
  'latitude': 'float',
  'housing_median_age': 'float',
  'total_rooms': 'float',
  'total_bedrooms': 'float',
  'population': 'float',
  'households': 'float',
  'median_income': 'float',
  'median_house_value': 'float',
  'ocean_proximity': 'category'},
 'numerical_columns': '-longitude -latitude -housing_median_age -total_rooms -total_bedrooms -population -households -median_income',
 'categorical_columns': ['ocean_proximity'],
 'target_column': 'median_house_value',
 'domain_value': {'ocean_proximity': ['<1H OCEAN',
   'INLAND',
   'ISLAND',
   'NEAR BAY',
   'NEAR OCEAN']}}

In [57]:
target_column_name = schema[TARGET_COLUMN_KEY]

In [58]:
target_column_name

'median_house_value'

In [59]:
input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
input_feature_train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN


In [60]:
target_feature_train_df = train_df[target_column_name]
target_feature_train_df.head()

0     72100.0
1    279600.0
2     82700.0
3    112500.0
4    238300.0
Name: median_house_value, dtype: float64

In [61]:
input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
input_feature_test_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,<1H OCEAN
1,-120.42,34.89,24.0,2020.0,307.0,855.0,283.0,5.0099,<1H OCEAN
2,-118.45,34.25,36.0,1453.0,270.0,808.0,275.0,4.3839,<1H OCEAN
3,-118.1,33.91,35.0,1653.0,325.0,1072.0,301.0,3.2708,<1H OCEAN
4,-117.07,32.77,38.0,3779.0,614.0,1495.0,614.0,4.3529,NEAR OCEAN


In [62]:
target_feature_test_df = test_df[target_column_name]
target_feature_test_df.head()

0    500001.0
1    162500.0
2    204600.0
3    159700.0
4    184000.0
Name: median_house_value, dtype: float64

In [63]:
from housing.component.data_transformation import FeatureGenerator
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

dataset_schema = read_yaml_file(file_path=schema_file_path)

numerical_columns = dataset_schema[NUMERICAL_COLUMN_KEY]
categorical_columns = dataset_schema[CATEGORICAL_COLUMN_KEY]

numerical_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="median")),
    ('feature_generator',FeatureGenerator(
        columns=numerical_columns
    )),
    ('scaler',StandardScaler())
]
)

categorical_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="most_frequent")),
    ('onehotencoding',OneHotEncoder()),
    ('scaler',StandardScaler(with_mean=False))
]           
)
preprocessing = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline, numerical_columns),
    ('categorical_pipeline',categorical_pipeline, categorical_columns),
])
preprocessing

ColumnTransformer(transformers=[('numerical_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('feature_generator',
                                                  FeatureGenerator(columns='-longitude '
                                                                           '-latitude '
                                                                           '-housing_median_age '
                                                                           '-total_rooms '
                                                                           '-total_bedrooms '
                                                                           '-population '
                                                                           '-households '
                                                                           '-median_inco

In [64]:
preprocessing.fit_transform(input_feature_train_df)

ValueError: A given column is not a column of the dataframe

In [None]:
input_feature_train_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
4,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
16507,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
16508,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
16509,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
16510,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN
