<a href="https://colab.research.google.com/github/MingzheHu-Duke/Note_to_product_HousePricePrediction/blob/main/ConvertFunctionalCodetoPipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part A: Basic Pipeline codes

In [6]:
import pandas as pd
# Load the CSV from the hard-coded Colab path into a DataFrame.
# NOTE(review): read_csv treats the first row as column names by default -
# confirm this file really has a header row; if not, pass header=None so the
# first sample is not consumed as the header.
data = pd.read_csv("/content/sonar.all-data.csv")


# Separate training and validation dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Columns 0-59 become the feature matrix X; column 60 becomes Y, the target
# later passed to fit(). 20% of the rows are held out for validation, with a
# fixed seed so the split is repeatable.
X, Y = data.values[:, :60], data.values[:, 60]
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


# Build Pipelines - Import necessary libraries


# 1. Single pipeline for prediction
# Pipeline whose only step is the classifier: fit on the training split,
# predict the validation split, and print the validation accuracy.
pipe = Pipeline(steps=[("LR", LogisticRegression())])
pred = pipe.fit(X_train, Y_train).predict(X_val)
print(accuracy_score(Y_val, pred))


# 2 Single Pipeline with data scaling
# Same classifier, but the features are standardised first. The scaler is a
# pipeline step, so it is fitted together with the model on the training split.
pipe = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        ("LR", LogisticRegression()),
    ]
)
pred = pipe.fit(X_train, Y_train).predict(X_val)
print(accuracy_score(Y_val, pred))

0.7619047619047619
0.7857142857142857


# Preprocessor.py

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import config


# Numerical Imputer
class NumericalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the selected columns with each column's mode.

    Parameters
    ----------
    variables : list of str
        Names of the columns to impute.

    Attributes
    ----------
    imputer_dict_ : dict
        Maps each column name to the fill value learned in ``fit`` (the first
        mode of that column).
    """

    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the fill value (first mode) of every configured column.

        ``y`` is accepted only for scikit-learn API compatibility and is
        ignored. Returns ``self``.
        """
        self.imputer_dict_ = {}
        for feature in self.variables:
            self.imputer_dict_[feature] = X[feature].mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the configured
        columns replaced by the values learned in ``fit``."""
        X = X.copy()
        for feature in self.variables:
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: that chained in-place form operates on
            # a temporary under pandas Copy-on-Write and leaves X unchanged.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X


#Categorical Imputer
# def categorical_imputer(_data, CATEGORICAL_FEATURES):
#     for var in CATEGORICAL_FEATURES:
#         _data[var].fillna(_data[var].mode()[0], inplace=True)   
#     return _data
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the selected columns with each column's mode.

    Parameters
    ----------
    variables : list of str
        Names of the columns to impute.

    Attributes
    ----------
    imputer_dict_ : dict
        Maps each column name to the fill value learned in ``fit`` (the first
        mode of that column).
    """

    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the fill value (first mode) of every configured column.

        ``y`` is accepted only for scikit-learn API compatibility and is
        ignored. Returns ``self``.
        """
        self.imputer_dict_ = {}
        for feature in self.variables:
            self.imputer_dict_[feature] = X[feature].mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the configured
        columns replaced by the values learned in ``fit``."""
        X = X.copy()
        for feature in self.variables:
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: that chained in-place form operates on
            # a temporary under pandas Copy-on-Write and leaves X unchanged.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X



#Rare label Categorical Encoder
# def rare_label_cat_imputer(_data, FEATURES_TO_ENCODE):
#     encoder_dict_ = {}
#     tol=0.05
    
#     for var in FEATURES_TO_ENCODE:
#         # the encoder will learn the most frequent categories
#         t = pd.Series(_data[var].value_counts() / np.float(len(_data)))
#         # frequent labels:
#         encoder_dict_[var] = list(t[t >= tol].index)
        
#     for var in FEATURES_TO_ENCODE:
#         _data[var] = np.where(_data[var].isin(
#                     encoder_dict_[var]), _data[var], 'Rare')
    
#     return _data

class RareLabelCategoricalImputer(BaseEstimator, TransformerMixin):
    """Collapse infrequent category labels into the single label ``'Rare'``.

    Parameters
    ----------
    tol : float, default 0.05
        Minimum share of rows (count divided by the number of rows seen in
        ``fit``) a label needs in order to be kept as-is.
    variables : list of str
        Names of the categorical columns to process.

    Attributes
    ----------
    encoder_dict_ : dict
        Maps each column name to the list of labels considered frequent.
    """

    def __init__(self, tol=0.05, variables=None):
        self.tol = tol
        self.variables = variables

    def fit(self, X, y=None):
        """Learn, per column, which labels reach the ``tol`` frequency.

        ``y`` is accepted only for scikit-learn API compatibility and is
        ignored. Returns ``self``.
        """
        self.encoder_dict_ = {}
        n_rows = len(X)
        for var in self.variables:
            # Plain division already yields floats; the former np.float alias
            # was removed from NumPy and raised AttributeError here.
            frequencies = X[var].value_counts() / n_rows
            self.encoder_dict_[var] = list(
                frequencies[frequencies >= self.tol].index
            )
        return self

    def transform(self, X):
        """Return a copy of ``X`` where every value that is not one of the
        learned frequent labels (including labels unseen during ``fit``) is
        replaced by ``'Rare'``."""
        X = X.copy()
        for feature in self.variables:
            is_frequent = X[feature].isin(self.encoder_dict_[feature])
            X[feature] = np.where(is_frequent, X[feature], 'Rare')
        return X



#Categorical Encoder
# def categorical_encoder(_data, FEATURES_TO_ENCODE):
#     encoder_dict_ ={}
#     for var in FEATURES_TO_ENCODE:
#         t = _data[var].value_counts().sort_values(ascending=True).index 
#         encoder_dict_[var] = {k:i for i,k in enumerate(t,0)}
        
#     ## Mapping using the encoder dictionary
#     for var in FEATURES_TO_ENCODE:
#         _data[var] = _data[var].map(encoder_dict_[var])
    
#     return _data
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Replace category labels with integer codes ordered by frequency.

    Within each column the labels are sorted by ascending count and numbered
    from 0, so the least frequent label receives code 0.

    Parameters
    ----------
    variables : list of str
        Names of the categorical columns to encode.

    Attributes
    ----------
    encoder_dict_ : dict
        Maps each column name to its ``{label: code}`` dictionary.
    """

    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the label-to-code mapping of every configured column.

        ``y`` is accepted only for scikit-learn API compatibility and is
        ignored; it defaults to ``None`` like the other transformers in this
        module. Returns ``self``.
        """
        self.encoder_dict_ = {}
        for var in self.variables:
            ordered_labels = (
                X[var].value_counts().sort_values(ascending=True).index
            )
            self.encoder_dict_[var] = {
                label: code for code, label in enumerate(ordered_labels)
            }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Raises
        ------
        ValueError
            If a column contains a non-missing label that was not seen in
            ``fit``; mapping it would otherwise silently introduce a NaN.
            Values that were already missing are left missing.
        """
        X = X.copy()
        for feature in self.variables:
            was_missing = X[feature].isna()
            encoded = X[feature].map(self.encoder_dict_[feature])
            # Only NaNs created by the mapping itself indicate unknown labels.
            introduced_nan = encoded.isna() & ~was_missing
            if introduced_nan.any():
                unseen = X.loc[introduced_nan, feature].unique().tolist()
                raise ValueError(
                    "CategoricalEncoder found labels not seen during fit "
                    "in column {!r}: {!r}".format(feature, unseen)
                )
            X[feature] = encoded
        return X

# #Temporal Variables
# def temporal_transform(_data, TEMPORAL_FEATURES, TEMPORAL_COMPARISON):
#     for var in TEMPORAL_FEATURES:
#         _data[var] = _data[var]-_data[TEMPORAL_COMPARISON]
    
#     return _data

class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
    """Re-express columns as their difference from a reference column.

    Parameters
    ----------
    variables : list of str
        Names of the columns to rewrite.
    reference_variable : str
        Name of the column subtracted from each entry of ``variables``.
    """

    def __init__(self, variables=None, reference_variable=None):
        self.reference_variable = reference_variable
        self.variables = variables

    def fit(self, X, y=None):
        """Learn nothing; present so the class can be used as a Pipeline step.

        Returns ``self``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each configured column holds its
        value minus the reference column's value."""
        result = X.copy()
        reference = self.reference_variable
        for column in self.variables:
            # Processed one column at a time, in the configured order.
            result[column] = result[column] - result[reference]
        return result



    
# # Log Transformations
# def log_transform(_data, LOG_FEATURES):
#     for var in LOG_FEATURES:
#         _data[var] = np.log(_data[var])
#     return _data

class LogTransformation(BaseEstimator, TransformerMixin):
    """Apply the natural logarithm to the selected columns.

    Parameters
    ----------
    variables : list of str
        Names of the columns to transform.
    """

    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn nothing; present so the class can be used as a Pipeline step.

        ``y`` defaults to ``None`` like the other transformers in this
        module. Returns ``self``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``np.log`` applied to each configured
        column.

        Raises
        ------
        ValueError
            If a configured column contains a value less than or equal to
            zero, for which the logarithm would be -inf or NaN. Missing
            values do not trigger the error and stay missing.
        """
        X = X.copy()
        for var in self.variables:
            # Fail early with a clear message instead of letting np.log emit
            # -inf/NaN that only surfaces in a later pipeline step.
            if (X[var] <= 0).any():
                raise ValueError(
                    "LogTransformation requires strictly positive values; "
                    "column {!r} contains values <= 0".format(var)
                )
            X[var] = np.log(X[var])
        return X


# # Drop Features
# def drop_features(_data, DROP_FEATURES):    
#     _data.drop(DROP_FEATURES, axis=1, inplace=True)
#     return _data
    
class DropFeatures(BaseEstimator, TransformerMixin):
    """Remove the given columns from the data.

    Parameters
    ----------
    variables_to_drop : list of str
        Names of the columns to remove.
    """

    def __init__(self, variables_to_drop=None):
        self.variables_to_drop = variables_to_drop

    def fit(self, X, y=None):
        """Learn nothing; present so the class can be used as a Pipeline step.

        Returns ``self``.
        """
        return self

    def transform(self, X):
        """Return a new frame without the configured columns.

        The input is not modified: ``drop`` returns a new object. The former
        bare ``return`` handed ``None`` to the next pipeline step.
        """
        return X.drop(columns=self.variables_to_drop)

# pipeline.py

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso

# The pipeline reads its feature lists from the project's `config` module;
# the stdlib `configparser` that was imported here does not provide them.
import config
# NOTE(review): renamed from `preprocessing` to agree with the
# `import preprocessor as pp` used by the main script - confirm the actual
# module file name.
import preprocessor as pp

# Full preprocessing + model pipeline. Steps run in the listed order:
# imputation, temporal differences, rare-label grouping, integer encoding,
# log transform, column removal, min-max scaling, then the Lasso regressor.
price_pipe = Pipeline([
    ("Numercial Imputer", pp.NumericalImputer(variables=config.NUMERICAL_FEATURES)),
    ('Categorical Imputer', pp.CategoricalImputer(variables=config.CATEGORICAL_FEATURES)),
    ('Temporal Features', pp.TemporalVariableEstimator(
        variables=config.TEMPORAL_FEATURES,
        reference_variable=config.TEMPORAL_COMPARISON)),
    ('Rare Label Encoder', pp.RareLabelCategoricalImputer(variables=config.FEATURES_TO_ENCODE)),
    ('Categorical Encoder', pp.CategoricalEncoder(variables=config.FEATURES_TO_ENCODE)),
    ('Log Transform', pp.LogTransformation(variables=config.LOG_FEATURES)),
    ('Drop Features', pp.DropFeatures(variables_to_drop=config.DROP_FEATURES)),
    ("Scaler Transform", MinMaxScaler()),
    ("Linear Model", Lasso(alpha=0.005, random_state=42)),
])

# MainCode.py

In [None]:
import pandas as pd
import numpy as np


import config
from data_management import load_dataset
import preprocessor as pp
import pipeline  # was misspelled `pipelinee`; the code below uses `pipeline.price_pipe`


# Load the training and test sets from the paths named in config.
train = load_dataset(config.TRAIN_FILE)
test = load_dataset(config.TEST_FILE)


# Separate the target column from the training features.
y = train[config.TARGET]
train.drop([config.TARGET], axis=1, inplace=True)


# Fit the full pipeline on the selected training columns, then predict on the
# same columns of the test set. The predictions are kept in `pred` so they
# can be printed below.
pipeline.price_pipe.fit(train[config.KEEP], y)
pred = pipeline.price_pipe.predict(test[config.KEEP])


# Slice from the start so that exactly ten predictions are shown.
print("Top 10 predictions: ", pred[:10])