In [4]:
import os

import numpy as np
import pandas as pd

import joblib

from typing import Dict

import yaml
import json

import warnings
warnings.filterwarnings("ignore")

In [11]:
config_path = '../config/params_evaluate.yml'
# Read the YAML config inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() call left it to the GC).
# NOTE(review): FullLoader is kept as-is; if the config only holds plain
# scalars/lists/dicts, yaml.safe_load would be the safer choice - confirm.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-stage sections of the config used by the cells below
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']


# JSON stored at preproc['unique_values_path']; check_columns_evaluate reads
# the same file and uses its keys as the expected column order.
column_sequence_path = preproc['unique_values_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

# Import data

In [12]:
# Load the frame to score from the path configured under evaluate -> predict_path
data_test = pd.read_csv(evaluate['predict_path'])
# Preview the first four rows
data_test.head(4)

Unnamed: 0,city,postalCode,adType,propertyType,accountType,surfaceArea,roomsQuantity,bedroomsQuantity,floorQuantity,floor,isInCondominium,priceHasDecreased,district.libelle,district.id
0,Paris 14e,75014,buy,flat,agency,67.0,4.0,1.0,7.0,4.0,True,False,Montsouris - Dareau,100493
1,Paris 20e,75020,buy,flat,agency,45.0,2.0,1.0,11.0,3.0,True,False,Père Lachaise - Réunion,100446
2,Paris 20e,75020,buy,flat,agency,54.0,2.0,1.0,4.0,3.0,True,True,Ménilmontant - Amandiers,100447
3,Paris 18e,75018,buy,flat,agency,41.0,2.0,1.0,6.0,3.0,True,False,Grandes Carrières - Clichy,100466


In [7]:
# (rows, columns) of the loaded test frame
data_test.shape

(497, 14)

# Preprocessing

In [13]:
def change_to_numerical(data: pd.DataFrame, numerical_columns: list) -> None:
    """
    Convert the given columns to numeric dtype in place.
    Values that cannot be parsed as numbers become NaN.
    :param data: data frame, modified in place
    :param numerical_columns: list with columns to be converted
    """
    # errors="coerce" turns unparseable entries into NaN instead of raising
    converted = data[numerical_columns].apply(pd.to_numeric, errors="coerce")
    data[numerical_columns] = converted


def fillna_with_mode(data: pd.DataFrame,
                     fillna_with_mode_columns: list) -> None:
    """
    Fill NaN values in the given columns with the column mode, in place.
    A column that contains only NaN has no mode and is left unchanged.
    :param data: data frame, modified in place
    :param fillna_with_mode_columns: list with columns to be filled
    """
    for column in fillna_with_mode_columns:
        modes = data[column].mode()
        # mode() is empty for an all-NaN column; indexing it would raise,
        # so there is nothing to fill with and the column is skipped.
        if modes.empty:
            continue
        # When several values tie, mode() returns them sorted; take the first
        data[column] = data[column].fillna(modes.iloc[0])


def fillna_groupby_category_mode(data: pd.DataFrame, column_to_change: list,
                                 column_to_groupby: list) -> None:
    """
    Fill empty values in a column with the mode computed within groups of
    another column, in place. The two lists are paired positionally.
    Groups that contain only NaN have no mode and keep their NaN values;
    rows whose grouping key is NaN keep their original value.
    :param data: data frame, modified in place
    :param column_to_change: the list of columns we fill empty values in
    :param column_to_groupby: the list of columns by which we group
    """

    def _fill_with_group_mode(values: pd.Series) -> pd.Series:
        # mode() is empty when every value in the group is NaN
        modes = values.mode()
        return values if modes.empty else values.fillna(modes.iloc[0])

    for elem_ch, elem_gb in zip(column_to_change, column_to_groupby):
        filled = data.groupby(elem_gb)[elem_ch].transform(_fill_with_group_mode)
        # groupby drops rows with a NaN key, so `filled` has no value for
        # them; fill only the gaps instead of overwriting the whole column.
        data[elem_ch] = data[elem_ch].fillna(filled)


def transform_types(data: pd.DataFrame,
                    change_type_columns: dict) -> pd.DataFrame:
    """
    Cast features to the requested data types.
    :param data: data frame (not modified)
    :param change_type_columns: dictionary mapping feature name to data type
    :return: new data frame with the requested types applied
    """
    # errors="raise" makes an impossible cast fail instead of being skipped
    converted = data.astype(dtype=change_type_columns, errors="raise")
    return converted


def get_bins(data: float, first_val: float, second_val: float) -> str:
    """
    Assign a single numeric value to one of three bins.
    :param data: numeric value to bin (Python or NumPy int/float scalar)
    :param first_val: upper bound (inclusive) of the "small" bin
    :param second_val: upper bound (inclusive) of the "medium" bin
    :return: "small", "medium" or "large"
    :raises AssertionError: if data is not a numeric scalar
    """
    # np.float64 subclasses float, but NumPy integer scalars (e.g. np.int64)
    # do not subclass int, so they are accepted explicitly.
    assert isinstance(
        data, (int, float, np.integer, np.floating)
    ), "Problem with data type in feature"
    if data <= first_val:
        return "small"
    if data <= second_val:
        return "medium"
    # Also reached for NaN, because every comparison with NaN is False
    return "large"


def check_columns_evaluate(data: pd.DataFrame,
                           unique_values_path: str) -> pd.DataFrame:
    """
    Check that the dataset has exactly the features stored for train and
    order its columns the same way.
    :param data: test dataset
    :param unique_values_path: path to a JSON object whose keys are the train features
    :return: test dataset with columns in the order of the JSON keys
    :raises AssertionError: if the feature sets differ; the message lists
        the missing and the unexpected features
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # A concrete list (rather than a dict_keys view) is an explicit,
    # order-preserving column indexer for pandas.
    column_sequence = list(unique_values)

    expected = set(column_sequence)
    actual = set(data.columns)
    missing = sorted(expected - actual, key=str)
    unexpected = sorted(actual - expected, key=str)
    assert not missing and not unexpected, (
        f"Different features: missing={missing}, unexpected={unexpected}"
    )
    return data[column_sequence]

In [14]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs):
    """
    Preprocessing pipeline: drop columns, validate or record the feature set,
    coerce numerics, fill gaps, cast types, add bin features and convert
    object columns to category.
    :param data: dataset (the drops return a new frame, so the argument itself is not reassigned)
    :param flg_evaluate: True to check columns against the stored train features,
        False to store the train features instead
    :param kwargs: preprocessing settings; keys read here are drop_columns,
        empty_columns, unique_values_path, target_column (train branch only),
        numerical_columns_evaluate, fillna_with_mode_columns, column_to_change,
        column_to_groupby, change_type_columns and map_bins_columns
    :return: preprocessed dataset
    """
    # Remove unwanted columns; names that are absent are ignored
    frame = data.drop(columns=kwargs["drop_columns"], errors="ignore")
    frame = frame.drop(columns=kwargs["empty_columns"], errors="ignore")

    if not flg_evaluate:
        # Train mode: persist the feature description for later evaluation.
        # NOTE(review): save_unique_train_data is not defined in the visible
        # part of this notebook - confirm it is available before using
        # flg_evaluate=False.
        save_unique_train_data(
            data=frame,
            drop_columns=kwargs["drop_columns"],
            emty_columns=kwargs["empty_columns"],
            target_column=kwargs["target_column"],
            unique_values_path=kwargs["unique_values_path"],
        )
    else:
        # Evaluate mode: require the train feature set and reorder to match it
        frame = check_columns_evaluate(
            data=frame, unique_values_path=kwargs["unique_values_path"]
        )

    # Object -> numeric, unparseable values become NaN
    change_to_numerical(
        data=frame, numerical_columns=kwargs["numerical_columns_evaluate"]
    )

    # Fill empty values with the column mode
    fillna_with_mode(
        data=frame, fillna_with_mode_columns=kwargs["fillna_with_mode_columns"]
    )

    # Fill empty values with the mode computed within groups of another column
    fillna_groupby_category_mode(
        data=frame,
        column_to_change=kwargs["column_to_change"],
        column_to_groupby=kwargs["column_to_groupby"],
    )

    # Cast to the configured data types
    frame = transform_types(
        data=frame, change_type_columns=kwargs["change_type_columns"]
    )

    bins_config = kwargs["map_bins_columns"]
    # The message asks for the binning settings to be supplied as a dict
    assert isinstance(
        bins_config, dict
    ), "Подайте тип данных для бинаризации в формате dict"

    # One "<feature>_bins" column per configured feature; each config value
    # holds the two thresholds passed to get_bins
    for feature in bins_config:

        def _to_bin(value, _feature=feature):
            return get_bins(
                value,
                first_val=bins_config[_feature][0],
                second_val=bins_config[_feature][1],
            )

        frame[f"{feature}_bins"] = frame[feature].apply(_to_bin)

    # Every remaining object column becomes a category column
    object_columns = frame.select_dtypes(["object"]).columns
    category_types = dict.fromkeys(object_columns, "category")
    frame = transform_types(data=frame, change_type_columns=category_types)

    return frame

In [15]:
# Run the pipeline in evaluate mode (flg_evaluate defaults to True); the
# 'preprocessing' section of the config is passed as keyword settings.
data_proc_test = pipeline_preprocess(data=data_test, **preproc)

# Evaluate

In [16]:
# NOTE(review): joblib.load unpickles the file and can execute arbitrary
# code - only load model artifacts from a trusted location.
model = joblib.load(training['model_path'])
# Exclude a 'predict' column left over from a previous run of this cell so
# that re-running it never feeds old predictions to the model as a feature.
features_test = data_proc_test.drop(columns=['predict'], errors='ignore')
data_proc_test['predict'] = model.predict(features_test)

In [17]:
# Preview the first four preprocessed rows together with their predictions
data_proc_test.head(4)

Unnamed: 0,city,postalCode,adType,propertyType,accountType,surfaceArea,roomsQuantity,bedroomsQuantity,floorQuantity,floor,isInCondominium,priceHasDecreased,district.libelle,district.id,surfaceArea_bins,predict
0,Paris 14e,75014,buy,flat,agency,67.0,4.0,1.0,7.0,4.0,True,False,Montsouris - Dareau,100493,medium,690403.937222
1,Paris 20e,75020,buy,flat,agency,45.0,2.0,1.0,11.0,3.0,True,False,Père Lachaise - Réunion,100446,small,421747.062051
2,Paris 20e,75020,buy,flat,agency,54.0,2.0,1.0,4.0,3.0,True,True,Ménilmontant - Amandiers,100447,small,548237.236285
3,Paris 18e,75018,buy,flat,agency,41.0,2.0,1.0,6.0,3.0,True,False,Grandes Carrières - Clichy,100466,small,441990.502222
