## Imports

In [None]:
# basics
import pandas as pd
import numpy as np

# feature_importance
import shap

# viz
import matplotlib.pyplot as plt

# utils
import os

## Parameters

In [None]:
# Column whose distribution is inspected for outliers.
target = "price"

# Data directories, resolved relative to the current working directory.
path_root = os.path.join("..", "data")
path_primary = os.path.join(path_root, "03_primary")
path_model = os.path.join(path_root, "04_model")

# Input dataset and the destination of the outlier-free copy.
file_path_input_data = os.path.join(path_primary, "data_input.csv")
file_path_not_outliers = os.path.join(path_primary, "data_not_outliers.csv")

## Methods

In [None]:
def detect_outliers(data, p, factor=1.5):
    """Return the lower and upper fences used to flag outliers in ``data``.

    The fences are derived from the ``p``-th and ``(100 - p)``-th percentiles
    of ``data``: the distance between those two percentiles (the IQR when
    ``p == 25``) is scaled by ``factor`` and subtracted from the lower
    percentile / added to the upper percentile.

    Args:
        data: Array-like of numeric values accepted by ``np.percentile``.
        p: Lower percentile, in the range [0, 50]. The upper percentile is
            ``100 - p``.
        factor: Multiplier applied to the inter-percentile range. Defaults
            to 1.5, the value previously hard-coded here.

    Returns:
        A ``(lower_limit, upper_limit)`` tuple.

    Raises:
        ValueError: If ``p`` is outside [0, 50]. A larger ``p`` would swap the
            two percentiles and produce a lower limit above the upper limit.
    """
    if not 0 <= p <= 50:
        raise ValueError(f"p must be between 0 and 50, got {p!r}")

    # One call computes both percentiles over the same data.
    low_pct, high_pct = np.percentile(data, [p, 100 - p])
    spread = high_pct - low_pct

    lower_limit = low_pct - factor * spread
    upper_limit = high_pct + factor * spread
    return lower_limit, upper_limit

## Read dataset

In [None]:
# Load the primary dataset; the first CSV column becomes the row index.
data_input = pd.read_csv(file_path_input_data, index_col=0)

## Outlier visualization

In [None]:
# Fences computed from the target column with p = 25
# (detect_outliers uses the p-th and (100 - p)-th percentiles).
target_values = data_input[target].values
lower_limit, upper_limit = detect_outliers(target_values, p=25)

In [None]:
# Read the target column once and remember each value's original row position.
target_values = data_input[target].values
row_positions = np.arange(len(target_values))

# Boolean masks replace the per-element list scans. The two masks are built
# independently so that NaN values (which fail every comparison) land in
# neither group.
outlier_mask = (target_values < lower_limit) | (target_values > upper_limit)
inlier_mask = (target_values >= lower_limit) & (target_values <= upper_limit)

outliers = target_values[outlier_mask]
non_outliers = target_values[inlier_mask]

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Dashed red lines mark the lower and upper fences.
axes[0].axhline(y=lower_limit, color='red', linestyle='--')
axes[0].axhline(y=upper_limit, color='red', linestyle='--')

# Both groups are drawn at their original row positions so they share a
# consistent x-axis (previously the non-outliers used a compacted 0..k-1
# index while the outliers used original positions).
axes[0].plot(row_positions[inlier_mask],
             non_outliers,
             '*')
axes[0].plot(row_positions[outlier_mask],
             outliers,
             '*',
             color='red',
             label="Outlier")

axes[1].hist(target_values)
axes[1].grid()
axes[0].grid()

axes[1].set_title(f"Histogram of {target}")
axes[0].set_title(f"Scatter of {target} (n_outliers = {round(100*len(outliers)/data_input.shape[0],2)} %)")
axes[0].set_ylabel(f"{target}")
axes[0].legend()

plt.show()

In [None]:
# Keep only rows whose target lies within the fences (both bounds inclusive),
# then renumber the rows from zero.
within_limits = data_input[target].between(lower_limit, upper_limit)
data_not_outliers = data_input[within_limits].reset_index(drop=True)

## Not outliers

In [None]:
# Outliers are recomputed from the full dataset (only their count is used, in
# the title); the plotted points come from the already-filtered frame.
all_values = data_input[target].values
kept_values = data_not_outliers[target].values
outliers = [v for v in all_values if v < lower_limit or v > upper_limit]
non_outliers = [v for v in kept_values if lower_limit <= v <= upper_limit]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
scatter_ax, hist_ax = axes

# Dashed red lines mark the lower and upper fences.
for fence in (lower_limit, upper_limit):
    scatter_ax.axhline(y=fence, color='red', linestyle='--')

scatter_ax.plot(list(range(len(non_outliers))), non_outliers, '*')

hist_ax.hist(kept_values)
hist_ax.grid()
scatter_ax.grid()

hist_ax.set_title(f"Histogram of {target}")
scatter_ax.set_title(f"Scatter of {target} (n_outliers = {round(100*len(outliers)/data_input.shape[0],2)} %)")
scatter_ax.set_ylabel(f"{target}")

plt.show()

## Save not outliers

In [None]:
# Persist the outlier-free rows; pandas writes the row index as the first
# column by default.
data_not_outliers.to_csv(path_or_buf=file_path_not_outliers)