Hi! In this notebook, we will apply algorithms to remove outliers from the data to train a more efficient model.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(3)

%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
plotBlue = sns.color_palette()[0]
mpl.rcParams.update({'font.size': 14})
np.random.seed(3)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import iqr

# Generate data

Let's generate data with some outliers.

In [None]:
size = 50
sigma = 5
space = np.linspace(1, 100, size)

# Noisy x over the grid, and a noisy quadratic y over the same grid.
x = space + np.random.normal(0, sigma, size)
y = (space + np.random.normal(0, sigma, size)) ** 2 + 10

# anomaly indices: pick a few random samples and push their y far down
rnd_idxs = np.random.choice(np.arange(size), size // 15, replace=False)
y[rnd_idxs] -= np.random.normal(10000, sigma * 10, len(rnd_idxs))

# add extra outliers: overwrite the last two samples with hand-picked points
x[-2:] = [200, 120]
y[-2:] = [0, -10000]

# Record the two hand-made outliers as ground truth as well; union1d adds
# them only if they are missing and returns the indices sorted.
rnd_idxs = np.union1d(rnd_idxs, [size - 2, size - 1])

# Scatter of the raw generated sample, outliers included.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y)
ax.set_title('generated data')
plt.show()

In [None]:
# One histogram per coordinate, stacked vertically: x on top, y below.
f, axes = plt.subplots(nrows=2, figsize=(10, 6))
for hist_ax, values, hist_title in zip(axes, (x, y), ('x hist', 'y hist')):
    hist_ax.hist(values, bins=50)
    hist_ax.set_title(hist_title)
plt.show()

We can see our outliers in the histograms. We will remove these outliers.

# LR

In [None]:
# scikit-learn expects a 2-D feature matrix, so x is reshaped to one column.
x_column = x.reshape(-1, 1)

# Baseline: ordinary least squares on the full sample, outliers included.
lr = LinearRegression().fit(x_column, y)

# Training-set MSE of the baseline model. MSE is symmetric in its two
# arguments, so passing (y_true, y_pred) in that order gives the same value.
lr_mean_error = mean_squared_error(y, lr.predict(x_column))

In [None]:
# Grid spanning the whole x range (including the x = 200 outlier) on which
# the fitted line is evaluated for plotting.
plot_space = np.linspace(1, 200, size)
lr_line = lr.predict(plot_space.reshape(-1, 1))

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, label='data')
ax.plot(plot_space, lr_line, color='red', label='LR')
ax.legend()
ax.set_title('regression')
plt.show()

We see that the straight line obtained with regression has a very strong slope - the outliers do not allow us to get a model that generalizes well.

# Task (1/2 points): detect outliers using three-sigma rule

In [None]:
# find anomaly indices using the three-sigma rule (separately for x and separately for y)
# Three-sigma rule: a value is an outlier when it lies more than three
# standard deviations away from the mean of its own coordinate.

y_std = np.std(y)
y_mean = np.mean(y)
x_std = np.std(x)
x_mean = np.mean(x)

# flatnonzero turns the boolean "is outlier" mask into sorted positions.
x_outlier_idxs = np.flatnonzero(np.abs(x - x_mean) > 3 * x_std)
y_outlier_idxs = np.flatnonzero(np.abs(y - y_mean) > 3 * y_std)

In [None]:
# Merge the per-coordinate detections into one sorted, duplicate-free list.
outlier_idxs = sorted(set(y_outlier_idxs).union(x_outlier_idxs))

# Show the ground truth next to what was detected.
for report_label, report_idxs in (
    ('true outlier idxs:       ', rnd_idxs),
    ('x outlier idxs:          ', x_outlier_idxs),
    ('y outlier idxs:          ', y_outlier_idxs),
    ('all founded outlier idxs:', outlier_idxs),
):
    print(report_label, list(report_idxs))

In [None]:
# Recompute the merged detection so this check does not depend on the
# reporting cell having been run.
outlier_idxs = sorted(set(y_outlier_idxs).union(x_outlier_idxs))

# The detected indices must match the generated outliers exactly.
expected_idxs = rnd_idxs.tolist()
assert outlier_idxs == expected_idxs

# Task (1/2 points): detect outliers using IQR

In [None]:
# find anomaly indices using the IQR (separately for x and separately for y)
# please do not use anything other than numpy from the external libraries
# IQR rule: a value is an outlier when it falls below Q1 - 1.5 * IQR or
# above Q3 + 1.5 * IQR, where IQR = Q3 - Q1.

x_q1, x_q3 = np.percentile(x, [25, 75])
x_iqr = x_q3 - x_q1

y_q1, y_q3 = np.percentile(y, [25, 75])
y_iqr = y_q3 - y_q1

# Cross-check against SciPy. Compared with a tolerance rather than ==,
# because two independently computed floats need not match bit-for-bit.
assert np.isclose(iqr(x), x_iqr)
assert np.isclose(iqr(y), y_iqr)

# flatnonzero turns the boolean "outside the fences" mask into sorted positions.
x_outlier_idxs = np.flatnonzero((x < x_q1 - 1.5 * x_iqr) | (x > x_q3 + 1.5 * x_iqr))
y_outlier_idxs = np.flatnonzero((y < y_q1 - 1.5 * y_iqr) | (y > y_q3 + 1.5 * y_iqr))

In [None]:
# Merge the per-coordinate detections into one sorted, duplicate-free list.
outlier_idxs = sorted(set(y_outlier_idxs).union(x_outlier_idxs))

# Show the ground truth next to what was detected.
for report_label, report_idxs in (
    ('true outlier idxs:       ', rnd_idxs),
    ('x outlier idxs:          ', x_outlier_idxs),
    ('y outlier idxs:          ', y_outlier_idxs),
    ('all founded outlier idxs:', outlier_idxs),
):
    print(report_label, list(report_idxs))

In [None]:
# Recompute the merged detection so this check does not depend on the
# reporting cell having been run.
outlier_idxs = sorted(set(y_outlier_idxs).union(x_outlier_idxs))

# The detected indices must match the generated outliers exactly.
expected_idxs = rnd_idxs.tolist()
assert outlier_idxs == expected_idxs

# LR without outliers

In [None]:
# Boolean keep-mask over the samples: start with everything kept, then
# switch off the positions that were detected as outliers.
mask = np.full(y.size, True)
mask[outlier_idxs] = False

# Targets with the detected outliers removed.
y_cleaned = y[mask]

In [None]:
# Keep only the inliers; features are reshaped to the single-column 2-D
# layout that scikit-learn expects.
x_kept = x[mask].reshape(-1, 1)
y_kept = y[mask]

# Same model as the baseline, fitted on the cleaned sample only.
lr_cleaned = LinearRegression().fit(x_kept, y_kept)

# Training-set MSE on the cleaned sample (MSE is symmetric in its arguments).
lr_clean_mean_error = mean_squared_error(y_kept, lr_cleaned.predict(x_kept))

In [None]:
# Compare the baseline fit with the fit obtained after removing outliers.
plt.figure(figsize=(10,6))
plt.scatter(x, y, label='data')
# Both lines must be evaluated on the same grid they are plotted against.
# Predicting on `space` (1..100) while plotting at `plot_space` (1..200)
# would draw the baseline line at the wrong x-positions.
plt.plot(plot_space, lr.predict(plot_space.reshape(-1, 1)), color='red', label='LR')
plt.plot(plot_space, lr_cleaned.predict(plot_space.reshape(-1, 1)), color='green', label='LR cleaned')
plt.legend()
plt.title('regression')
plt.show()

By detecting and removing the outliers, we now have an excellent regression model that generalizes well to the data. Congrats! 🎉