In [35]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import make_pipeline # используем пайплайны для удобства
from sklearn.preprocessing import StandardScaler

### Создадим искусственный датасет на 3 000 наблюдений

In [19]:
samples = 3000
SEED = 42  # fixed seed so that Restart & Run All reproduces exactly the same dataset

# A dedicated, seeded generator keeps the synthetic data reproducible
# without touching NumPy's global random state.
rng = np.random.default_rng(SEED)

# flat area: integers drawn uniformly from [0, 200)
square = rng.integers(0, 200, size=samples)
# district code: integers drawn uniformly from [0, 15)
area = rng.integers(0, 15, size=samples)
# floor: integers drawn uniformly from [0, 10)
floor = rng.integers(0, 10, size=samples)
# flat price: a function of area and floor only (the district code does not enter the formula)
flat_price = square * floor * 100 + 250

data = pd.DataFrame({'flat_price': flat_price, 'square': square, 'area': area, 'floor': floor})
data.head(5)

Unnamed: 0,flat_price,square,area,floor
0,121350,173,1,7
1,86650,96,14,9
2,3550,11,5,3
3,250,76,1,0
4,14250,140,13,1


In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of `data`
data.describe()

Unnamed: 0,flat_price,square,area,floor,utility
count,3000.0,3000.0,3000.0,3000.0,3000.0
mean,43903.866667,98.500333,7.018333,4.385333,686.538667
std,41603.709385,57.267459,4.336542,2.883567,416.037094
min,250.0,0.0,0.0,0.0,250.0
25%,9250.0,49.0,3.0,2.0,340.0
50%,32250.0,98.0,7.0,4.0,570.0
75%,68075.0,147.0,11.0,7.0,928.25
max,179350.0,199.0,14.0,9.0,2041.0


### Вариант 1: Обучим модель на этих данных

In [54]:
# Variant 1: ordinary least squares on the three raw features.
x = data[['square', 'area', 'floor']]
y = data['flat_price']
reg = LinearRegression().fit(x, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predictions are made on the same frame the model was fitted on,
# so the numbers below are in-sample (training) metrics.
pred_values = reg.predict(x)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))
print('Score: {}'.format(reg.score(x, y)))

Weights: [ 432.98084951  -48.49078484 9847.03265095]
Bias: -41587.08736397824
Error: 12292.403632883708
Score: 0.8437213502101185


In [55]:
# Median of the target column (flat_price); presumably shown as a reference scale for the MAE printed above.
y.median()

32250.0

### Вариант 2: Создадим новый признак, значение которого в разы больше square, area, floor, и обучим модель

In [25]:
# Engineered feature: the area-by-floor product shifted by 250.
# Since flat_price = square * floor * 100 + 250, the target is an exact
# linear function of this column: flat_price = 100 * utility - 24750.
area_times_floor = data['square'].mul(data['floor'])
data['utility'] = area_times_floor.add(250)
data.head()

Unnamed: 0,flat_price,square,area,floor,utility
0,121350,173,1,7,1461
1,86650,96,14,9,1114
2,3550,11,5,3,283
3,250,76,1,0,250
4,14250,140,13,1,390


In [86]:
# Rows whose `square` value is exactly 30
is_square_30 = data['square'].eq(30)
data[is_square_30]

Unnamed: 0,flat_price,square,area,floor,utility
302,9250,30,7,3,340
392,3250,30,10,1,280
512,6250,30,0,2,310
734,9250,30,3,3,340
1992,27250,30,4,9,520
2101,9250,30,6,3,340
2303,9250,30,3,3,340
2441,3250,30,7,1,280
2720,27250,30,11,9,520
2742,27250,30,11,9,520


In [56]:
# Variant 2: same model, with the engineered `utility` column added to the features.
x1 = data[['square', 'area', 'floor', 'utility']]
y1 = data['flat_price']
reg1 = LinearRegression().fit(x1, y1)
print('Weights: {}'.format(reg1.coef_))
print('Bias: {}'.format(reg1.intercept_))

# Predictions are made on the same frame the model was fitted on,
# so the numbers below are in-sample (training) metrics.
pred_values_1 = reg1.predict(x1)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('Error: {}'.format(mean_absolute_error(y1, pred_values_1)))
print('Score: {}'.format(reg1.score(x1, y1)))

Weights: [ 5.12989074e-13 -3.76587650e-12 -1.44047968e-12  1.00000000e+02]
Bias: -24749.99999999997
Error: 2.5487679522484542e-11
Score: 1.0


In [57]:
# Median of the target column (flat_price); presumably shown as a reference scale for the MAE printed above.
y1.median()

32250.0

### Вариант 3: Используем StandardScaler, чтобы привести все наши x данные к одному масштабу

In [61]:
# Variant 3: standardise the four features, then fit the regression on the scaled matrix.
x2 = data.loc[:, ['square', 'area', 'floor', 'utility']]
y2 = data.loc[:, 'flat_price']

# fit_transform() estimates the per-column statistics on x2 and returns the scaled array.
scaler = StandardScaler()
x2_standardized = scaler.fit_transform(x2)

reg2 = LinearRegression()
reg2.fit(x2_standardized, y2)

In [63]:
print('Weights: {}'.format(reg2.coef_))
print('Bias: {}'.format(reg2.intercept_))

# Reuse the scaler that was fitted together with the model: transform() applies
# the already-estimated statistics, whereas fit_transform() would re-estimate them
# from whatever data is passed in at prediction time.
x2_scaled = scaler.transform(x2)
pred_values_2 = reg2.predict(x2_scaled)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('Error: {}'.format(mean_absolute_error(y2, pred_values_2)))
print('Score: {}'.format(reg2.score(x2_scaled, y2)))

Weights: [ 3.97492600e-12  0.00000000e+00 -1.84172677e-11  4.15967749e+04]
Bias: 43903.866666666676
Error: 3.3551865878204506e-11
Score: 1.0


In [65]:
# Median of the target column (flat_price); presumably shown as a reference scale for the MAE printed above.
y2.median()

32250.0

### Вариант 4: Добавим в столбец с площадью квартиры / помещения несколько выбросов по 5000 квадратных метров. Посмотрим на статистики и обучим модель

In [129]:
# Work on an explicit copy: a plain assignment would only create a second name
# for the same DataFrame, so any in-place change to data_new would also alter `data`.
data_new = data.copy()

In [130]:
# Inject outliers: every `square` value equal to 55 becomes 5000.
# Only the `square` column is mapped; the other columns keep their values.
square_outlier_map = {'square': {55: 5000}}
data_new = data_new.replace(to_replace=square_outlier_map)

In [132]:
# Rows whose `square` value is exactly 5000 (the injected outliers)
is_outlier_square = data_new['square'].eq(5000)
data_new[is_outlier_square]

Unnamed: 0,flat_price,square,area,floor,utility
1023,44250,5000,1,8,690
1079,250,5000,9,0,250
1441,44250,5000,12,8,690
1478,250,5000,0,0,250
1603,38750,5000,10,7,635
2249,250,5000,6,0,250


In [133]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of `data_new`
data_new.describe()

Unnamed: 0,flat_price,square,area,floor,utility
count,3000.0,3000.0,3000.0,3000.0,3000.0
mean,43903.866667,108.390333,7.018333,4.385333,686.538667
std,41603.709385,226.369943,4.336542,2.883567,416.037094
min,250.0,0.0,0.0,0.0,250.0
25%,9250.0,49.0,3.0,2.0,340.0
50%,32250.0,98.0,7.0,4.0,570.0
75%,68075.0,147.0,11.0,7.0,928.25
max,179350.0,5000.0,14.0,9.0,2041.0


In [134]:
# Variant 4: unscaled regression on the frame that contains the `square` outliers.
x3 = data_new[['square', 'area', 'floor', 'utility']]
y3 = data_new['flat_price']
reg3 = LinearRegression().fit(x3, y3)
print('Weights: {}'.format(reg3.coef_))
print('Bias: {}'.format(reg3.intercept_))

# Predictions are made on the same frame the model was fitted on,
# so the numbers below are in-sample (training) metrics.
pred_values_3 = reg3.predict(x3)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('Error: {}'.format(mean_absolute_error(y3, pred_values_3)))
print('Score: {}'.format(reg3.score(x3, y3)))

Weights: [-2.00673323e-14 -3.55271368e-13 -2.56177805e-12  1.00000000e+02]
Bias: -24749.999999999956
Error: 1.903633043790857e-11
Score: 1.0


### Вариант 5: Применим к новому датафрейму с выбросами StandardScaler

In [135]:
# Variant 5: standardise the features of the outlier frame, then fit the regression.
x4 = data_new.loc[:, ['square', 'area', 'floor', 'utility']]
y4 = data_new.loc[:, 'flat_price']

# fit_transform() estimates the per-column statistics on x4 and returns the scaled array.
scaler = StandardScaler()
x4_standardized = scaler.fit_transform(x4)

reg4 = LinearRegression()
reg4.fit(x4_standardized, y4)

In [136]:
print('Weights: {}'.format(reg4.coef_))
print('Bias: {}'.format(reg4.intercept_))

# Reuse the scaler that was fitted together with the model: transform() applies
# the already-estimated statistics, whereas fit_transform() would re-estimate them
# from whatever data is passed in at prediction time.
x4_scaled = scaler.transform(x4)
pred_values_4 = reg4.predict(x4_scaled)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('Error: {}'.format(mean_absolute_error(y4, pred_values_4)))
print('Score: {}'.format(reg4.score(x4_scaled, y4)))

Weights: [-1.06767277e-11 -7.27595761e-12 -7.67386155e-12  4.15967749e+04]
Bias: 43903.866666666676
Error: 2.3572890010351936e-11
Score: 1.0


#### Данный вариант похож на вариант 3: те же коэффициенты при признаках, то же смещение и та же (практически нулевая) ошибка.