In [200]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Загрузка данных

In [201]:
%%capture
!wget https://www.dropbox.com/s/64ol9q9ssggz6f1/data_ford_price.xlsx

In [202]:
data = pd.read_excel('data/data_ford_price.xlsx') 

---
#  Отбор признаков: мотивация
---

## Предобработка данных

In [203]:
data = data[['price','year', 'cylinders', 'odometer', 'lat', 'long', 'weather']]
data.dropna(inplace = True)

y = data['price']
x = data.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

## Обучение модели

In [204]:
model = LinearRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
 
mae = mean_absolute_error(y_test, y_predicted)
print('MAE: %.3f' % mae)

MAE: 4682.957


## Удаление избыточного признака

In [205]:
x.drop('lat', axis = 1, inplace = True)

In [206]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

In [207]:
model = LinearRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
 
mae = mean_absolute_error(y_test, y_predicted)
print('MAE: %.2f' % mae)

MAE: 4672.93


## Метод рекурсивного исключения признаков

In [208]:
from sklearn.feature_selection import RFE

In [209]:
y = data['price']
x = data.drop(columns='price')

In [210]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

In [211]:
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X_train, y_train)
 
selector.get_feature_names_out()

array(['year', 'cylinders', 'lat'], dtype=object)

In [212]:
X_train.columns

Index(['year', 'cylinders', 'odometer', 'lat', 'long', 'weather'], dtype='object')

In [213]:
selector.ranking_

array([1, 1, 4, 1, 3, 2])

---
##  МЕТОДЫ ВЫБОРА ПРИЗНАКОВ НА ОСНОВЕ ФИЛЬТРОВ
---

In [214]:
from sklearn.feature_selection import SelectKBest, f_regression

In [215]:
selector = SelectKBest(f_regression, k=3)
selector.fit_transform(X_train, y_train)
 
selector.get_feature_names_out()

array(['year', 'cylinders', 'odometer'], dtype=object)

---
---

Задание 9.5. Модуль ML-6 (HW-03)
Обучите модель линейной регрессии на найденных двумя способами трёх важных признаках и сравните полученные результаты. Загрузите полученный ноутбук (в формате IPYNB) в форму ниже.

КРИТЕРИИ ОЦЕНИВАНИЯ:

* 1 балл	Верно выделены три столбца-признака для обучения, выбранные RFE.
* 1 балл	Верно выделены три столбца-признака для обучения, выбранные SelectKBest.
* 3 балла	Обучена регрессия на первых трёх столбцах, оценено качество модели на тесте.
* 3 балла	Обучена регрессия на вторых трёх столбцах, оценено качество модели на тесте.
* 2 балла	Произведено сравнение выбранных метрик в форме комментария. Дан ответ на вопрос «Какой метод отбора признаков показал наилучший результат на тестовой выборке?» (в текстовой ячейке).

In [216]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [217]:
data = pd.read_excel('data/data_ford_price.xlsx')

In [218]:
data = data[['price','year', 'cylinders', 'odometer', 'lat', 'long', 'weather']]
data.dropna(inplace = True)

In [219]:
y = data['price']
x = data.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

lr_full = LinearRegression()
lr_full = lr_full.fit(X_train, y_train)

y_test_pred = lr_full.predict(X_test)

print('Test-full MAE: {:.2f}'.format(round(mean_absolute_error(y_test, y_test_pred))))

Test-full MAE: 4683.00


In [220]:
x_motiv = x.drop('lat', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(x_motiv, y, test_size=0.3, random_state=40)

motiv_model = LinearRegression()
motiv_model.fit(X_train, y_train)
y_predicted = motiv_model.predict(X_test)
 
mae = mean_absolute_error(y_test, y_predicted)
print('Мотивация (MAE): %.2f' % mae)

Мотивация (MAE): 4672.93


In [221]:
from sklearn.feature_selection import RFE

estimator = LinearRegression()
rfe_selector = RFE(estimator, n_features_to_select=3, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)
 
rfe_cols = rfe_selector.get_feature_names_out()

In [222]:
y = data['price']
x = data[rfe_cols]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

rfe_model = LinearRegression()
rfe_model = rfe_model.fit(X_train, y_train)

y_test_pred = rfe_model.predict(X_test)

print('Test MAE: {:.3f}'.format(round(mean_absolute_error(y_test, y_test_pred))))

Test MAE: 5113.000


In [1]:
from sklearn.feature_selection import SelectKBest, f_regression

skb_selector = SelectKBest(f_regression, k=3)
skb_selector.fit_transform(X_train, y_train)
 
skb_cols = skb_selector.get_feature_names_out()

NameError: name 'X_train' is not defined

In [224]:
y = data['price']
x = data[skb_cols]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

lin_reg = LinearRegression()
lin_reg = lin_reg.fit(X_train, y_train)
y_test_pred = lin_reg.predict(X_test)

print('Test MAE: {:.3f}'.format(round(mean_absolute_error(y_test, y_test_pred))))

Test MAE: 5113.000
