In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Загрузка данных

In [None]:
%%capture
!wget https://www.dropbox.com/s/64ol9q9ssggz6f1/data_ford_price.xlsx

In [2]:
# Read the Ford price spreadsheet from the data/ folder into a DataFrame.
data = pd.read_excel(io='data/data_ford_price.xlsx')

#  Отбор признаков: мотивация

## Предобработка данных

In [3]:
# Keep the target (`price`) plus the candidate features and discard rows with
# missing values. Chaining `.dropna()` (instead of `inplace=True` on the column
# selection) avoids the SettingWithCopyWarning some pandas versions emit here.
data = data[['price', 'year', 'cylinders', 'odometer', 'lat', 'long', 'weather']].dropna()

# Target vector and feature matrix.
y = data['price']
x = data.drop(columns='price')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

## Обучение модели

In [4]:
# Fit a linear regression on the training split
# (`fit` returns the estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out split with the mean absolute error.
y_predicted = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print('MAE: %.3f' % mae)

MAE: 4682.957


## Удаление избыточного признака

In [45]:
# Rebuild the feature matrix without the target and without the redundant `lat`
# column. Deriving it from `data` (rather than dropping from `x` in place) keeps
# the cell idempotent: re-running it cannot fail with KeyError on a missing `lat`.
x = data.drop(columns=['price', 'lat'])

In [46]:
# Split the reduced feature matrix, using the same test fraction and seed
# as the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

In [47]:
# Refit the linear regression on the current training split.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the mean absolute error.
y_predicted = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print('MAE: %.3f' % mae)

MAE: 4672.930


#  Отбор признаков: классификация методов

## Метод рекурсивного исключения признаков

In [48]:
from sklearn.feature_selection import RFE

In [49]:
# Rebuild the target and the full feature matrix (every column except `price`)
# from `data`.
y = data['price']
x = data.drop('price', axis=1)

In [50]:
# Split the current feature matrix, using the same test fraction and seed
# as the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

In [51]:
# Recursive feature elimination around a linear model: remove one feature per
# iteration (step=1) until three remain.
estimator = LinearRegression()
selector = RFE(estimator=estimator, n_features_to_select=3, step=1).fit(X_train, y_train)

# Display the names of the columns the selector kept.
selector.get_feature_names_out()

array(['year', 'cylinders', 'lat'], dtype=object)

In [39]:
# Preview the first training rows restricted to the columns kept by the selector.
X_train.loc[:, selector.get_feature_names_out()].head()

Unnamed: 0,year,cylinders,lat
4641,2011,6,31.316064
3188,2010,8,42.892121
135,2008,8,35.887189
962,2011,8,43.78124
6188,2006,8,42.484503


Обучаем модель на отобранных признаках

In [52]:
# Names of the columns kept by the fitted selector.
selected_features = selector.get_feature_names_out()

# Refit the linear regression using only those columns.
model = LinearRegression().fit(X_train[selected_features], y_train)

# Evaluate on the same column subset of the test split.
y_predicted = model.predict(X_test[selected_features])
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print('MAE: %.3f' % mae)

MAE: 5096.570


## Методы выбора признаков на основе фильтров

In [41]:
from sklearn.feature_selection import SelectKBest, f_regression

In [42]:
# Univariate filter: score every feature against the target with `f_regression`
# and keep the three best-scoring ones.
selector = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)

# Display the names of the columns that passed the filter.
selector.get_feature_names_out()

array(['year', 'cylinders', 'odometer'], dtype=object)

In [43]:
# Preview the first training rows restricted to the columns kept by the selector.
X_train.loc[:, selector.get_feature_names_out()].head()

Unnamed: 0,year,cylinders,odometer
4641,2011,6,122882
3188,2010,8,144570
135,2008,8,136124
962,2011,8,136668
6188,2006,8,187000


Обучаем модель на отобранных признаках

In [44]:
# Names of the columns kept by the fitted selector.
selected_features = selector.get_feature_names_out()

# Refit the linear regression using only those columns.
model = LinearRegression().fit(X_train[selected_features], y_train)

# Evaluate on the same column subset of the test split.
y_predicted = model.predict(X_test[selected_features])
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print('MAE: %.3f' % mae)

MAE: 4708.946


Вывод: признаки, отобранные на основе фильтров (коэффициент корреляции Пирсона), показали результат лучше, чем признаки, отобранные методом рекурсивного исключения признаков (RFE): значения MAE — 4708.946 против 5096.570 соответственно.
При этом оба варианта уступают модели, обученной на всех признаках (MAE 4682.957).

