In [3]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import plotly.express as px
import scipy.stats as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf 

Линейная регрессия – попробуем предсказать стоимость машин и понять, от каких факторов зависит ценообразование на автомобили. 

In [8]:
cars = pd.read_csv('cars.csv')

In [9]:
#проверяем данные на пропущенные значения
cars.isna().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [10]:
#подготовлю данные для анализа, удалю ненужные предикторы
cars['company'] = cars.CarName.apply(lambda x: x.split(' ')[0])
cars=cars.drop(columns=['car_ID','CarName'])
cars.company=cars.company.replace('maxda','mazda')\
                .replace('Nissan','nissan')\
                .replace('porcshce','porsche')\
                .replace('toyouta','toyota')\
                .replace('vokswagen','volkswagen')\
                .replace('vw','volkswagen')\
                .str.lower()
cars=cars.drop(columns=['symboling','doornumber','enginelocation','carheight','fuelsystem','stroke','compressionratio','peakrpm','citympg','highwaympg'])

In [14]:
#рассчитываю корреляцию
cars_corr=cars.corr()
cars_corr.price.sort_values(ascending=False).round(2)

price         1.00
enginesize    0.87
curbweight    0.84
horsepower    0.81
carwidth      0.76
carlength     0.68
wheelbase     0.58
boreratio     0.55
Name: price, dtype: float64

In [16]:
#поправим формат данных для линейной модели
cars_dummy = pd.get_dummies(data=cars[['fueltype','aspiration','carbody','drivewheel','enginetype','cylindernumber','company']],drop_first = True)

In [17]:
cars.dtypes

fueltype           object
aspiration         object
carbody            object
drivewheel         object
wheelbase         float64
carlength         float64
carwidth          float64
curbweight          int64
enginetype         object
cylindernumber     object
enginesize          int64
boreratio         float64
horsepower          int64
price             float64
company            object
dtype: object

In [18]:
cars_for_regress=cars.drop(columns=['fueltype','aspiration','carbody','drivewheel','enginetype','cylindernumber','company'])
new_cars=pd.concat([cars_for_regress, cars_dummy], axis=1)

In [19]:
#датафрейм подготовлен для применения линейной регрессии.
new_cars

Unnamed: 0,wheelbase,carlength,carwidth,curbweight,enginesize,boreratio,horsepower,price,fueltype_gas,aspiration_turbo,...,company_nissan,company_peugeot,company_plymouth,company_porsche,company_renault,company_saab,company_subaru,company_toyota,company_volkswagen,company_volvo
0,88.6,168.8,64.1,2548,130,3.47,111,13495.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,88.6,168.8,64.1,2548,130,3.47,111,16500.0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,94.5,171.2,65.5,2823,152,2.68,154,16500.0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,99.8,176.6,66.2,2337,109,3.19,102,13950.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,99.4,176.6,66.4,2824,136,3.19,115,17450.0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,109.1,188.8,68.9,2952,141,3.78,114,16845.0,1,0,...,0,0,0,0,0,0,0,0,0,1
201,109.1,188.8,68.8,3049,141,3.78,160,19045.0,1,1,...,0,0,0,0,0,0,0,0,0,1
202,109.1,188.8,68.9,3012,173,3.58,134,21485.0,1,0,...,0,0,0,0,0,0,0,0,0,1
203,109.1,188.8,68.9,3217,145,3.01,106,22470.0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [20]:
new_cars.columns

Index(['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
       'boreratio', 'horsepower', 'price', 'fueltype_gas', 'aspiration_turbo',
       'carbody_hardtop', 'carbody_hatchback', 'carbody_sedan',
       'carbody_wagon', 'drivewheel_fwd', 'drivewheel_rwd', 'enginetype_dohcv',
       'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',
       'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve',
       'cylindernumber_two', 'company_audi', 'company_bmw', 'company_buick',
       'company_chevrolet', 'company_dodge', 'company_honda', 'company_isuzu',
       'company_jaguar', 'company_mazda', 'company_mercury',
       'company_mitsubishi', 'company_nissan', 'company_peugeot',
       'company_plymouth', 'company_porsche', 'company_renault',
       'company_saab', 'company_subaru', 'company_toyota',
       'company_volkswagen', 'company_volvo'],
      dtype='object'

Проведу линейную регрессию двух моделей: в обоих моделях зависимый предиктор будет стоимость (Price), независимый - в первой модели проведем со всеми предикторами, во второй модели ниже - уберем марки автомобилей.

In [22]:
#Зависимая переменная
Y2=new_cars.price

In [24]:
#Список независимых переменных
X2=new_cars[['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
       'boreratio', 'horsepower', 'fueltype_gas', 'aspiration_turbo',
       'carbody_hardtop', 'carbody_hatchback', 'carbody_sedan',
       'carbody_wagon', 'drivewheel_fwd', 'drivewheel_rwd', 'enginetype_dohcv',
       'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',
       'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve',
       'cylindernumber_two', 'company_audi', 'company_bmw', 'company_buick',
       'company_chevrolet', 'company_dodge', 'company_honda', 'company_isuzu',
       'company_jaguar', 'company_mazda', 'company_mercury',
       'company_mitsubishi', 'company_nissan', 'company_peugeot',
       'company_plymouth', 'company_porsche', 'company_renault',
       'company_saab', 'company_subaru', 'company_toyota',
       'company_volkswagen', 'company_volvo']]

In [25]:
X1 = sm.add_constant(X2)
model = sm.OLS(Y2,X1)  # говорим модели, что у нас ЗП, а что НП
results = model.fit()  # строим регрессионную прямую
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     81.09
Date:                Tue, 08 Feb 2022   Prob (F-statistic):           4.86e-89
Time:                        17:30:09   Log-Likelihood:                -1804.2
No. Observations:                 205   AIC:                             3702.
Df Residuals:                     158   BIC:                             3858.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -3.472e+

In [37]:
#Зависимая переменная
Y2=new_cars.price

In [27]:
#Независимые переменные,но без марок машин
X3=new_cars[['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
       'boreratio', 'horsepower','fueltype_gas', 'aspiration_turbo',
       'carbody_hardtop', 'carbody_hatchback', 'carbody_sedan',
       'carbody_wagon', 'drivewheel_fwd', 'drivewheel_rwd', 'enginetype_dohcv',
       'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',
       'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve',
       'cylindernumber_two']]

In [28]:
X2 = sm.add_constant(X3)
model2 = sm.OLS(Y2,X2)  # говорим модели, что у нас ЗП, а что НП
results = model.fit()  # строим регрессионную прямую
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     81.09
Date:                Tue, 08 Feb 2022   Prob (F-statistic):           4.86e-89
Time:                        17:31:04   Log-Likelihood:                -1804.2
No. Observations:                 205   AIC:                             3702.
Df Residuals:                     158   BIC:                             3858.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -3.472e+

Выбранная модель объясняет примерно 90% дисперсии.
Среди предикторов 10 из 27 оказались не значимыми (p > 0.05). 