In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.datasets import load_boston

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings (e.g. any raised
# by load_boston) — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

# Task 1

### 1. Create dataframes

In [2]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this notebook only runs on older releases — confirm the pinned version.
boston = load_boston()

In [3]:
# List the fields available in the loaded dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [4]:
# Pull out the feature matrix and preview its first two rows.
data = boston['data']
data[:2]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00]])

In [5]:
# Column labels for the feature matrix (used below as DataFrame column names).
feature_names = boston['feature_names']
feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [6]:
# Target values; preview the first ten.
target = boston['target']
target[:10]

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])

In [7]:
# Wrap the feature matrix in a DataFrame labelled with the feature names.
X = pd.DataFrame(data, columns=feature_names)
X.info()  # prints column dtypes and non-null counts
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [8]:
# Keep the target as a single-column DataFrame named 'price'.
y = pd.DataFrame(target, columns=['price'])
y.info()  # prints column dtypes and non-null counts
y.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   price   506 non-null    float64
dtypes: float64(1)
memory usage: 4.1 KB


Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


### 2. Split the data into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### 3. Build model

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# Linear regression estimator with default settings.
lr = LinearRegression()

In [13]:
# Fit on the training split; y_train is still a one-column DataFrame at this point.
lr.fit(X_train, y_train)

LinearRegression()

### 4. Prediction

In [14]:
# Predict prices for the held-out rows with the linear model.
y_pred = lr.predict(X_test)

In [15]:
# Preview the predictions together with their shape (a 2-D column here, see output).
y_pred[:10], y_pred.shape

(array([[28.64896005],
        [36.49501384],
        [15.4111932 ],
        [25.40321303],
        [18.85527988],
        [23.14668944],
        [17.3921241 ],
        [14.07859899],
        [23.03692679],
        [20.59943345]]),
 (152, 1))

In [16]:
# Collapse the (n, 1) prediction column into a flat 1-D vector.
y_pred = y_pred.reshape(-1)
y_pred[:10]

array([28.64896005, 36.49501384, 15.4111932 , 25.40321303, 18.85527988,
       23.14668944, 17.3921241 , 14.07859899, 23.03692679, 20.59943345])

In [17]:
# Side-by-side view of actual vs. predicted prices for the linear model.
# The row index comes from the y_test['price'] Series; y_pred is aligned by position.
check_test = pd.DataFrame({'y_test': y_test['price'], 'y_pred': y_pred})
check_test.head(10)

Unnamed: 0,y_test,y_pred
173,23.6,28.64896
274,32.4,36.495014
491,13.6,15.411193
72,22.8,25.403213
452,16.1,18.85528
76,20.0,23.146689
316,17.8,17.392124
140,14.0,14.078599
471,19.6,23.036927
500,16.8,20.599433


### 5. $R^2$

In [18]:
from sklearn.metrics import r2_score

In [19]:
# R^2 score of the linear model on the test split.
r1 = r2_score(y_test, y_pred)
r1

0.711226005748496

# Task 2

In [20]:
from sklearn.ensemble import RandomForestRegressor

### 1. Build model

In [21]:
# Random forest: 1000 trees, depth capped at 12, seeded for reproducibility.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=12,
    random_state=42,
)

In [22]:
# Convert the one-column target frame to a flat 1-D array for the forest.
# asarray + ravel is a no-op on an already-flat array, so this cell can be re-run
# safely; the previous `.values[:, 0]` raised on a second run because y_train was
# by then a 1-D ndarray.
y_train = np.asarray(y_train).ravel()

In [23]:
# Fit the forest on the training split using the flattened target.
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)

### 2. Prediction

In [24]:
# Predict prices for the held-out rows with the forest (overwrites the linear model's y_pred).
y_pred = model.predict(X_test)

In [25]:
# Flatten to 1-D and preview the first ten predictions.
# NOTE(review): presumably already 1-D because the forest was fit on a flat target,
# which would make this flatten redundant — confirm.
y_pred = y_pred.flatten()
y_pred[:10]

array([22.80641237, 31.13146352, 16.33912494, 23.81072598, 17.13952074,
       21.8322837 , 19.89574701, 14.75411774, 21.2408346 , 20.89865761])

In [26]:
# Side-by-side view of actual vs. predicted prices for the forest.
# The row index comes from the y_test['price'] Series; y_pred is aligned by position.
check_test = pd.DataFrame({'y_test': y_test['price'], 'y_pred': y_pred})
check_test.head(10)

Unnamed: 0,y_test,y_pred
173,23.6,22.806412
274,32.4,31.131464
491,13.6,16.339125
72,22.8,23.810726
452,16.1,17.139521
76,20.0,21.832284
316,17.8,19.895747
140,14.0,14.754118
471,19.6,21.240835
500,16.8,20.898658


### 3. $R^2$

In [27]:
# R^2 score of the random forest on the test split (r2 = second model, not "R squared").
r2 = r2_score(y_test, y_pred)
r2

0.87472606157312

In [30]:
r2 - r1  # The second model's predictions are better

0.163500055824624

# Task 3

In [35]:
# IPython help lookup for the estimator (interactive only; not valid plain Python).
# NOTE(review): leftover exploration — consider removing from the final notebook.
?RandomForestRegressor

In [42]:
# Per-feature importance scores from the fitted forest.
importances = model.feature_importances_
importances

array([0.03167574, 0.00154252, 0.00713813, 0.00123624, 0.01426897,
       0.40268179, 0.01429864, 0.06397257, 0.00528122, 0.01152493,
       0.01808108, 0.01245085, 0.41584732])

In [43]:
# Sanity check: total of the importance scores.
importances.sum()

1.0

In [55]:
# Number of importance scores.
len(importances)

13

In [56]:
# Number of feature names, to compare with the number of importance scores.
len(feature_names)

13

In [54]:
# Pair each feature name with its importance score; dict insertion order fixes the column order.
imp_df = pd.DataFrame({'feature_names': feature_names, 'importances': importances})
# Rank by importance (highest first, fresh 0..n-1 index) and show the top two.
imp_df.sort_values('importances', ascending=False, ignore_index=True).head(2)

Unnamed: 0,feature_names,importances
0,LSTAT,0.415847
1,RM,0.402682
