In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.datasets import load_boston

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Task 1

### 1. Create dataframes

In [2]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn versions.
boston = load_boston()

In [3]:
# List the fields available in the loaded dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [4]:
# Feature matrix as returned by the loader; peek at the first two rows.
data = boston['data']
data[:2]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00]])

In [5]:
# Feature names; used below as the column labels of the feature DataFrame.
feature_names = boston['feature_names']
feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [6]:
# Target values (stored below under the column name 'price'); show the first ten.
target = boston['target']
target[:10]

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])

In [7]:
# Wrap the feature matrix in a DataFrame with named columns, then inspect it.
X = pd.DataFrame(data, columns=feature_names)
X.info()  # prints dtypes and non-null counts
X.head()  # last expression: rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [8]:
# Target as a one-column DataFrame named 'price'.
# NOTE(review): because y is 2-D here, the linear model fitted below returns
# predictions of shape (n, 1), which is why they are flattened afterwards.
y = pd.DataFrame(target, columns=['price'])
y.info()
y.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   price   506 non-null    float64
dtypes: float64(1)
memory usage: 4.1 KB


Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


### 2. Splitting the sample into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

### 3. Build model

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# Linear regression model with default settings.
lr = LinearRegression()

In [13]:
# Fit on the training split (y_train is still a one-column DataFrame here).
lr.fit(X_train, y_train)

LinearRegression()

### 4. Prediction

In [14]:
# Predict prices for the held-out test rows.
y_pred = lr.predict(X_test)

In [15]:
# Inspect the first ten predictions and the overall shape of the result.
y_pred[:10], y_pred.shape

(array([[28.64896005],
        [36.49501384],
        [15.4111932 ],
        [25.40321303],
        [18.85527988],
        [23.14668944],
        [17.3921241 ],
        [14.07859899],
        [23.03692679],
        [20.59943345]]),
 (152, 1))

In [16]:
# Collapse the (n, 1) prediction array to 1-D so it can sit next to y_test.
y_pred = y_pred.flatten()
y_pred[:10]

array([28.64896005, 36.49501384, 15.4111932 , 25.40321303, 18.85527988,
       23.14668944, 17.3921241 , 14.07859899, 23.03692679, 20.59943345])

In [17]:
# Side-by-side table of actual vs. predicted prices for the linear model.
check_test = pd.DataFrame(dict(y_test=y_test['price'], y_pred=y_pred))
check_test.head(n=10)

Unnamed: 0,y_test,y_pred
173,23.6,28.64896
274,32.4,36.495014
491,13.6,15.411193
72,22.8,25.403213
452,16.1,18.85528
76,20.0,23.146689
316,17.8,17.392124
140,14.0,14.078599
471,19.6,23.036927
500,16.8,20.599433


### 5. $R^2$

In [18]:
from sklearn.metrics import r2_score

In [19]:
# R^2 score of the linear model on the test split.
r1 = r2_score(y_test, y_pred)
r1

0.711226005748496

# Task 2

In [20]:
from sklearn.ensemble import RandomForestRegressor

### 1. Build model

In [21]:
# Random forest with 1000 trees, depth capped at 12, and a fixed seed.
model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)

In [22]:
# Flatten the one-column target to a 1-D array before fitting the forest.
# np.ravel accepts both the original one-column DataFrame and an
# already-flattened array, so re-running this cell is safe. The previous
# `.values[:, 0]` raised AttributeError on a second run because y_train
# had already been rebound to an ndarray.
y_train = np.ravel(y_train)

In [23]:
# Fit the forest on the training split (y_train is 1-D at this point).
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)

### 2. Prediction

In [24]:
# Predict prices for the test rows with the random forest.
y_pred = model.predict(X_test)

In [25]:
# Mirrors the linear-model section, then shows the first ten predictions.
# NOTE(review): the forest's predictions appear to be 1-D already, so
# flatten() is presumably just a copy here — confirm and drop if so.
y_pred = y_pred.flatten()
y_pred[:10]

array([22.80641237, 31.13146352, 16.33912494, 23.81072598, 17.13952074,
       21.8322837 , 19.89574701, 14.75411774, 21.2408346 , 20.89865761])

In [26]:
# Side-by-side table of actual vs. predicted prices for the random forest.
check_test = pd.DataFrame(dict(y_test=y_test['price'], y_pred=y_pred))
check_test.head(n=10)

Unnamed: 0,y_test,y_pred
173,23.6,22.806412
274,32.4,31.131464
491,13.6,16.339125
72,22.8,23.810726
452,16.1,17.139521
76,20.0,21.832284
316,17.8,19.895747
140,14.0,14.754118
471,19.6,21.240835
500,16.8,20.898658


### 3. $R^2$

In [27]:
# R^2 score of the random forest on the same test split.
r2 = r2_score(y_test, y_pred)
r2

0.87472606157312

In [28]:
# Difference between the random-forest and linear-regression R^2 scores.
r2 - r1  # The second model's prediction is better

0.163500055824624

# Task 3

In [29]:
# Help lookup (`?RandomForestRegressor`) removed: it only displayed the class
# docstring and produced no result. The attribute used below is
# `feature_importances_`.

In [30]:
# Feature-importance scores exposed by the fitted forest.
importances = model.feature_importances_
importances

array([0.03167574, 0.00154252, 0.00713813, 0.00123624, 0.01426897,
       0.40268179, 0.01429864, 0.06397257, 0.00528122, 0.01152493,
       0.01808108, 0.01245085, 0.41584732])

In [31]:
# Sanity check: total of all importance scores.
np.sum(importances)

1.0

In [32]:
# Number of importance scores.
importances.shape[0]

13

In [33]:
# Number of feature names; the two are paired one-to-one in the next cell.
feature_names.shape[0]

13

In [34]:
# Pair each feature name with its importance score and show the two largest.
imp_df = pd.DataFrame({'feature_names': feature_names, 'importances': importances})
imp_df.sort_values('importances', ascending=False, ignore_index=True).head(2)

Unnamed: 0,feature_names,importances
0,LSTAT,0.415847
1,RM,0.402682


# Task 4

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [36]:
# Load the credit-card transactions CSV from a relative path.
DATASET_PATH = './csv/creditcard.csv'
# NOTE(review): `data` held the Boston feature array earlier; it is rebound here.
data = pd.read_csv(DATASET_PATH, sep=',')
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [42]:
# Relative frequency of each distinct (Amount, V5, V6) combination.
df[['Amount', 'V5', 'V6']].value_counts(normalize=True)

Amount    V5           V6        
8.99      -0.562777    -1.011073     0.000270
1.00       2.463072     3.173856     0.000270
8.99       0.298310    -0.953526     0.000218
1.00       1.326623     3.436312     0.000211
1.98       2.609358     3.142642     0.000186
                                       ...   
10.32     -0.568678    -0.762763     0.000004
          -0.476011     1.228573     0.000004
          -0.145921     0.676870     0.000004
           0.054512    -0.510631     0.000004
25691.16  -113.743307   73.301626    0.000004
Length: 275663, dtype: float64

In [43]:
# Column dtypes and non-null counts for the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [44]:
# Let pandas render up to 100 columns so wide frames are not elided with '...'.
pd.set_option('display.max_columns', 100)

In [45]:
# First ten rows, now with every column visible.
df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,1.341262,0.359894,-0.358091,-0.137134,0.517617,0.401726,-0.058133,0.068653,-0.033194,0.084968,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,-0.099254,-1.416907,-0.153826,-0.751063,0.167372,0.050144,-0.443587,0.002821,-0.611987,-0.045575,-0.219633,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,-0.619468,0.291474,1.757964,-1.323865,0.686133,-0.076127,-1.222127,-0.358222,0.324505,-0.156742,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.41043,-0.705117,-0.110452,-0.286254,0.074355,-0.328783,-0.210077,-0.499768,0.118765,0.570328,0.052736,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,1.017614,0.83639,1.006844,-0.443523,0.150219,0.739453,-0.54098,0.476677,0.451773,0.203711,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [48]:
# Feature matrix: every column except the 'Class' label.
X = df.drop(columns='Class')

In [49]:
# Target vector taken from the 'Class' column.
# NOTE(review): df['Class'] is presumably already a Series, so the pd.Series()
# wrapper looks redundant — confirm before simplifying.
y = pd.Series(df['Class'])
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [51]:
# Confirm the container type of the target.
type(y)

pandas.core.series.Series

In [66]:
# 70/30 split with a fixed seed; stratify=y keeps the class proportions of y
# in both the training and the test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=100,
    test_size=0.3,
)

In [69]:
# Summary statistics of the training features.
X_train.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0
mean,94810.339675,0.002243,-0.001196,0.00081,0.001849,0.002252,0.00086,-0.002396,-0.002061,-0.001636,-0.002549,-0.001743,0.001404,-0.001305,4e-05,6.8e-05,0.000483,0.00163,0.000303,-2e-05,-0.000126,-3.5e-05,-0.000532,-0.001047,-0.00069,0.000685,-0.000254,0.00012,0.000314,88.139797
std,47474.327394,1.957553,1.652392,1.512832,1.417121,1.365517,1.325449,1.21874,1.208945,1.096996,1.082398,1.018143,0.994797,0.995828,0.955804,0.915469,0.87172,0.84163,0.838102,0.814197,0.760362,0.746103,0.726956,0.625258,0.606662,0.521484,0.481503,0.396735,0.329149,242.462066
min,0.0,-56.40751,-72.715728,-32.965346,-5.683171,-42.147898,-26.160506,-43.557242,-73.216718,-13.434066,-24.588262,-4.56839,-18.683715,-5.791881,-19.214325,-4.498945,-14.129855,-23.815636,-9.498746,-7.213527,-25.222345,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.53433,-9.895244,-9.617915,0.0
25%,54173.25,-0.917434,-0.598699,-0.888159,-0.84966,-0.691944,-0.767172,-0.554808,-0.208512,-0.644613,-0.537343,-0.762321,-0.404682,-0.6502,-0.425083,-0.582549,-0.468477,-0.483477,-0.498889,-0.45706,-0.211651,-0.228272,-0.542095,-0.162,-0.355339,-0.316379,-0.326857,-0.070775,-0.052928,5.74
50%,84654.5,0.020646,0.064743,0.18118,-0.019098,-0.05528,-0.271647,0.039937,0.0229,-0.051257,-0.093973,-0.034723,0.139429,-0.015088,0.050493,0.048497,0.06572,-0.064986,-0.00291,0.004455,-0.062606,-0.029201,0.006807,-0.011571,0.040565,0.017818,-0.052509,0.001418,0.011377,22.115
75%,139366.25,1.314864,0.801737,1.029009,0.746461,0.612686,0.402208,0.570538,0.328388,0.596602,0.453867,0.737656,0.619286,0.662928,0.491942,0.6494,0.522746,0.401246,0.501645,0.457607,0.133509,0.186293,0.527498,0.147256,0.439859,0.351807,0.240816,0.091009,0.077949,77.34
max,172792.0,2.45493,22.057729,4.226108,16.715537,34.801666,23.917837,44.054461,20.007208,10.392889,15.331742,12.018913,7.848392,4.465413,10.526766,5.825654,6.442798,9.253526,5.041069,5.591971,39.420904,27.202839,10.50309,22.528412,4.022866,7.519589,3.517346,12.152401,33.847808,19656.53


In [75]:
# Container types of the feature splits.
type(X_train), type(X_test)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [74]:
# Container types of the target splits.
type(y_train), type(y_test)

(pandas.core.series.Series, pandas.core.series.Series)

In [78]:
# Row/column counts of the feature splits.
X_train.shape, X_test.shape

((199364, 30), (85443, 30))

In [79]:
# Lengths of the target splits.
y_train.shape, y_test.shape

((199364,), (85443,))

In [88]:
# Hyper-parameter grid for the search: 2 x 2 x 3 = 12 combinations.
parameters = [
    dict(
        n_estimators=[10, 15],
        max_features=np.arange(3, 5),  # 3, 4
        max_depth=np.arange(4, 7),     # 4, 5, 6
    )
]

In [89]:
# Grid search over the forest hyper-parameters, scored by ROC AUC with cv=3.
clf = GridSearchCV(
    RandomForestClassifier(random_state=100),
    parameters,
    cv=3,
    scoring='roc_auc',
)

In [90]:
# Run the grid search on the training split.
clf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),
             param_grid=[{'max_depth': array([4, 5, 6]),
                          'max_features': array([3, 4]),
                          'n_estimators': [10, 15]}],
             scoring='roc_auc')

In [95]:
# Parameter combination selected by the search.
clf.best_params_

{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}

In [100]:
# Class-probability estimates for the test rows.
# NOTE(review): the name y_pred is reused from the regression tasks above.
y_pred = clf.predict_proba(X_test)

In [101]:
# Show the probability array (two columns per row).
y_pred

array([[9.99070828e-01, 9.29171738e-04],
       [9.99704794e-01, 2.95206364e-04],
       [9.99717846e-01, 2.82154033e-04],
       ...,
       [9.99717846e-01, 2.82154033e-04],
       [9.99317795e-01, 6.82204754e-04],
       [9.87539019e-01, 1.24609813e-02]])

In [105]:
# Keep only the second column as the score passed to ROC AUC below
# (presumably the probability of Class == 1 — verify against clf.classes_).
y_pred_proba = y_pred[:, 1]
y_pred_proba.shape

(85443,)

In [106]:
from sklearn.metrics import roc_auc_score

In [107]:
# ROC AUC of the tuned classifier on the held-out test split.
roc_auc_score(y_test, y_pred_proba)

0.9462664156037156