In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, f1_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [95]:
RANDOM_STATE = 42

results_regression = pd.DataFrame(columns = ['model', 'task', 'R2'])
results_classification = pd.DataFrame(columns = ['model', 'task', 'f1', 'accuracy'])

https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

In [96]:
data = pd.read_csv('boston.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%, предварительно выделив целевую переменную (колонка 'MEDV').

In [97]:
x = data.drop('MEDV',axis=1)
y = data['MEDV']

In [98]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404,), (102,))

2. Обучите стандартную регрессию, а также Ridge и  Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [99]:
LinR = LinearRegression()
LinR.fit(x_train, y_train)
y_pred = LinR.predict(x_test)
r2_lr = r2_score(y_test,y_pred)

Rid = Ridge()
Rid.fit(x_train, y_train)
y_pred = Rid.predict(x_test)
r2_ridge = r2_score(y_test,y_pred)

Las = Lasso()
Las.fit(x_train, y_train)
y_pred = Las.predict(x_test)
r2_lasso = r2_score(y_test,y_pred)

### Ваш код ###

#r2_lr = 0
#r2_ridge = 0
#r2_lasso = 0
results_regression.loc[0] = ['LR', 'task2', r2_lr]
results_regression.loc[1] = ['Ridge', 'task2', r2_ridge]
results_regression.loc[2] = ['Lasso', 'task2', r2_lasso]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869


3. Для Ridge и Lasso подберите коэффициент регуляризации двумя способами 1) GridSearchCV, 2) RidgeCV и LassoCV, в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по всем моделям и сравните с предыдущими результатами.

In [100]:
parameters = {'alpha': [10**i for i in range(-5, 6)]}
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, parameters, scoring ='r2', cv = 5)
ridge_grid.fit(x_train,y_train)
ridge_best = ridge_grid.best_params_['alpha']
ridge_total = Ridge(alpha = ridge_best)
ridge_total.fit(x_train,y_train)
y_pred = ridge_total.predict(x_test)
r2_ridge_grid_search = r2_score(y_test, y_pred)

parameters = {'alpha': [10**i for i in range(-5, 6)]}
lasso = Lasso()
lasso_grid = GridSearchCV(lasso, parameters, scoring ='r2', cv = 5)
lasso_grid.fit(x_train,y_train)
lasso_best = lasso_grid.best_params_['alpha']
lasso_total = Lasso(alpha = lasso_best)
lasso_total.fit(x_train,y_train)
y_pred = lasso_total.predict(x_test)
r2_lasso_grid_search = r2_score(y_test, y_pred)

parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
r2_ridge_cv = r2_score(y_test, y_pred)

parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
r2_lasso_cv = r2_score(y_test, y_pred)


### Ваш код ###

#r2_ridge_grid_search = 0
#r2_ridge_cv = 0
#r2_lasso_grid_search = 0
#r2_lasso_cv = 0
results_regression.loc[3] = ['Ridge_GridSearchCV', 'task3', r2_ridge_grid_search]
results_regression.loc[4] = ['RidgeCV', 'task3', r2_ridge_cv]
results_regression.loc[5] = ['Lasso_GridSearchCV', 'task3', r2_lasso_grid_search]
results_regression.loc[6] = ['LassoCV', 'task3', r2_lasso_cv]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483


4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [101]:
ridge_pipeline_standard = Pipeline([('scaler', StandardScaler()),('ridge', Ridge())])

#PL с MinMaxScaler и Ridge
ridge_pipeline_minmax = Pipeline([('scaler', MinMaxScaler()),('ridge', Ridge())])

#PL с StandardScaler и Lasso
lasso_pipeline_standard = Pipeline([('scaler', StandardScaler()),('lasso', Lasso())])

#PL с MinMaxScaler и Lasso
lasso_pipeline_minmax = Pipeline([('scaler', MinMaxScaler()),('lasso', Lasso())])

#Ridge с StandardScaler
ridge_pipeline_standard.fit(x_train, y_train)
ridge_standard_pred = ridge_pipeline_standard.predict(x_test)
r2_ridge_standart_scaler = r2_score(y_test, ridge_standard_pred)

#Ridge с MinMaxScaler
ridge_pipeline_minmax.fit(x_train, y_train)
ridge_minmax_pred = ridge_pipeline_minmax.predict(x_test)
r2_ridge_min_max_scaler = r2_score(y_test, ridge_minmax_pred)

#Lasso с StandardScaler
lasso_pipeline_standard.fit(x_train, y_train)
lasso_standard_pred = lasso_pipeline_standard.predict(x_test)
r2_lasso_standart_scaler = r2_score(y_test, lasso_standard_pred)

#Lasso с MinMaxScaler
lasso_pipeline_minmax.fit(x_train, y_train)
lasso_minmax_pred = lasso_pipeline_minmax.predict(x_test)
r2_lasso_min_max_scaler = r2_score(y_test, lasso_minmax_pred)

### Ваш код ###

#r2_ridge_standart_scaler = 0
#r2_ridge_min_max_scaler = 0
#r2_lasso_standart_scaler = 0
#r2_lasso_min_max_scaler = 0
results_regression.loc[7] = ['Ridge_StandardScaler', 'task4', r2_ridge_standart_scaler]
results_regression.loc[8] = ['Ridge_MinMaxScaler', 'task4', r2_ridge_min_max_scaler]
results_regression.loc[9] = ['Lasso_StandardScaler', 'task4', r2_lasso_standart_scaler]
results_regression.loc[10] = ['Lasso_MinMaxScaler', 'task4', r2_lasso_min_max_scaler]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами.

In [102]:
MMSC = MinMaxScaler()
MMSC.fit(x_train)

x_train_norm = pd.DataFrame(MMSC.transform(x_train), columns = x_train.columns)
x_test_norm = pd.DataFrame(MMSC.transform(x_test), columns = x_train.columns)

scaler = StandardScaler()
scaler.fit(x_train)

x_train_standart = pd.DataFrame(scaler.transform(x_train), columns = x_train.columns)
x_test_standart = pd.DataFrame(scaler.transform(x_test), columns = x_train.columns)


#RidgeCV MinMax
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(x_train_norm,y_train)
y_pred = model.predict(x_test_norm)
r2_ridge_min_max_scaler_cv = r2_score(y_test, y_pred)

#RidgeCV Standart
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(x_train_standart,y_train)
y_pred = model.predict(x_test_standart)
r2_ridge_standart_scaler_cv = r2_score(y_test, y_pred)

#LassoCV MinMax
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(x_train_norm,y_train)
y_pred = model.predict(x_test_norm)
r2_lasso_min_max_scaler_cv = r2_score(y_test, y_pred)

#LassoCV Standart
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(x_train_standart,y_train)
y_pred = model.predict(x_test_standart)
r2_lasso_standart_scaler_cv = r2_score(y_test, y_pred)

### Ваш код ###

#r2_ridge_standart_scaler_cv = 0
#r2_ridge_min_max_scaler_cv = 0
#r2_lasso_standart_scaler_cv = 0
#r2_lasso_min_max_scaler_cv = 0
results_regression.loc[11] = ['Ridge_StandardScaler_CV', 'task5', r2_ridge_standart_scaler_cv]
results_regression.loc[12] = ['Ridge_MinMaxScaler_CV', 'task5', r2_ridge_min_max_scaler_cv]
results_regression.loc[13] = ['Lasso_StandardScaler_CV', 'task5', r2_lasso_standart_scaler_cv]
results_regression.loc[14] = ['Lasso_MinMaxScaler_CV', 'task5', r2_lasso_min_max_scaler_cv]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [103]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

poly = PolynomialFeatures()
x_train_poly = poly.fit_transform(x_train_scaled)
x_test_poly = poly.transform(x_test_scaled)

model = Ridge()
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_ridge_standart_scaler_poly = r2_score(y_test, y_pred)

model = Lasso()
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_lasso_standart_scaler_poly = r2_score(y_test, y_pred)

scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

poly = PolynomialFeatures()
x_train_poly = poly.fit_transform(x_train_scaled)
x_test_poly = poly.transform(x_test_scaled)

model = Ridge()
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_ridge_min_max_scaler_poly = r2_score(y_test, y_pred)

model = Lasso()
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_lasso_min_max_scaler_poly = r2_score(y_test, y_pred)

### Ваш код ###

#r2_ridge_standart_scaler_poly = 0
#r2_ridge_min_max_scaler_poly = 0
#r2_lasso_standart_scaler_poly = 0
#r2_lasso_min_max_scaler_poly = 0
results_regression.loc[15] = ['Ridge_StandardScaler_Poly', 'task6', r2_ridge_standart_scaler_poly]
results_regression.loc[16] = ['Ridge_MinMaxScaler_Poly', 'task6', r2_ridge_min_max_scaler_poly]
results_regression.loc[17] = ['Lasso_StandardScaler_Poly', 'task6', r2_lasso_standart_scaler_poly]
results_regression.loc[18] = ['Lasso_MinMaxScaler_Poly', 'task6', r2_lasso_min_max_scaler_poly]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


7. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, добавив PolynomialFeatures, посчитайте R2 и сравните с предыдущими результатами.

In [104]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

poly = PolynomialFeatures()
x_train_poly = poly.fit_transform(x_train_scaled)
x_test_poly = poly.transform(x_test_scaled)

# ridge standart
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_ridge_standart_scaler_poly_cv = r2_score(y_test, y_pred)

# Lasso standart
parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_lasso_standart_scaler_poly_cv = r2_score(y_test, y_pred)

scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

poly = PolynomialFeatures()
x_train_poly = poly.fit_transform(x_train_scaled)
x_test_poly = poly.transform(x_test_scaled)

# Ridge minmax
parameters = [10**i for i in range(-5, 6)]
model = RidgeCV(alphas = parameters, cv=5)
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_ridge_min_max_scaler_poly_cv = r2_score(y_test, y_pred)

# Lasso minmax

parameters = [10**i for i in range(-5, 6)]
model = LassoCV(alphas = parameters, cv=5)
model.fit(x_train_poly, y_train)
y_pred = model.predict(x_test_poly)
r2_lasso_min_max_scaler_poly_cv = r2_score(y_test, y_pred)

### Ваш код ###

#r2_ridge_standart_scaler_poly_cv = 0
#r2_ridge_min_max_scaler_poly_cv = 0
#r2_lasso_standart_scaler_poly_cv = 0
#r2_lasso_min_max_scaler_poly_cv = 0
results_regression.loc[19] = ['Ridge_StandardScaler_Poly_CV', 'task7', r2_ridge_standart_scaler_poly_cv]
results_regression.loc[20] = ['Ridge_MinMaxScaler_Poly_CV', 'task7', r2_ridge_min_max_scaler_poly_cv]
results_regression.loc[21] = ['Lasso_StandardScaler_Poly_CV', 'task7', r2_lasso_standart_scaler_poly_cv]
results_regression.loc[22] = ['Lasso_MinMaxScaler_Poly_CV', 'task7', r2_lasso_min_max_scaler_poly_cv]
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


8. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2.

In [105]:
pipeline = Pipeline([('scaler', StandardScaler()),  ('poly', PolynomialFeatures()), ('regression', Ridge()) ])

parameters = {
    'scaler': [StandardScaler(), MinMaxScaler()],  
    'poly__degree': [1, 2],  
    'regression': [Ridge(), Lasso()],  
    'regression__alpha': [10**i for i in range(-5, 6)], 
}


grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='r2')

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print('Параметры лучшей модели:\n', best_params)
r2_best_model = grid_search.best_score_
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]
results_regression

Параметры лучшей модели:
 {'poly__degree': 2, 'regression': Ridge(alpha=0.1), 'regression__alpha': 0.1, 'scaler': MinMaxScaler()}


Unnamed: 0,model,task,R2
0,LR,task2,0.668483
1,Ridge,task2,0.665961
2,Lasso,task2,0.666869
3,Ridge_GridSearchCV,task3,0.668483
4,RidgeCV,task3,0.668483
5,Lasso_GridSearchCV,task3,0.668483
6,LassoCV,task3,0.668483
7,Ridge_StandardScaler,task4,0.66819
8,Ridge_MinMaxScaler,task4,0.676221
9,Lasso_StandardScaler,task4,0.624045


http://archive.ics.uci.edu/ml/datasets/Adult

In [106]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


9. Разделите выборку на признаки и целевую переменную(колонка class). Замените целевую переменную на числовые значения ('<=50K' - 1, '>50K' - 0).

In [107]:
y = data['class'].replace({'<=50K': 1, '>50K': 0})
X = data.drop(columns=['class'])

10. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [108]:
y.value_counts()

### Ваш код ###


1    37155
0    11687
Name: class, dtype: int64

In [109]:
print(f'Accuracy: {accuracy_score(y, np.array([0 for _ in range(len(y))]))}')
print(f'F1-score: {f1_score(y, np.array([0 for _ in range(len(y))]))}')

Accuracy: 0.23928176569346055
F1-score: 0.0


11. Выясните, присутствуют ли в данных пропуски. Если присутствуют, заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [110]:
X.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

12. Выберите колонки с числовыми и категориальными переменными (используя возможности pandas).

In [111]:
numeric_features = []
categorical_features = []
for column in data.columns:
    if column == 14:
        continue
    if data[column].dtype == np.dtype('int64'):
        numeric_features.append(column)
    elif data[column].dtype == np.dtype('object_'):
        categorical_features.append(column)
    else:
        raise Exception('Check data types')

In [112]:
numeric_features

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [113]:
categorical_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'class']

13. Создайте пайплайн по обработке числовых и категориальных значений колонок (используйте OneHotEncoder,MinMaxScaler) и посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями, (испольуйте SimpleImputer). Посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

15. Посчитайте cross_val_score по тем же алгоритмам и метрикам, если просто удалить значения '?'.

 16. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier на данных с замененными значениями '?' на самые частые значения.

17. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [114]:
results_classification

Unnamed: 0,model,task,f1,accuracy
