# Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, PolynomialFeatures
from sklearn.utils.fixes import loguniform

import warnings
warnings.filterwarnings('ignore')

# Определение функций

In [2]:
def calculate_f1_score(model_pipe, X, y):
    y_model = model_pipe.predict(X)
    return f1_score(y, y_model)

In [3]:
def weights_vis(B, B0, features_names):
    numbers = np.arange(0,len(features_names) + 1)
    Bs = np.hstack([B0,B])
    tick_labels = ['B'+str(num) for num in numbers]
    cc = ['']*len(numbers)

    for n,val in enumerate(Bs):
        if val<0:
            cc[n] = 'red'
        elif val >= 0:
            cc[n] = 'blue'

    plt.bar(x = numbers, height = Bs, color = cc)
    plt.xticks(np.arange(0,len(features_names)+1),np.hstack(['b0', features_names]),rotation = 45 );

In [4]:
def split_column_by_types(df):
    category_columns = []
    number_columns = []

    for column in df.columns:
        if df[column].dtypes == object or df[column].dtypes == 'category':
            category_columns += [column]
        else:
            number_columns += [column]
    
    return category_columns, number_columns

# Загрузка данных

In [5]:
car_df = pd.read_csv('data/cars_moldova_data.csv', delimiter=',')
car_df.sample(5)

Unnamed: 0,Make,Model,Year,Style,Distance,Engine_capacity,Fuel_type,Transmission,Price,Age,Km_year,Km_year_quantile,Make_model,Make_model_mean_price,Make_model_median_price
15329,Opel,Meriva,2011,Minivan,117000.0,1300.0,Diesel,Manual,5200.0,11,10636.363636,Небольшой пробег,Opel Meriva,4315.808824,4150.0
26198,Mitsubishi,Outlander,2014,SUV,185000.0,2000.0,Plug-in Hybrid,Automatic,13500.0,8,23125.0,Большой пробег,Mitsubishi Outlander,11599.603604,12849.5
35294,Dacia,Sandero,2010,Hatchback,99200.0,1200.0,Petrol,Manual,4399.0,12,8266.666667,Небольшой пробег,Dacia Sandero,3932.377778,3900.0
17371,Opel,Zafira,2007,Minivan,290000.0,1900.0,Diesel,Manual,3700.0,15,19333.333333,Большой пробег,Opel Zafira,4332.403509,3800.0
14647,Nissan,Micra,2002,Hatchback,122322.0,1400.0,Petrol,Automatic,2199.0,20,6116.1,Небольшой пробег,Nissan Micra,3256.314815,2900.0


# Оптимизация линейной регрессии

## Выделим числовые и категориальные столбцы

In [6]:
category_columns, number_columns = split_column_by_types(car_df)
number_columns.remove('Price')

print(category_columns)
print(number_columns)

['Make', 'Model', 'Style', 'Fuel_type', 'Transmission', 'Km_year_quantile', 'Make_model']
['Year', 'Distance', 'Engine_capacity', 'Age', 'Km_year', 'Make_model_mean_price', 'Make_model_median_price']


## Создадим набор данных только с числовыми признаками

In [7]:
car_df_all = car_df.copy()
car_df_all.sample(5)

Unnamed: 0,Make,Model,Year,Style,Distance,Engine_capacity,Fuel_type,Transmission,Price,Age,Km_year,Km_year_quantile,Make_model,Make_model_mean_price,Make_model_median_price
28071,Toyota,Rav 4,2014,Crossover,82000.0,2000.0,Petrol,Automatic,17450.0,8,10250.0,Небольшой пробег,Toyota Rav 4,14333.807808,14999.0
31182,Chrysler,Grand Voyager,2003,Minivan,330000.0,2500.0,Diesel,Manual,4000.0,19,17368.421053,Большой пробег,Rare & Low,3759.866667,3000.0
24822,KIA,Carnival,2021,Minivan,20000.0,2151.0,Diesel,Automatic,29699.0,1,20000.0,Большой пробег,KIA Carnival,19358.673913,18349.0
4144,Renault,Talisman,2017,Sedan,179900.0,1461.0,Diesel,Automatic,11900.0,5,35980.0,Большой пробег,Renault Talisman,12877.471429,12949.5
19285,Hyundai,Altele,2021,Minivan,4000.0,2199.0,Diesel,Automatic,38299.0,1,4000.0,Небольшой пробег,Rare & High,30337.238095,33499.0


## Выделим значения признаков и целевой признак стоимости автомобиля

In [8]:
X_linear, y_linear = car_df_all.drop(columns=['Price']), car_df['Price']

## Разделим набор данных на обучаемый и тестовый

In [9]:
X_linear_train, X_linear_test, y_linear_train, y_linear_test = train_test_split(
    X_linear,
    y_linear,
    test_size=0.3,
    random_state=42
)

## Pipeline для числовых признаков

In [10]:
number_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('PCA', PCA(n_components=2))
])

## Pipeline для категориальных признаков

In [11]:
category_pipeline = Pipeline([
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse=False))
])

## Предобработка данных

In [12]:
preprocessors_linear = ColumnTransformer(transformers=[
    ('num', number_pipeline, number_columns),
    ('cat', category_pipeline, category_columns)
])

In [13]:
preprocessors_linear.fit(X_linear_train);

In [14]:
category_ohe_columns = preprocessors_linear.transformers_[1][1]['encoder'].get_feature_names_out(category_columns)
pca_columns = ['pca-1','pca-2']

## Преобразуем обучаемые данные

In [15]:
X_linear_train_tansform = preprocessors_linear.transform(X_linear_train)
car_linear_df = pd.DataFrame(X_linear_train_tansform, columns=np.append(pca_columns, category_ohe_columns)).sample(5)

## Преобразуем тестовые данные

In [16]:
X_linear_test_tansform = preprocessors_linear.transform(X_linear_test)

pd.DataFrame(X_linear_test_tansform, columns=np.append(pca_columns, category_ohe_columns)).sample(5)
display(car_linear_df)

Unnamed: 0,pca-1,pca-2,Make_ARO,Make_Acura,Make_Alfa Romeo,Make_Alta marca,Make_Audi,Make_BMW,Make_Brilliance,Make_Buick,...,Make_model_Volkswagen Tiguan,Make_model_Volkswagen Touareg,Make_model_Volkswagen Touran,Make_model_Volkswagen Transporter,Make_model_Volkswagen Vento,Make_model_Volvo S60,Make_model_Volvo S80,Make_model_Volvo V60,Make_model_Volvo XC60,Make_model_Volvo XC90
22955,-0.193631,-0.139545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3411,-0.190382,0.103429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14978,-0.084082,0.069239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21068,0.090392,0.029033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4142,0.09897,0.041637,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# weights_vis(X_logistic_test_tansform.intercept_, X_logistic_test_tansform.coef_, car_linear_df.columns)
# PS. Закомментировано, т. к. большет число признаков (1054) и, соответственно, малоинформативный график

## Объединяем предварительнрую обработку и модель

In [18]:
pipeline_linear = Pipeline([
    ('preprocessors', preprocessors_linear),
    ('model', Ridge(max_iter=10000))
])

In [19]:
pipeline_linear.fit(X_linear_train, y_linear_train);

In [20]:
search_params = {
    # 'model__alpha':  np.arange(1, 100, 1)
    'model__alpha': 10**np.linspace(10, -2 , 100) * 0.5
}

pipeline_grid_search = GridSearchCV(
    estimator=pipeline_linear,
    param_grid=search_params,
    cv=ShuffleSplit(n_splits=5, random_state=42),
    verbose=0
)

pipeline_grid_search.fit(X_linear_train, y_linear_train);

In [21]:
pipeline_grid_search.best_estimator_

In [22]:
pipeline_grid_search.best_params_

{'model__alpha': 1.004616501282523}

In [23]:
pipeline_grid_search.best_score_

0.7128476899579461

In [24]:
pipeline_grid_search_df = pd.DataFrame(pipeline_grid_search.cv_results_)
pipeline_grid_search_df.columns

pipeline_grid_search_df_ = pipeline_grid_search_df[[
    'param_model__alpha',
    'mean_test_score',
    'std_test_score',
    'rank_test_score'
]]

pipeline_grid_search_df_.sort_values(by='rank_test_score').style.background_gradient(cmap=sns.light_palette("green", as_cmap=True))

Unnamed: 0,param_model__alpha,mean_test_score,std_test_score,rank_test_score
80,1.004617,0.712848,0.026466,1
79,1.328044,0.712832,0.025948,2
81,0.759956,0.712673,0.026948,3
78,1.755596,0.712578,0.025404,4
82,0.574878,0.71236,0.027384,5
77,2.320794,0.712047,0.024844,6
83,0.434875,0.711963,0.02777,7
84,0.328967,0.711526,0.0281,8
76,3.067954,0.711209,0.024277,9
85,0.248851,0.711087,0.028378,10


# Оптимизация логистической регрессии

## Выделим числовые и категориальные столбцы

In [25]:
category_columns, number_columns = split_column_by_types(car_df)
category_columns.remove('Transmission')

print(category_columns)
print(number_columns)

['Make', 'Model', 'Style', 'Fuel_type', 'Km_year_quantile', 'Make_model']
['Year', 'Distance', 'Engine_capacity', 'Price', 'Age', 'Km_year', 'Make_model_mean_price', 'Make_model_median_price']


## Выделим значения признаков и целевой признак трансмиссии автомобиля

In [26]:
car_df_all = car_df.copy()
X_logistic_features, y_logistic_target = car_df_all.drop(columns=['Transmission']), car_df['Transmission']

## Преобразование целевого признака из категориального в числовой тип

In [27]:
y_logistic_encoder = LabelEncoder()

y_logistic_encoder.fit(y_logistic_target)
y_logistic_label = y_logistic_encoder.transform(y_logistic_target)

## Разделим набор данных на обучаемый и тестовый

In [28]:
X_logistic_train, X_logistic_test, y_logistic_train, y_logistic_test = train_test_split(
    X_logistic_features,
    y_logistic_label,
    test_size=0.3,
    random_state=42
)

## Pipeline для числовых признаков

In [29]:
number_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('polinom', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False))
])

## Pipeline для категориальных признаков

In [30]:
category_pipeline = Pipeline([
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse=False))
])

## Предобработка данных

In [31]:
preprocessors_logistic = ColumnTransformer(transformers=[
    ('num', number_pipeline, number_columns),
    ('cat', category_pipeline, category_columns)
])

In [32]:
preprocessors_logistic.fit(X_logistic_train);

In [33]:
category_ohe_columns = preprocessors_logistic.transformers_[1][1]['encoder'].get_feature_names_out(category_columns)
polynom_columns = preprocessors_logistic.transformers_[0][1]['polinom'].get_feature_names_out(number_columns)

## Преобразуем обучаемые данные

In [34]:
X_logistic_train_tansform = preprocessors_logistic.transform(X_logistic_train)
car_logistic_df = pd.DataFrame(X_logistic_train_tansform, columns=np.append(polynom_columns, category_ohe_columns)).sample(5)

## Преобразуем тестовые данные

In [35]:
X_logistic_test_tansform = preprocessors_logistic.transform(X_logistic_test)
pd.DataFrame(X_logistic_test_tansform, columns=np.append(polynom_columns, category_ohe_columns)).sample(5)

Unnamed: 0,Year,Distance,Engine_capacity,Price,Age,Km_year,Make_model_mean_price,Make_model_median_price,Year^2,Year Distance,...,Make_model_Volkswagen Tiguan,Make_model_Volkswagen Touareg,Make_model_Volkswagen Touran,Make_model_Volkswagen Transporter,Make_model_Volkswagen Vento,Make_model_Volvo S60,Make_model_Volvo S80,Make_model_Volvo V60,Make_model_Volvo XC60,Make_model_Volvo XC90
6043,0.78,0.163916,0.270833,0.07014,0.22,0.271877,0.168351,0.137782,0.6084,0.127855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4303,0.68,0.19992,0.375,0.013026,0.32,0.233762,0.049649,0.048599,0.4624,0.135946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9291,0.66,0.193753,0.270833,0.026052,0.34,0.213799,0.028412,0.026872,0.4356,0.127877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3799,0.78,0.20503,0.251042,0.078156,0.22,0.340529,0.122678,0.127501,0.6084,0.159923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4949,0.72,0.223922,0.291667,0.061122,0.28,0.297261,0.061472,0.056032,0.5184,0.161224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# weights_vis(X_logistic_test_tansform.intercept_, X_logistic_test_tansform.coef_, car_logistic_df.columns)
# PS. Закомментировано, т. к. большет число признаков (1054) и, соответственно, малоинформативный график

## Объединяем предварительнрую обработку и модель

In [37]:
pipeline_logistic = Pipeline([
    ('preprocessors', preprocessors_logistic),
    ('model', LogisticRegression(C=0.001, random_state=42, solver='liblinear'))
])

In [38]:
pipeline_logistic.fit(X_logistic_train, y_logistic_train);

In [39]:
print(f"F1 на тренировочной выборке: {calculate_f1_score(pipeline_logistic, X_logistic_train, y_logistic_train):.4f}")
print(f"F1 на валидационной выборке: {calculate_f1_score(pipeline_logistic, X_logistic_test, y_logistic_test):.4f}")

F1 на тренировочной выборке: 0.8361
F1 на валидационной выборке: 0.8368


In [40]:
grid_params = {
    'model__C': np.logspace(-4, 4, 5),
    'model__penalty': ['l1', 'l2']
}

In [41]:
pipeline_grid_search = GridSearchCV(
    estimator=pipeline_logistic,
    param_grid=grid_params,
    cv=ShuffleSplit(n_splits=5, random_state=42),
    verbose=0
)

pipeline_grid_search.fit(X_logistic_train, y_logistic_train);

In [42]:
pipeline_grid_search.best_estimator_

In [43]:
pipeline_grid_search.best_params_

{'model__C': 100.0, 'model__penalty': 'l1'}

In [44]:
pipeline_grid_search.best_score_

0.8717330116606353

In [45]:
pipeline_grid_search_df = pd.DataFrame(pipeline_grid_search.cv_results_)
pipeline_grid_search_df.columns

pipeline_grid_search_df_ = pipeline_grid_search_df[[
    'param_model__penalty',
    'param_model__C',
    'mean_test_score',
    'std_test_score',
    'rank_test_score'
]]

pipeline_grid_search_df_.sort_values(by='rank_test_score').style.background_gradient(cmap=sns.light_palette("green", as_cmap=True))

Unnamed: 0,param_model__penalty,param_model__C,mean_test_score,std_test_score,rank_test_score
6,l1,100.0,0.871733,0.003798,1
9,l2,10000.0,0.87109,0.003743,2
7,l2,100.0,0.871009,0.004516,3
8,l1,10000.0,0.870929,0.003668,4
4,l1,1.0,0.869964,0.00401,5
5,l2,1.0,0.861681,0.00344,6
3,l2,0.01,0.845195,0.007384,7
2,l1,0.01,0.836429,0.006242,8
1,l2,0.0001,0.797266,0.006278,9
0,l1,0.0001,0.462565,0.003725,10
