# Лабораторная работа №1 "Деревья решений, случайный лес и градиентный бустинг"

In [None]:
import math
import re
from re import Match

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [None]:
# The patterns below are matched by __find_match, which strips and lower-cases
# the text first, so they only need to describe lower-case input.

# Group 1: a non-negative number with an optional fractional part.
# Group 2: an optional unit word after optional spaces (empty when absent).
UNITS_PATTERN = re.compile(r'^(\d+(?:\.\d+)?) *(\w*?)$')
# Group 1: digits and commas with an optional fractional part; a "$" sign is
# allowed before and/or after the amount.
PRICE_PATTERN = re.compile(r'^\$?([\d,]+(?:\.\d+)?)\$?$')
# Group 1: the first three word characters of the text.
# Group 2: a run of digits at the very end of the text, if there is one.
OS_PATTERN = re.compile(r'^(\w{3}).*?(\d+)?$')


def reduce_units(val: str | int | float, units: dict[str, float], default_unit: str) -> float:
    """Parse a quantity with an optional unit suffix and scale it to a base unit.

    Args:
        val: Raw cell value, e.g. ``'512 GB'``, ``'15.6 inches'`` or a bare number.
        units: Multiplier for every recognised unit suffix (lower-case keys,
            because the text is lower-cased before matching).
        default_unit: Key of ``units`` used when ``val`` carries no suffix.

    Returns:
        The number multiplied by the factor of its unit, or ``math.nan`` when
        ``val`` is missing, does not match ``UNITS_PATTERN`` or uses a suffix
        that is not listed in ``units``.

    Raises:
        KeyError: If ``val`` has no suffix and ``default_unit`` is not a key of
            ``units``.
    """
    match = __find_match(val, UNITS_PATTERN)
    if match is None:
        return math.nan

    number = float(match[1])
    suffix = match[2]
    if not suffix:
        # A wrong default is a caller error, so the KeyError is left to surface.
        return number * units[default_unit]

    # An unknown suffix is dirty data rather than a programming error: report
    # it as missing so a single odd row cannot abort a whole Series.apply.
    factor = units.get(suffix)
    return math.nan if factor is None else number * factor


def reduce_price(val: str | int | float) -> float:
    """Parse a price such as ``'$1,299.99'`` into a float.

    Args:
        val: Raw cell value; thousands separators and a ``$`` sign are allowed.

    Returns:
        The numeric price, or ``math.nan`` when ``val`` is missing, does not
        match ``PRICE_PATTERN`` or contains no digits at all.
    """
    match = __find_match(val, PRICE_PATTERN)
    if match is None:
        return math.nan

    digits = match[1].replace(',', '')
    # PRICE_PATTERN also accepts a run of bare commas (e.g. ","), which leaves
    # nothing to convert and would make float() raise ValueError.
    if not digits:
        return math.nan

    return float(digits)


def reduce_os(val: str | int | float) -> str | None:
    """Collapse an operating-system description into a short label.

    The label is the first capture group of ``OS_PATTERN``, followed by a space
    and the second group when that group matched.

    Args:
        val: Raw cell value.

    Returns:
        The label, or ``None`` when ``val`` is missing or does not match
        ``OS_PATTERN``.
    """
    found = __find_match(val, OS_PATTERN)
    if found is None:
        return None

    family, version = found[1], found[2]
    label = f'{family} {version}' if version else family
    return label.strip()


def __find_match(val: str | int | float, pattern: re.Pattern) -> Match[str] | None:
    if pd.isna(val):
        return None

    val = str(val) if not isinstance(val, str) else val
    return pattern.match(val.strip().lower())

In [None]:
# Load the raw laptop listings; no explicit compression argument is passed, so
# pandas has to work it out from the ".tar.gz" file name.
df = pd.read_csv('data/amazon_laptop_prices_v01.tar.gz')
# Print column dtypes and non-null counts to see what needs cleaning.
df.info()

In [None]:
# Show the distinct raw values of every column to see which formats the
# cleaning step has to handle.
for name, values in df.items():
    print(name, values.unique(), sep='\n')

In [None]:
# --- Target ------------------------------------------------------------------
# Parse first, filter second: a price that is present but unparseable also
# becomes NaN and must not reach the models as a target. The explicit copy
# keeps the later column assignments from hitting a view of the original frame
# (SettingWithCopyWarning).
df['price'] = df['price'].apply(reduce_price)
df = df[df['price'].notna()].copy()

df['brand'] = df['brand'].str.lower()

# --- Quantities with units -----------------------------------------------------
# column -> (multiplier per unit suffix, unit assumed when the suffix is absent)
unit_columns = {
    'screen_size': ({'inches': 1}, 'inches'),
    # Stored in GB.
    'harddisk': ({'mb': 1e-3, 'gb': 1, 'tb': 1e3}, 'gb'),
    # Stored in GHz; 1 Hz is 1e-9 GHz.
    'cpu_speed': ({'hz': 1e-9, 'mhz': 1e-3, 'ghz': 1}, 'ghz'),
}
for column, (units, default_unit) in unit_columns.items():
    df[column] = df[column].apply(reduce_units, units=units, default_unit=default_unit)
    # Missing or unparseable values are replaced by the column mean.
    df[column] = df[column].fillna(df[column].mean())

df['rating'] = df['rating'].fillna(df['rating'].mean())

# Rows without a recognisable operating system share the 'unk' label.
df['OS'] = df['OS'].apply(reduce_os).fillna('unk')

# Number of listed features: count the non-empty comma-separated items, so that
# a single feature (no comma) is distinguished from no features at all.
df['special_features'] = df['special_features'].fillna('').apply(
    lambda text: sum(1 for item in text.split(',') if item.strip())
)

# NOTE(review): 'ram' is dropped here, so it is no longer parsed above. If it
# was meant to be a feature, add it to unit_columns (same units as 'harddisk')
# and remove it from this list.
df = df.drop(['model', 'color', 'cpu', 'ram', 'graphics', 'graphics_coprocessor'], axis=1)

In [None]:
# Check dtypes and non-null counts again after the cleaning step.
df.info()

In [None]:
# One-hot encode the two categorical columns. The indicator columns are named
# "<column>_<value>", appended after the remaining columns (brand first, then
# OS), and the source columns themselves are removed.
df = pd.get_dummies(df, columns=['brand', 'OS'])

In [None]:
# Check the frame once more after one-hot encoding `brand` and `OS`.
df.info()

In [None]:
# Share of the rows held out for testing; the rest is used for training.
TEST_PROPORTION = 0.2
# Backward-compatible alias: this value has always been passed as `test_size`,
# so the old name was misleading. Prefer TEST_PROPORTION.
TRAIN_PROPORTION = TEST_PROPORTION
RANDOM_SEED = 42

# Target and feature matrix.
y = df['price']
X = df.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_PROPORTION, random_state=RANDOM_SEED
)

In [None]:
# Choose max_depth by 5-fold cross-validation on the training split only.
# Selecting it by the test-set score would leak the test set into model
# selection and make the test metrics reported later optimistic.
# The default scoring is the regressor's own `score` (R^2), and the best
# configuration is refitted on the whole training split.
tree_search = GridSearchCV(
    DecisionTreeRegressor(random_state=RANDOM_SEED),
    param_grid={'max_depth': list(range(1, 11))},
    cv=5,
    n_jobs=-1,
)
best_tree = tree_search.fit(X_train, y_train).best_estimator_

plt.figure(figsize=(30, 10))
# Pass the names as a plain list rather than a pandas Index, which some
# scikit-learn releases may reject (see the plot_tree documentation).
plot_tree(best_tree, filled=True, feature_names=list(X.columns))
plt.savefig('Decision tree.pdf')

In [None]:
# Choose n_estimators and max_depth by 5-fold cross-validation on the training
# split only. Selecting them by the test-set score would leak the test set into
# model selection and make the test metrics reported later optimistic.
# The default scoring is the regressor's own `score` (R^2), and the best
# configuration is refitted on the whole training split.
forest_search = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_SEED),
    param_grid={
        'n_estimators': list(range(1, 41)),
        'max_depth': list(range(1, 11)),
    },
    cv=5,
    n_jobs=-1,
)
best_random_forest = forest_search.fit(X_train, y_train).best_estimator_

In [None]:
# Choose n_estimators and max_depth by 5-fold cross-validation on the training
# split only. Selecting them by the test-set score would leak the test set into
# model selection and make the test metrics reported later optimistic.
# The default scoring is the regressor's own `score` (R^2), and the best
# configuration is refitted on the whole training split.
boosting_search = GridSearchCV(
    GradientBoostingRegressor(random_state=RANDOM_SEED),
    param_grid={
        'n_estimators': list(range(1, 41)),
        'max_depth': list(range(1, 11)),
    },
    cv=5,
    n_jobs=-1,
)
best_gradient_boostings = boosting_search.fit(X_train, y_train).best_estimator_

![Mean absolute error formula](https://machinelearningmastery.ru/img/0-411862-507651.png)

In [None]:
models = (best_tree, best_random_forest, best_gradient_boostings)

# For every selected model: print its test-set score and mean absolute error,
# plot predicted against actual prices, and plot its feature importances.
for model in models:
    print('Model:', repr(model))
    print('Score:', model.score(X_test, y_test))

    y_pred = model.predict(X_test)
    print('Mean absolute error:', mean_absolute_error(y_test, y_pred))

    plt.figure(figsize=(7, 7))
    plt.scatter(y_test, y_pred)
    # Reference line y = x: a perfect prediction lies exactly on it. Both end
    # coordinates must be the same value, otherwise the line is not the
    # diagonal and the scatter cannot be read against it.
    upper = max(y_test.max(), y_pred.max())
    plt.plot([0, upper], [0, upper])
    plt.xlabel('Настоящая цена', fontsize=15)
    plt.ylabel('Предсказанная цена', fontsize=15)
    plt.show()

    # Features ordered from the most to the least important.
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(15, 8))
    plt.ylabel('Важность')
    plt.xlabel('Название признака')
    plt.bar(X_test.columns[indices], importances[indices])
    plt.xticks(rotation=90)
    plt.show()