# 1.	Выбор начальных условий

In [1]:
import kagglehub
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import warnings

# Install a global "ignore" filter so that no warning of any category is shown
# by the cells below.
# NOTE(review): this blanket filter also hides deprecation notices coming from
# the imported libraries; consider narrowing it to the categories that are noisy.
warnings.filterwarnings("ignore")

In [2]:
# Download the Kaggle dataset and report the directory kagglehub placed it in.
dataset_handle = "iabhishekofficial/mobile-price-classification"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/iabhishekofficial/mobile-price-classification?dataset_version_number=1...


100%|██████████| 70.6k/70.6k [00:00<00:00, 724kB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/iabhishekofficial/mobile-price-classification/versions/1





In [3]:
!ls /root/.cache/kagglehub/datasets/iabhishekofficial/mobile-price-classification/versions/1

test.csv  train.csv


In [4]:
# Load the training CSV from the directory returned by kagglehub.dataset_download
# (`path`), so the notebook does not depend on a machine-specific cache location.
df = pd.read_csv(f"{path}/train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
# Show the column names of the loaded frame (features plus the target).
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

Описание фичей

- **battery_power** - Общая энергия, которую аккумулятор может хранить за один раз, измеряется в мАч.
- **blue** - Есть Bluetooth или нет
- **clock_speed** - Скорость, с которой микропроцессор выполняет инструкции
- **dual_sim** - Имеет поддержку двух SIM-карт или нет
- **fc** - Мегапиксельная передняя камера
- **four_g** - Есть 4G или нет
- **int_memory** - Внутренняя память в гигабайтах
- **m_dep** - Глубина мобильного телефона в см
- **mobile_wt** - Вес мобильного телефона
- **n_cores** - Количество ядер процессора
- **pc** - Мегапиксели основной камеры
- **px_height** - Высота разрешения экрана в пикселях
- **px_width** - Ширина разрешения экрана в пикселях
- **ram** - Оперативная память в мегабайтах
- **sc_h** - Высота экрана мобильного телефона в см
- **sc_w** - Ширина экрана мобильного телефона в см
- **talk_time** - Максимальное время, в течение которого телефон проработает от одного заряда батареи в режиме разговора
- **three_g** - Есть 3G или нет
- **touch_screen** - Есть сенсорный экран или нет
- **wifi** - Есть Wi-Fi или нет
- **price_range** - Это целевая переменная со значением 0 (низкая стоимость), 1 (средняя стоимость), 2 (высокая стоимость) и 3 (очень высокая стоимость).


В данной задаче необходимо классифицировать объект (телефон), который описывается набором характеристик, по четырём классам price_range: 0 (низкая стоимость), 1 (средняя стоимость), 2 (высокая стоимость) и 3 (очень высокая стоимость).

Довольно актуальная задача для ценообразования и выбора хорошего телефона по характеристикам.



In [6]:
# Separate the target column from the feature matrix.
target_columns = 'price_range'

y = df[target_columns]
X = df.drop(columns=[target_columns])

In [7]:
# Show the dtype of every column in the loaded frame.
df.dtypes

Unnamed: 0,0
battery_power,int64
blue,int64
clock_speed,float64
dual_sim,int64
fc,int64
four_g,int64
int_memory,int64
m_dep,float64
mobile_wt,int64
n_cores,int64


In [8]:
# Hold out 20% of the rows for testing. The split is stratified on the target and
# uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [9]:
# Count how many training rows fall into each target class.
y_train.value_counts()

Unnamed: 0_level_0,count
price_range,Unnamed: 1_level_1
1,400
0,400
3,400
2,400


In [10]:
# Count how many test rows fall into each target class.
y_test.value_counts()

Unnamed: 0_level_0,count
price_range,Unnamed: 1_level_1
3,100
1,100
0,100
2,100


Учитывая, что все классы распределены равномерно и ни один из них не преобладает, очевидным и верным решением будет использовать метрику **Accuracy**.

# 2.	Создание бейзлайна и оценка качества

Дерево решений

In [11]:
# Baseline: an untuned decision tree fitted on the raw training features.
pipeline_tree = Pipeline(steps=[
    ('decision_tree', DecisionTreeClassifier(random_state=12))
])

pipeline_tree.fit(X_train, y_train)
# The predictions are named after the model that produced them; the previous
# name (`y_pred_log`) did not match the decision tree used in this cell.
y_pred_tree = pipeline_tree.predict(X_test)

print("=== Decision Tree Классификатор ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Classification Report:\n", classification_report(y_test, y_pred_tree))


=== Decision Tree Классификатор ===
Accuracy: 0.845
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.90       100
           1       0.80      0.77      0.79       100
           2       0.77      0.84      0.80       100
           3       0.93      0.86      0.90       100

    accuracy                           0.84       400
   macro avg       0.85      0.84      0.85       400
weighted avg       0.85      0.84      0.85       400



# 3.	Улучшение бейзлайна

Некоторые проверки были сделаны в ноутбуке с KNN

Добавим фичи в зависимости от других параметров:

Плотность пикселей
$$
ppi = \frac{\sqrt{px\_height^2 + px\_width^2}}{\sqrt{sc\_h^2 + sc\_w^2}}
$$

Эффективность батареи

$$
battery\_efficiency = \frac{battery\_power}{mobile\_wt}
$$

Размер экрана
$$
screen\_size = sc\_h \cdot sc\_w
$$

Оперативная память на ядро
$$
ram\_per\_core = \frac{ram}{n\_cores}
$$

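Общая память (обратите внимание: `int_memory` измеряется в гигабайтах, а `ram` — в мегабайтах, поэтому в сумме складываются величины в разных единицах)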
$$
total\_memory = int\_memory + ram
$$



In [12]:
def add_engineered_features(features: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``features`` with five derived columns appended.

    Added columns (in this order):
      * ``ppi`` - pixel diagonal divided by screen diagonal;
      * ``battery_efficiency`` - ``battery_power`` per unit of ``mobile_wt``;
      * ``screen_size`` - ``sc_h * sc_w``;
      * ``ram_per_core`` - ``ram`` divided by ``n_cores``;
      * ``total_memory`` - ``int_memory + ram``.

    The input frame is not modified. The same function is applied to the train
    and the test split so both always receive identical transformations.
    """
    pixel_diagonal = np.sqrt(features["px_height"]**2 + features["px_width"]**2)
    # Evaluates to inf/NaN for rows where both sc_h and sc_w are 0.
    screen_diagonal = np.sqrt(features["sc_h"]**2 + features["sc_w"]**2)

    return features.assign(
        ppi=pixel_diagonal / screen_diagonal,
        battery_efficiency=features["battery_power"] / features["mobile_wt"],
        screen_size=features["sc_h"] * features["sc_w"],
        ram_per_core=features["ram"] / features["n_cores"],
        # NOTE(review): the feature description lists int_memory in gigabytes and
        # ram in megabytes, so this sum mixes units. It is kept as-is to preserve
        # the reported results; confirm whether a unit conversion is intended.
        total_memory=features["int_memory"] + features["ram"],
    )


X_trainV2 = add_engineered_features(X_train)
X_testV2 = add_engineered_features(X_test)

Статистика по фичам

In [13]:
# Summary statistics for the numeric columns of the extended training frame.
numeric_features = X_trainV2.select_dtypes(include=['number'])
numeric_stats = numeric_features.describe()

In [14]:
# Display the summary statistics table computed in the previous cell.
numeric_stats

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,sc_w,talk_time,three_g,touch_screen,wifi,ppi,battery_efficiency,screen_size,ram_per_core,total_memory
count,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,...,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0
mean,1236.665625,0.494375,1.529313,0.506875,4.3225,0.518125,32.063125,0.503563,140.1025,4.546875,...,5.77875,10.92125,0.761875,0.504375,0.5125,121.253076,9.445824,80.6375,708.905294,2157.070625
std,438.041695,0.500125,0.817696,0.500109,4.337799,0.499828,18.264382,0.289,35.283567,2.286607,...,4.385552,5.490231,0.426069,0.500137,0.5,71.23642,4.269225,77.607251,737.628733,1086.500705
min,502.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,2.0,0.0,0.0,0.0,22.211163,2.522613,0.0,32.25,262.0
25%,849.5,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,2.0,6.0,1.0,0.0,0.0,71.149698,6.147959,18.0,258.642857,1231.75
50%,1222.5,0.0,1.5,1.0,3.0,1.0,31.5,0.5,141.0,5.0,...,5.0,11.0,1.0,1.0,1.0,103.431749,8.84883,54.0,466.607143,2174.0
75%,1614.0,1.0,2.225,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,9.0,16.0,1.0,1.0,1.0,150.904562,11.889007,126.0,830.9375,3095.75
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,18.0,20.0,1.0,1.0,1.0,493.462804,24.675,342.0,3978.0,4040.0


Есть предположение, что если убрать признаки, слабо коррелирующие с целевой переменной, то качество модели улучшится

In [15]:
# Features whose absolute correlation with the target is below this value are
# treated as uninformative and removed.
CORR_THRESHOLD = 0.04

# Correlations are computed on the training split only, so the test split does
# not influence feature selection.
corr_df = pd.concat([X_trainV2, y_train], axis=1).corr()
target_corr = corr_df["price_range"].abs()
low_corr_columns = list(target_corr[target_corr < CORR_THRESHOLD].index)

# "ram" and "int_memory" are always dropped as well - presumably because they
# are already encoded in the derived ram_per_core / total_memory columns
# (TODO confirm). They are only appended when not already selected above, so
# bad_columns never contains duplicate labels.
always_dropped = [column for column in ("ram", "int_memory") if column not in low_corr_columns]
bad_columns = low_corr_columns + always_dropped

In [16]:
# Remove the excluded columns from both splits so they keep identical feature sets.
X_train_filtered = X_trainV2.drop(bad_columns, axis=1)
X_test_filtered = X_testV2.drop(bad_columns, axis=1)

Обучим модель с улучшенным бейзлайном

In [17]:
# Standardise every remaining feature, then fit a decision tree on the filtered
# training split and evaluate it on the filtered test split.
numerical_transformer = StandardScaler()
scaled_columns = X_train_filtered.columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, scaled_columns)]
)

pipeline_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('decision_tree', DecisionTreeClassifier(random_state=12)),
])
pipeline_tree.fit(X_train_filtered, y_train)

y_pred_decision_tree = pipeline_tree.predict(X_test_filtered)
test_accuracy = accuracy_score(y_test, y_pred_decision_tree)
test_report = classification_report(y_test, y_pred_decision_tree)

print("=== Decision Tree Классификатор ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)


=== Decision Tree Классификатор ===
Accuracy: 0.875
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.94       100
           1       0.86      0.84      0.85       100
           2       0.79      0.83      0.81       100
           3       0.91      0.89      0.90       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



Попробуем подобрать лучшие гиперпараметры

In [18]:
# Exhaustive search over decision-tree hyperparameters, scored by accuracy with
# 4-fold cross-validation on the training split. The pipeline step prefix
# "decision_tree__" routes each parameter to the tree inside the pipeline.
param_grid = {
    'decision_tree__criterion': ["gini", "entropy"],
    'decision_tree__max_depth': [None, 5, 7, 10, 20],
    'decision_tree__min_samples_split': [2, 5, 7, 10],
    'decision_tree__min_samples_leaf': [1, 2, 5, 10],
}

pipeline_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('decision_tree', DecisionTreeClassifier(random_state=12)),
])

grid_search = GridSearchCV(
    pipeline_tree,
    param_grid,
    scoring='accuracy',
    cv=4,
)
grid_search.fit(X_train_filtered, y_train)

print("Лучшие параметры:", grid_search.best_params_)

# Evaluate the best pipeline found by the search on the held-out test split.
best_decision_tree = grid_search.best_estimator_
y_pred_decision_tree = best_decision_tree.predict(X_test_filtered)

test_accuracy = accuracy_score(y_test, y_pred_decision_tree)
test_report = classification_report(y_test, y_pred_decision_tree)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)


Лучшие параметры: {'decision_tree__criterion': 'entropy', 'decision_tree__max_depth': None, 'decision_tree__min_samples_leaf': 10, 'decision_tree__min_samples_split': 2}
Accuracy: 0.8825
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       100
           1       0.86      0.86      0.86       100
           2       0.79      0.81      0.80       100
           3       0.93      0.88      0.90       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



# 4. Своя имплементация DecisionTree

In [19]:
from sklearn.base import BaseEstimator, ClassifierMixin

# The mixin is listed before BaseEstimator, as scikit-learn recommends, so that
# the classifier-specific behaviour of ClassifierMixin takes precedence in the MRO.
class DecisionTree(ClassifierMixin, BaseEstimator):
    """Decision tree that greedily splits on the lowest weighted Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts. Internal nodes
    have the keys ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``;
    leaves have ``'label'`` and, for non-float targets, ``'probabilities'``
    (a mapping from class label to its share of the samples in the leaf).

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree; ``None`` means the depth is not limited.
    min_samples_split : int, default=2
        A node holding fewer samples than this is turned into a leaf.

    Attributes
    ----------
    tree : dict or None
        Root node of the fitted tree, ``None`` before ``fit`` is called.
    classes_ : ndarray
        Sorted unique values of ``y`` seen in ``fit``; defines the column order
        of ``predict_proba``.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Build the tree from a 2-D feature array ``X`` and targets ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if no
            samples are given.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got an array with {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        self.classes_ = np.unique(y)
        self.tree = self._build_tree(X, y, depth=0)
        return self

    def predict(self, X):
        """Return the label of the leaf each row of ``X`` ends up in."""
        self._check_is_fitted()
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def predict_proba(self, X):
        """Return class probabilities as an ``(n_samples, n_classes)`` array.

        Columns follow ``self.classes_``; classes absent from a leaf get
        probability 0.

        Raises
        ------
        AttributeError
            If the tree was fitted on float targets, whose leaves carry no
            class probabilities.
        """
        self._check_is_fitted()
        X = np.asarray(X)
        column_of = {label: column for column, label in enumerate(self.classes_)}
        proba = np.zeros((len(X), len(self.classes_)))
        for row, x in enumerate(X):
            for label, probability in self._traverse_proba(x, self.tree).items():
                proba[row, column_of[label]] = probability
        return proba

    def _check_is_fitted(self):
        """Raise ``NotFittedError`` when ``fit`` has not produced a tree yet."""
        if getattr(self, 'tree', None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This DecisionTree instance is not fitted yet. Call 'fit' before using it."
            )

    def _build_tree(self, X, y, depth):
        """Recursively build and return the node (dict) for the samples ``X``, ``y``."""
        num_samples = X.shape[0]
        too_small = num_samples < self.min_samples_split
        too_deep = self.max_depth is not None and depth >= self.max_depth
        is_pure = len(np.unique(y)) == 1
        if too_small or too_deep or is_pure:
            return self._create_leaf(y)

        best_split = self._find_best_split(X, y)
        if not best_split:
            # No threshold separates the samples (all rows identical per feature).
            return self._create_leaf(y)

        left_indices, right_indices = best_split['indices']
        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_tree,
            'right': right_tree
        }

    def _find_best_split(self, X, y):
        """Return the split with the lowest weighted Gini impurity, or ``None``.

        Every unique value of every feature is tried as a ``<=`` threshold.
        Ties keep the first candidate found (lowest feature index, then lowest
        threshold) because only a strictly smaller impurity replaces the best.
        """
        num_features = X.shape[1]
        best_gini = float('inf')
        best_split = None

        for feature in range(num_features):
            column = X[:, feature]
            for threshold in np.unique(column):
                # One comparison decides both sides, so every sample lands in
                # exactly one child. Rows for which `<=` is False (including
                # NaN) go right, matching the routing used at prediction time.
                goes_left = column <= threshold
                left_indices = np.where(goes_left)[0]
                right_indices = np.where(~goes_left)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = self._gini(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_split = {
                        'feature': feature,
                        'threshold': threshold,
                        'indices': (left_indices, right_indices)
                    }

        return best_split

    def _gini(self, left, right):
        """Return the size-weighted mean Gini impurity of the two label arrays."""
        def gini_impurity(y):
            _, counts = np.unique(y, return_counts=True)
            prob = counts / len(y)
            return 1 - np.sum(prob ** 2)

        left_gini = gini_impurity(left) if len(left) > 0 else 0
        right_gini = gini_impurity(right) if len(right) > 0 else 0

        return (len(left) * left_gini + len(right) * right_gini) / (len(left) + len(right))

    def _create_leaf(self, y):
        """Return a leaf node for the targets ``y``.

        Float targets keep the mean-value leaf. Every other dtype (integers,
        booleans, strings) is treated as class labels: the leaf stores the most
        frequent class and the per-class shares.
        """
        if y.dtype.kind == 'f':
            return {'label': np.mean(y)}

        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return {'label': classes[np.argmax(probabilities)], 'probabilities': dict(zip(classes, probabilities))}

    def _find_leaf(self, x, tree):
        """Walk from node ``tree`` down to the leaf that sample ``x`` falls into."""
        node = tree
        while 'label' not in node:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

    def _traverse_tree(self, x, tree):
        """Return the label stored in the leaf reached by sample ``x``."""
        return self._find_leaf(x, tree)['label']

    def _traverse_proba(self, x, tree):
        """Return the class -> probability dict of the leaf reached by sample ``x``."""
        leaf = self._find_leaf(x, tree)
        if 'probabilities' not in leaf:
            raise AttributeError(
                "predict_proba is not available: the tree was fitted on float targets"
            )
        return leaf['probabilities']

In [20]:
# Fit the hand-written DecisionTree on the raw (unfiltered) features and score
# it on the test split.
custom_tree = DecisionTree()
pipeline_tree = Pipeline(steps=[('decision_tree', custom_tree)])
pipeline_tree.fit(X_train, y_train)

y_pred_decision_tree = pipeline_tree.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_decision_tree)
test_report = classification_report(y_test, y_pred_decision_tree)

print("=== Decision Tree Классификатор ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)


=== Decision Tree Классификатор ===
Accuracy: 0.8425
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91       100
           1       0.82      0.74      0.78       100
           2       0.75      0.82      0.78       100
           3       0.92      0.87      0.89       100

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



In [21]:
# Same preprocessing as for the sklearn tree (standardise every filtered
# feature), but with the hand-written DecisionTree as the final step.
numerical_transformer = StandardScaler()
scaled_columns = X_train_filtered.columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, scaled_columns)]
)

pipeline_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('decision_tree', DecisionTree()),
])
pipeline_tree.fit(X_train_filtered, y_train)

y_pred_decision_tree = pipeline_tree.predict(X_test_filtered)
test_accuracy = accuracy_score(y_test, y_pred_decision_tree)
test_report = classification_report(y_test, y_pred_decision_tree)

print("=== Decision Tree Классификатор ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)


=== Decision Tree Классификатор ===
Accuracy: 0.8475
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.92      0.92       100
           1       0.84      0.79      0.81       100
           2       0.76      0.82      0.79       100
           3       0.89      0.86      0.87       100

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



Моя имплементация даёт результаты, близкие к результатам реализации из sklearn.

В ходе анализа данных можно понять, что некоторые признаки влияют на целевую переменную сильнее, чем остальные