# 1.	Выбор начальных условий

In [1]:
import warnings

import kagglehub
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the Kaggle dataset through kagglehub and keep the directory it
# reports, so the files can be located afterwards.
DATASET_HANDLE = "iabhishekofficial/mobile-price-classification"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/iabhishekofficial/mobile-price-classification?dataset_version_number=1...


100%|██████████| 70.6k/70.6k [00:00<00:00, 47.1MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/iabhishekofficial/mobile-price-classification/versions/1





In [3]:
!ls /root/.cache/kagglehub/datasets/iabhishekofficial/mobile-price-classification/versions/1

test.csv  train.csv


In [2]:
# Read the training split from the directory returned by the download cell
# (`path`) rather than from a user-specific absolute path, so the notebook
# runs unchanged on any machine.
df = pd.read_csv(f"{path}/train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

Описание фичей

- **battery_power** - Общая энергия, которую аккумулятор может хранить за один раз, измеряется в мАч.
- **blue** - Есть Bluetooth или нет
- **clock_speed** - Скорость, с которой микропроцессор выполняет инструкции
- **dual_sim** - Имеет поддержку двух SIM-карт или нет
- **fc** - Разрешение фронтальной камеры в мегапикселях
- **four_g** - Есть 4G или нет
- **int_memory** - Внутренняя память в гигабайтах
- **m_dep** - Глубина мобильного телефона в см
- **mobile_wt** - Вес мобильного телефона
- **n_cores** - Количество ядер процессора
- **pc** - Мегапиксели основной камеры
- **px_height** - Высота разрешения экрана в пикселях
- **px_width** - Ширина разрешения экрана в пикселях
- **ram** - Оперативная память в мегабайтах
- **sc_h** - Высота экрана мобильного телефона в см
- **sc_w** - Ширина экрана мобильного телефона в см
- **talk_time** - Максимальное время работы от одного заряда батареи в режиме разговора
- **three_g** - Есть 3G или нет
- **touch_screen** - Есть сенсорный экран или нет
- **wifi** - Есть Wi-Fi или нет
- **price_range** - Это целевая переменная со значением 0 (низкая стоимость), 1 (средняя стоимость), 2 (высокая стоимость) и 3 (очень высокая стоимость).


В данной задаче необходимо классифицировать объект (телефон), который состоит из набора характеристик, по четырём классам price_range со значениями 0 (низкая стоимость), 1 (средняя стоимость), 2 (высокая стоимость) и 3 (очень высокая стоимость).

Довольно актуальная задача для ценообразования и выбора хорошего телефона по характеристикам.



In [4]:
# Separate the label column from the predictor columns.
target_columns = 'price_range'

y = df[target_columns]
X = df.drop(columns=target_columns)

In [5]:
# Show the dtype of every column.
df.dtypes

battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

In [6]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Count the rows per class in the training labels.
y_train.value_counts()

price_range
1    400
0    400
3    400
2    400
Name: count, dtype: int64

In [8]:
# Count the rows per class in the test labels.
y_test.value_counts()

price_range
3    100
1    100
0    100
2    100
Name: count, dtype: int64

Поскольку все классы распределены равномерно и ни один из них не преобладает, естественным выбором будет метрика **Accuracy**.

# 2.	Создание бейзлайна и оценка качества

Random Forest

In [9]:
# Baseline: a random forest with library defaults (apart from the seed),
# trained on the raw features with no preprocessing step.
baseline_forest = RandomForestClassifier(random_state=12)
pipeline_random_forest = Pipeline(steps=[('random_forest', baseline_forest)])

pipeline_random_forest.fit(X_train, y_train)
y_pred_random_forest = pipeline_random_forest.predict(X_test)

# Score the held-out predictions.
baseline_accuracy = accuracy_score(y_test, y_pred_random_forest)
baseline_report = classification_report(y_test, y_pred_random_forest)

print("=== Random Forest Классификатор ===")
print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)


=== Random Forest Классификатор ===
Accuracy: 0.8975
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       100
           1       0.87      0.84      0.85       100
           2       0.82      0.86      0.84       100
           3       0.96      0.92      0.94       100

    accuracy                           0.90       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.90      0.90      0.90       400



# 3.	Улучшение бейзлайна

Некоторые проверки были сделаны в ноутбуке с KNN

Добавим фичи в зависимости от других параметров:

Плотность пикселей
$$
ppi = \frac{\sqrt{px\_height^2 + px\_width^2}}{\sqrt{sc\_h^2 + sc\_w^2}}
$$

Эффективность батареи

$$
battery\_efficiency = \frac{battery\_power}{mobile\_wt}
$$

Размер экрана
$$
screen\_size = sc\_h \cdot sc\_w
$$

Оперативная память на ядро
$$
ram\_per\_core = \frac{ram}{n\_cores}
$$

Общая память
$$
total\_memory = int\_memory + ram
$$



In [10]:
def add_engineered_features(features: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``features`` with five derived columns appended.

    The input frame is not modified. Added columns, in order:

    - ``ppi``: pixel diagonal divided by screen diagonal.
    - ``battery_efficiency``: ``battery_power / mobile_wt``.
    - ``screen_size``: ``sc_h * sc_w``.
    - ``ram_per_core``: ``ram / n_cores``.
    - ``total_memory``: ``int_memory + ram``.

    Parameters
    ----------
    features : pd.DataFrame
        Frame containing the columns ``px_height``, ``px_width``, ``sc_h``,
        ``sc_w``, ``battery_power``, ``mobile_wt``, ``ram``, ``n_cores`` and
        ``int_memory``.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns followed by the derived ones.
    """
    pixel_diagonal = np.sqrt(features["px_height"] ** 2 + features["px_width"] ** 2)
    # NOTE(review): a row with sc_h == sc_w == 0 would make this denominator 0
    # and the resulting ppi infinite.
    screen_diagonal = np.sqrt(features["sc_h"] ** 2 + features["sc_w"] ** 2)

    return features.assign(
        ppi=pixel_diagonal / screen_diagonal,
        battery_efficiency=features["battery_power"] / features["mobile_wt"],
        screen_size=features["sc_h"] * features["sc_w"],
        ram_per_core=features["ram"] / features["n_cores"],
        # NOTE(review): the feature description lists int_memory in gigabytes
        # and ram in megabytes, so this sum mixes units; kept as-is so the
        # results stay unchanged.
        total_memory=features["int_memory"] + features["ram"],
    )


# Apply the same transformation to both splits so they cannot drift apart.
X_trainV2 = add_engineered_features(X_train)
X_testV2 = add_engineered_features(X_test)

Статистика по фичам

In [11]:
# Summary statistics for the numeric columns of the extended training frame.
numeric_only = X_trainV2.select_dtypes(include="number")
numeric_stats = numeric_only.describe()

In [12]:
# Display the summary statistics computed in the previous cell.
numeric_stats

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,sc_w,talk_time,three_g,touch_screen,wifi,ppi,battery_efficiency,screen_size,ram_per_core,total_memory
count,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,...,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0
mean,1236.665625,0.494375,1.529313,0.506875,4.3225,0.518125,32.063125,0.503563,140.1025,4.546875,...,5.77875,10.92125,0.761875,0.504375,0.5125,121.253076,9.445824,80.6375,708.905294,2157.070625
std,438.041695,0.500125,0.817696,0.500109,4.337799,0.499828,18.264382,0.289,35.283567,2.286607,...,4.385552,5.490231,0.426069,0.500137,0.5,71.23642,4.269225,77.607251,737.628733,1086.500705
min,502.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,2.0,0.0,0.0,0.0,22.211163,2.522613,0.0,32.25,262.0
25%,849.5,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,2.0,6.0,1.0,0.0,0.0,71.149698,6.147959,18.0,258.642857,1231.75
50%,1222.5,0.0,1.5,1.0,3.0,1.0,31.5,0.5,141.0,5.0,...,5.0,11.0,1.0,1.0,1.0,103.431749,8.84883,54.0,466.607143,2174.0
75%,1614.0,1.0,2.225,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,9.0,16.0,1.0,1.0,1.0,150.904562,11.889007,126.0,830.9375,3095.75
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,18.0,20.0,1.0,1.0,1.0,493.462804,24.675,342.0,3978.0,4040.0


Есть предположение, что если убрать признаки, слабо коррелирующие с целевой переменной, то качество модели улучшится.

In [59]:
# Correlation matrix of the extended training features together with the label.
train_with_target = pd.concat([X_trainV2, y_train], axis=1)
corr_df = train_with_target.corr()

# Columns whose absolute correlation with the label is below 0.04 ...
low_correlation_mask = corr_df["price_range"].abs() < 0.04
weakly_correlated = corr_df.loc[low_correlation_mask].index.tolist()

# ... plus a hand-picked list of columns to drop.
manually_excluded = [
    "ram",
    "int_memory",
    "ppi",
    "battery_efficiency",
    "sc_w",
    "pc",
    "screen_size",
    "mobile_wt",
]

bad_columns = weakly_correlated + manually_excluded

In [60]:
# Remove the rejected columns from both splits in one pass.
X_train_filtered, X_test_filtered = (
    split.drop(columns=bad_columns) for split in (X_trainV2, X_testV2)
)

Обучим модель с улучшенным бейзлайном

In [61]:
# Standardise every remaining column, then fit a random forest on the result.
selected_features = list(X_train_filtered.columns)
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, selected_features)],
)

improved_forest = RandomForestClassifier(random_state=12)
pipeline_random_forest = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('random_forest', improved_forest),
    ]
)
pipeline_random_forest.fit(X_train_filtered, y_train)

# Held-out performance.
y_pred_random_forest = pipeline_random_forest.predict(X_test_filtered)
test_accuracy = accuracy_score(y_test, y_pred_random_forest)
test_report = classification_report(y_test, y_pred_random_forest)

print("=== Random forest Классификатор ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Training-set performance, printed for comparison with the held-out scores.
y_train_pred = pipeline_random_forest.predict(X_train_filtered)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred)

print("=== Производительность на тренировочных данных ===")
print("Accuracy (Train):", train_accuracy)
print("Classification Report (Train):\n", train_report)


=== Random forest Классификатор ===
Accuracy: 0.94
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       100
           1       0.91      0.90      0.90       100
           2       0.90      0.90      0.90       100
           3       0.98      0.97      0.97       100

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400

=== Производительность на тренировочных данных ===
Accuracy (Train): 1.0
Classification Report (Train):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      1.00      1.00       400
           2       1.00      1.00      1.00       400
           3       1.00      1.00      1.00       400

    accuracy                           1.00      1600
   macro avg       1.00      1.00      1.00      1600
weigh

Попробуем подобрать лучшие гиперпараметры

In [62]:
# Build a fresh, unfitted pipeline for the search; it reuses the
# `preprocessor` defined in the previous modelling cell.
pipeline_random_forest = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestClassifier(random_state=12))
])

# Candidate hyper-parameters. The `random_forest__` prefix routes each key to
# the pipeline step with that name.
param_grid = {
    'random_forest__n_estimators': [5, 10, 50, 100],
    'random_forest__max_depth': [None, 5, 10, 20, 50],
    'random_forest__min_samples_split': [2, 5, 7, 10],
    'random_forest__min_samples_leaf': [1, 2, 4, 8],
}

# 5-fold cross-validation on the training split only, scored by accuracy.
# The grid has 4 * 5 * 4 * 4 = 320 candidates, i.e. 1600 fits, so run them in
# parallel on all cores.
grid_search = GridSearchCV(
    pipeline_random_forest,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_filtered, y_train)

print("Лучшие параметры:", grid_search.best_params_)

# Evaluate the best configuration found by the search on the held-out split.
best_random_forest = grid_search.best_estimator_

y_pred_random_forest = best_random_forest.predict(X_test_filtered)

print("Accuracy:", accuracy_score(y_test, y_pred_random_forest))
print("Classification Report:\n", classification_report(y_test, y_pred_random_forest))


Лучшие параметры: {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 100}
Accuracy: 0.94
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       100
           1       0.91      0.90      0.90       100
           2       0.90      0.90      0.90       100
           3       0.98      0.97      0.97       100

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400



# 4. Своя имплементация RandomForestClassifier

In [63]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so that
# the `implementations` module can be imported (presumably it lives next to
# the notebooks folder; verify against the repository layout).
project_root = Path().resolve().parent
# Guard the append so that re-running this cell does not pile up duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from implementations import RandomForest

In [64]:
# Custom RandomForest on the raw features, passed as NumPy arrays.
custom_forest = RandomForest(random_state=12)
pipeline_random_forest = Pipeline(steps=[('random_forest', custom_forest)])

pipeline_random_forest.fit(X_train.to_numpy(), y_train)

# Held-out performance.
y_pred_random_forest = pipeline_random_forest.predict(X_test.to_numpy())
test_accuracy = accuracy_score(y_test, y_pred_random_forest)
test_report = classification_report(y_test, y_pred_random_forest)

print("=== Random forest Классификатор ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Training-set performance, printed for comparison with the held-out scores.
y_train_pred = pipeline_random_forest.predict(X_train.to_numpy())
train_accuracy = accuracy_score(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred)

print("=== Производительность на тренировочных данных ===")
print("Accuracy (Train):", train_accuracy)
print("Classification Report (Train):\n", train_report)

=== Random forest Классификатор ===
Accuracy: 0.77
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       100
           1       0.70      0.70      0.70       100
           2       0.65      0.66      0.66       100
           3       0.88      0.83      0.86       100

    accuracy                           0.77       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.77      0.77      0.77       400

=== Производительность на тренировочных данных ===
Accuracy (Train): 1.0
Classification Report (Train):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      1.00      1.00       400
           2       1.00      1.00      1.00       400
           3       1.00      1.00      1.00       400

    accuracy                           1.00      1600
   macro avg       1.00      1.00      1.00      1600
weigh

In [72]:
# Custom RandomForest on the filtered, standardised features with explicit
# depth, tree-count and split-size settings.
selected_features = list(X_train_filtered.columns)
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, selected_features)],
)

custom_forest = RandomForest(
    max_depth=4,
    random_state=12,
    n_estimators=25,
    min_samples_split=2,
)
pipeline_random_forest = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('random_forest', custom_forest),
    ]
)
pipeline_random_forest.fit(X_train_filtered, y_train)

# Held-out performance.
y_pred_random_forest = pipeline_random_forest.predict(X_test_filtered)
test_accuracy = accuracy_score(y_test, y_pred_random_forest)
test_report = classification_report(y_test, y_pred_random_forest)

print("=== Random forest Классификатор ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Training-set performance, printed for comparison with the held-out scores.
y_train_pred = pipeline_random_forest.predict(X_train_filtered)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred)

print("=== Производительность на тренировочных данных ===")
print("Accuracy (Train):", train_accuracy)
print("Classification Report (Train):\n", train_report)


=== Random forest Классификатор ===
Accuracy: 0.805
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91       100
           1       0.72      0.74      0.73       100
           2       0.70      0.71      0.70       100
           3       0.90      0.87      0.88       100

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

=== Производительность на тренировочных данных ===
Accuracy (Train): 0.8375
Classification Report (Train):
               precision    recall  f1-score   support

           0       0.91      0.88      0.90       400
           1       0.77      0.80      0.79       400
           2       0.78      0.78      0.78       400
           3       0.89      0.89      0.89       400

    accuracy                           0.84      1600
   macro avg       0.84      0.84      0.84      1600
w

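Моя имплементация даёт результаты, схожие с реализацией в sklearn, хотя и уступает ей по accuracy на тестовой выборке: 0.77 против 0.8975 на исходных признаках и 0.805 против 0.94 на отфильтрованных.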

В ходе анализа данных можно понять, что некоторые признаки влияют на целевую переменную больше, чем остальные.