In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the classification dataset and drop every row that has a missing value in any column
classification_data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/AiDatasets/jobDs.csv"
).dropna()

# Features are the raw job-title strings; the target is the category column
X_text, y_class = classification_data['job_title'], classification_data['category']

# Bag-of-words encoding: turns each title into a sparse vector of token counts
vectorizer = CountVectorizer()
X_class = vectorizer.fit_transform(X_text)

# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Load the regression dataset
regression_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/Delhi_v2.csv")

# Target is the "price" column
y_reg = regression_data["price"]

# Features: everything except the target and the "Address" / "desc" columns,
# with categorical columns one-hot encoded (first level of each dropped)
X_reg = pd.get_dummies(
    regression_data.drop(columns=["price", "Address", "desc"]),
    drop_first=True,
)

# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline: scikit-learn decision tree classifier with default hyperparameters
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X=X_train_class, y=y_train_class)

# Evaluate on the held-out split
y_pred_class = dt_classifier.predict(X_test_class)
accuracy = accuracy_score(y_true=y_test_class, y_pred=y_pred_class)
print(f'Accuracy for classification: {accuracy:.4f}')

Accuracy for classification: 0.7923


In [None]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Baseline: scikit-learn decision tree regressor with default hyperparameters
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X=X_train_reg, y=y_train_reg)

# Predict on the held-out split
y_pred_reg = dt_regressor.predict(X_test_reg)

# Compute the three regression metrics in one pass over the metric functions
mae, mse, r2 = (
    metric(y_test_reg, y_pred_reg)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the metrics
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R^2: {r2:.4f}")


MAE: 230981.9121
MSE: 822028294573.6434
R^2: 0.9861


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Preprocessing for the random-forest experiment.
# The bag-of-words matrix is kept sparse end to end. An earlier version of this
# cell first converted it to a dense array and standard-scaled that copy, but the
# result was overwritten below before being used, so that step only cost memory.

# Mean-impute missing values (if any); statistics are learned on the training split only
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_class)
X_test_imputed = imputer.transform(X_test_class)

# Scale to unit variance; with_mean=False skips centering, which sparse input does not support
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Deliberately small hyperparameter grid to keep the search fast
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [10, 15],
    'min_samples_split': [2, 4]
}

# n_jobs=-1 uses all CPU cores; cv=3 keeps the number of fits low
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search_rf = GridSearchCV(rf_classifier, param_grid, cv=3, n_jobs=-1)
grid_search_rf.fit(X_train_scaled, y_train_class)

# Best hyperparameters found by the search
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Evaluate the refitted best estimator on the held-out split
y_pred_class_rf = grid_search_rf.best_estimator_.predict(X_test_scaled)
accuracy = accuracy_score(y_test_class, y_pred_class_rf)

print("Accuracy of the model:", accuracy)



Best parameters for Random Forest: {'max_depth': 15, 'min_samples_split': 4, 'n_estimators': 100}
Accuracy of the model: 0.6926994906621392


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Mean-impute missing feature values; statistics are learned on the training split only
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train_reg)
X_test_imputed = imputer.transform(X_test_reg)

# Candidate hyperparameter values sampled by the randomized search
# ("auto" is intentionally not among the max_features options)
param_dist = dict(
    max_depth=[3, 5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
)

dt_regressor = DecisionTreeRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=dt_regressor,
    param_distributions=param_dist,
    n_iter=10,            # number of sampled configurations
    cv=3,
    random_state=42,
    n_jobs=-1,
    error_score="raise",  # abort the search on the first failing fit
)

# Run the search on the imputed training data
random_search.fit(X_train_imputed, y_train_reg)

# Best hyperparameters found by the search
print("Лучшие параметры решающего дерева для регрессии:", random_search.best_params_)

# Evaluate the refitted best estimator on the held-out split
best_dt_regressor = random_search.best_estimator_
y_pred = best_dt_regressor.predict(X_test_imputed)

mae, mse, r2 = (
    metric(y_test_reg, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R^2: {r2}")


Лучшие параметры решающего дерева для регрессии: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': None}
MAE: 268841.920963045
MSE: 961635486634.9545
R^2: 0.9837790886889036


Кастомный классификатор решающим деревом

In [None]:
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix

class DecisionTreeClassifierCustom:
    """Binary-split decision tree classifier that minimises weighted Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``'feature_index'``, ``'threshold'``, ``'left'`` and ``'right'``; a leaf is
    stored as the bare class label. A sample goes left when
    ``x[feature_index] <= threshold`` and right otherwise.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no limit. ``0`` yields a
        single leaf holding the majority class.
    min_samples_split : int
        A node with fewer samples than this is turned into a leaf.
    min_samples_leaf : int
        A split is only considered if both children get at least this many
        samples (and always at least one).
    """

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    @staticmethod
    def _to_dense(X):
        """Return ``X`` as a dense numpy array (sparse matrices are densified)."""
        # Duck-typed so that any scipy sparse format works, not only csr_matrix.
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (first seen wins on ties)."""
        return Counter(y.tolist()).most_common(1)[0][0]

    def fit(self, X, y):
        """Build the tree from samples ``X`` (n_samples, n_features) and labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = self._to_dense(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X`` / ``y`` at ``depth``."""
        n_samples = X.shape[0]

        # Stop when the node is pure, the depth limit is hit, or the node is too
        # small to split. `is not None` keeps max_depth=0 meaningful.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if np.unique(y).size == 1 or depth_exhausted or n_samples < self.min_samples_split:
            return self._majority_class(y)

        best_split = self._best_split(X, y)
        if best_split is None:
            return self._majority_class(y)

        left_mask = best_split['left_indices']
        right_mask = best_split['right_indices']
        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': self._build_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def _best_split(self, X, y):
        """Return the split with the lowest weighted Gini impurity, or ``None``.

        The result is a dict with ``'feature_index'``, ``'threshold'`` and the
        boolean row masks ``'left_indices'`` / ``'right_indices'``.
        """
        n_samples, n_features = X.shape
        # Never accept a split with an empty child, even if min_samples_leaf < 1.
        min_leaf = max(1, self.min_samples_leaf)

        best_gini = float("inf")
        best_split = None

        for feature_index in range(n_features):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_indices = column <= threshold
                n_left = int(np.count_nonzero(left_indices))
                n_right = n_samples - n_left

                # Skip splits that would leave a child too small
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                # Complement of the left mask, so routing matches predict()
                right_indices = ~left_indices
                gini = self._gini_index(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                    }

        return best_split

    def _gini_index(self, left_y, right_y):
        """Size-weighted average of the Gini impurities of the two children."""
        left_y = np.asarray(left_y)
        right_y = np.asarray(right_y)

        def impurity(labels):
            # An empty child contributes nothing (its weight is zero anyway)
            if labels.size == 0:
                return 0.0
            _, counts = np.unique(labels, return_counts=True)
            proportions = counts / labels.size
            return 1 - np.sum(proportions ** 2)

        total_size = left_y.size + right_y.size
        return (left_y.size / total_size) * impurity(left_y) + (right_y.size / total_size) * impurity(right_y)

    def predict(self, X):
        """Return an array with the predicted class label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTreeClassifierCustom must be fitted before calling predict")
        X = self._to_dense(X)
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def _predict_sample(self, x, tree):
        """Walk ``tree`` from the given node down to a leaf for one sample ``x``."""
        node = tree
        # Internal nodes are dicts; anything else is a leaf label
        while isinstance(node, dict):
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

In [None]:
# Train the custom decision tree classifier
classifier = DecisionTreeClassifierCustom(max_depth=5, min_samples_split=10, min_samples_leaf=5)
classifier.fit(X_train_class, y_train_class)

# Predict on the held-out split
y_pred = classifier.predict(X_test_class)

# Score the custom model's own predictions (y_pred). Scoring y_pred_class here
# would just reproduce the accuracy of the scikit-learn baseline tree.
accuracy = accuracy_score(y_test_class, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7923033389926429


Кастомная регрессия решающим деревом

In [None]:
class DecisionTreeRegressorCustom:
    """Binary-split decision tree regressor that minimises squared error.

    The fitted tree is stored in ``self.tree`` as nested dicts. Internal nodes
    have the keys ``'feature_index'``, ``'threshold'``, ``'left'`` and
    ``'right'``; leaves are ``{'value': prediction}``. A sample goes left when
    ``sample[feature_index] <= threshold`` and right otherwise.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no limit.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        A split is only considered if both children get at least this many
        samples (and always at least one).
    """

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def fit(self, X, y):
        """Build the tree from samples ``X`` (n_samples, n_features) and targets ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X`` / ``y`` at ``depth``."""
        n_samples = X.shape[0]
        unique_values = np.unique(y)

        # A node whose targets are all equal cannot be improved by splitting
        if len(unique_values) == 1:
            return {'value': unique_values[0]}

        # max_depth=None means "no limit"; comparing depth with None would raise TypeError
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split:
            return {'value': np.mean(y)}

        best_split = self._best_split(X, y)
        if best_split is None:
            return {'value': np.mean(y)}

        feature_index, threshold = best_split
        left_indices = X[:, feature_index] <= threshold
        right_indices = ~left_indices
        return {
            'feature_index': feature_index,
            'threshold': threshold,
            'left': self._build_tree(X[left_indices], y[left_indices], depth + 1),
            'right': self._build_tree(X[right_indices], y[right_indices], depth + 1),
        }

    def _best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the lowest-cost split, or ``None``."""
        n_samples, n_features = X.shape
        # Never accept a split with an empty child, even if min_samples_leaf < 1.
        min_leaf = max(1, self.min_samples_leaf)

        best_split = None
        best_score = float('inf')
        for feature_index in range(n_features):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_indices = column <= threshold
                n_left = int(np.count_nonzero(left_indices))
                n_right = n_samples - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                score = self._split_cost(y[left_indices], y[~left_indices])
                if score < best_score:
                    best_score = score
                    best_split = (feature_index, threshold)

        return best_split

    @staticmethod
    def _split_cost(left_y, right_y):
        """Total squared error of the two children around their own means.

        Class-based impurity such as Gini is not meaningful for a continuous
        target: when all targets are distinct every split gets the same score.
        Squared error measures how well each child's mean fits its targets.
        """
        left_sse = np.sum((left_y - np.mean(left_y)) ** 2)
        right_sse = np.sum((right_y - np.mean(right_y)) ** 2)
        return left_sse + right_sse

    def predict(self, X):
        """Return an array with the predicted value for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTreeRegressorCustom must be fitted before calling predict")
        # np.asarray makes row iteration work for DataFrames as well as arrays
        X = np.asarray(X)
        predictions = [self._predict_sample(sample, self.tree) for sample in X]
        return np.array(predictions)

    def _predict_sample(self, sample, tree):
        """Walk ``tree`` from the given node down to a leaf for one ``sample``."""
        node = tree
        # Leaves are the only nodes carrying a 'value' key
        while 'value' not in node:
            if sample[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['value']

In [None]:
# Train the custom decision tree regressor on the raw numpy arrays
regressor = DecisionTreeRegressorCustom(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
)
regressor.fit(X_train_reg.values, y_train_reg.values)

# Predict on the held-out split
y_pred = regressor.predict(X_test_reg.values)

# Evaluate the predictions
mae, mse = (
    metric(y_test_reg, y_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"MAE: {mae}")
print(f"MSE: {mse}")

  left_indices = X[:, feature_index] <= threshold


MAE: 1258357.4966011762
MSE: 11770383511829.137
