In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

# === 1. Building the baseline ===
# Classification: predict the job category from the job title text.
classification_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/jobDs.csv")
# Only the two columns used below must be present; a bare dropna() would also
# discard rows whose missing values sit in columns this notebook never reads.
classification_data = classification_data.dropna(subset=['job_title', 'category'])
X_text = classification_data['job_title']
y_class = classification_data['category']

# Split the raw text BEFORE vectorizing so that the vocabulary is learned from
# the training rows only (fitting on the full corpus leaks test data).
X_text_train, X_text_test, y_train_class, y_test_class = train_test_split(
    X_text, y_class, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer()
X_train_class = vectorizer.fit_transform(X_text_train)
X_test_class = vectorizer.transform(X_text_test)

# Full bag-of-words matrix, kept so the name `X_class` stays defined; it is
# encoded with the training-only vocabulary.
X_class = vectorizer.transform(X_text)

In [None]:
# Regression: predict the listing price from the remaining columns.
regression_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/Delhi_v2.csv")

# The target is "price"; "Address" and "desc" are excluded from the features.
y_reg = regression_data["price"]
reg_feature_frame = regression_data.drop(columns=["price", "Address", "desc"])

# One-hot encode the remaining categorical columns, dropping the first level
# of each one.
X_reg = pd.get_dummies(reg_feature_frame, drop_first=True)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random-forest baseline for classification.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_class, y_train_class)
y_pred_class = clf.predict(X_test_class)

print("=== Классификация: Бейзлайн ===")
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
# Some rare categories never get predicted, which makes their precision
# undefined. zero_division=0 reports those scores as 0.0 explicitly instead of
# emitting an UndefinedMetricWarning for each averaged metric.
print(classification_report(y_test_class, y_pred_class, zero_division=0))

=== Классификация: Бейзлайн ===
Accuracy: 0.8058856819468024
                                        precision    recall  f1-score   support

                            Accounting       0.40      0.22      0.29         9
       Administration & Office Support       0.81      0.85      0.83       436
             Advertising, Arts & Media       0.00      0.00      0.00        12
          Banking & Financial Services       0.69      0.76      0.72       208
              CEO & General Management       0.67      0.60      0.63        10
        Call Centre & Customer Service       0.55      0.51      0.53        35
                          Construction       0.76      0.76      0.76        85
                 Consulting & Strategy       0.35      0.25      0.29        24
                 Design & Architecture       0.80      0.47      0.59        17
                           Engineering       0.50      0.33      0.40         3
                  Healthcare & Medical       0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Random-forest baseline for regression.
reg = RandomForestRegressor(random_state=42).fit(X_train_reg, y_train_reg)
y_pred_reg = reg.predict(X_test_reg)

# Evaluate on the held-out split.
baseline_reg_mse = mean_squared_error(y_test_reg, y_pred_reg)
baseline_reg_r2 = r2_score(y_test_reg, y_pred_reg)

print("\n=== Регрессия: Бейзлайн ===")
print("MSE:", baseline_reg_mse)
print("R2 Score:", baseline_reg_r2)


=== Регрессия: Бейзлайн ===
MSE: 408716011860.4651
R2 Score: 0.993105759643903


In [None]:
# === 2. Improving the baseline ===
# Hypothesis: tuning the forest hyper-parameters improves the scores.
# Both searches explore the same 3 x 3 x 3 space; each grid is built
# separately so the two dictionaries (and their lists) stay independent.
param_grid_clf = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

param_grid_reg = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Classification: exhaustive grid search over param_grid_clf with 3-fold CV.
grid_clf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_clf,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,  # run the candidate fits in parallel, like the regression search
)
grid_clf.fit(X_train_class, y_train_class)

# best_estimator_ is refit on the whole training split with the best parameters.
best_clf = grid_clf.best_estimator_
y_pred_class_tuned = best_clf.predict(X_test_class)

print("\n=== Классификация: Улучшенный бейзлайн ===")
print("Лучшие параметры:", grid_clf.best_params_)
print("Accuracy:", accuracy_score(y_test_class, y_pred_class_tuned))
# zero_division=0 reports undefined precision (classes that are never
# predicted) as 0.0 without raising UndefinedMetricWarning.
print(classification_report(y_test_class, y_pred_class_tuned, zero_division=0))





=== Классификация: Улучшенный бейзлайн ===
Лучшие параметры: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 150}
Accuracy: 0.8126768534238823
                                        precision    recall  f1-score   support

                            Accounting       0.67      0.22      0.33         9
       Administration & Office Support       0.80      0.87      0.83       436
             Advertising, Arts & Media       0.00      0.00      0.00        12
          Banking & Financial Services       0.71      0.77      0.74       208
              CEO & General Management       0.56      0.50      0.53        10
        Call Centre & Customer Service       0.54      0.57      0.56        35
                          Construction       0.78      0.78      0.78        85
                 Consulting & Strategy       0.29      0.17      0.21        24
                 Design & Architecture       0.83      0.59      0.69        17
                           Engineering    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Regression: randomized search over param_grid_reg with 3-fold CV.
# RandomizedSearchCV is imported in the notebook's top import cell.
random_reg = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_grid_reg,
    n_iter=5,  # sample only 5 parameter combinations to cut the search cost
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=42
)
random_reg.fit(X_train_reg, y_train_reg)

# best_estimator_ is refit on the whole training split with the best parameters.
best_reg = random_reg.best_estimator_
y_pred_reg_tuned = best_reg.predict(X_test_reg)

print("\n=== Регрессия: Улучшенный бейзлайн ===")
print("Лучшие параметры:", random_reg.best_params_)
print("MSE:", mean_squared_error(y_test_reg, y_pred_reg_tuned))
print("R2 Score:", r2_score(y_test_reg, y_pred_reg_tuned))



=== Регрессия: Улучшенный бейзлайн с RandomizedSearch ===
Лучшие параметры: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 20}
MSE: 497947779992.0194
R2 Score: 0.9916005941034142


In [None]:
def manual_random_forest(X, y, n_estimators=10, max_depth=None, task=None, random_state=None):
    """Train a bagged ensemble of decision trees on bootstrap resamples.

    Each tree is fitted on ``len(y)`` rows drawn with replacement from
    ``(X, y)``. No per-split feature subsampling is configured; the trees use
    scikit-learn's defaults apart from ``max_depth``.

    Parameters
    ----------
    X : array-like or sparse matrix with one row per sample
        Feature matrix. pandas objects are converted with ``to_numpy()`` so
        that rows are selected by position, not by index label.
    y : array-like of length n_samples
        Targets. Converted with ``np.asarray``.
    n_estimators : int, default=10
        Number of trees to train.
    max_depth : int or None, default=None
        Depth limit passed to every tree.
    task : {"classification", "regression", None}, default=None
        Which kind of tree to build. ``None`` keeps the historical heuristic:
        classifiers when the first target is a ``str``, regressors otherwise.
        Pass ``"classification"`` explicitly for integer-encoded class labels,
        which the heuristic would treat as a regression target.
    random_state : int or None, default=None
        Seed for the bootstrap draws and the trees. ``None`` draws from
        NumPy's global generator and leaves the trees unseeded.

    Returns
    -------
    list
        The fitted trees, in training order.

    Raises
    ------
    ValueError
        If ``task`` is not a supported value, or if ``X`` and ``y`` disagree
        on the number of samples.
    """
    if task not in (None, "classification", "regression"):
        raise ValueError(
            f"task must be 'classification', 'regression' or None, got {task!r}"
        )

    # Positional row selection below needs array semantics: indexing a pandas
    # Series/DataFrame with [] would look the bootstrap indices up by label.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    elif not hasattr(X, "shape"):
        X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(y)
    if X.shape[0] != n_samples:
        raise ValueError(
            f"X has {X.shape[0]} rows but y has {n_samples} entries"
        )

    # Imported after validation so that bad arguments fail fast.
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

    if task is None:
        is_text_target = n_samples > 0 and isinstance(y[0], str)
        task = "classification" if is_text_target else "regression"
    tree_cls = DecisionTreeClassifier if task == "classification" else DecisionTreeRegressor

    # The np.random module and a RandomState instance both expose the
    # choice/randint methods used below.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    trees = []
    for _ in range(n_estimators):
        bootstrap_indices = rng.choice(n_samples, size=n_samples, replace=True)
        tree_seed = None if random_state is None else int(rng.randint(np.iinfo(np.int32).max))

        tree = tree_cls(max_depth=max_depth, random_state=tree_seed)
        tree.fit(X[bootstrap_indices], y[bootstrap_indices])
        trees.append(tree)

    return trees

Применение самописного случайного леса для классификации

In [None]:
# Seed NumPy's global generator, which manual_random_forest draws its
# bootstrap samples from, so the reported accuracy is reproducible.
np.random.seed(42)

# Train on the original string labels: manual_random_forest chooses the tree
# type from the label type, so integer-encoded classes would be fitted with
# regression trees. Class indices are nominal and must never be averaged.
manual_forest_class = manual_random_forest(
    X_train_class.toarray(), y_train_class.to_numpy(), n_estimators=10, max_depth=10
)

# Predict on the test data; densify the test matrix once, not once per tree.
X_test_class_dense = X_test_class.toarray()
tree_votes = np.array([tree.predict(X_test_class_dense) for tree in manual_forest_class])

# Majority vote: encode the predicted labels as integer codes, count the codes
# per test sample (one column per sample), and keep the most frequent label.
vote_classes, vote_codes = np.unique(tree_votes.ravel(), return_inverse=True)
vote_codes = vote_codes.reshape(tree_votes.shape)
vote_counts = np.apply_along_axis(np.bincount, 0, vote_codes, minlength=len(vote_classes))
manual_preds_class = vote_classes[vote_counts.argmax(axis=0)]
man_pred_class_labels = manual_preds_class.tolist()

print("\n=== Классификация: Самописный случайный лес ===")
print("Accuracy:", accuracy_score(y_test_class, man_pred_class_labels))


=== Классификация: Самописный случайный лес ===
Accuracy: 0.3503112620260328


Применение самописного случайного леса для регрессии

In [None]:
# Seed NumPy's global generator, which manual_random_forest draws its
# bootstrap samples from, so the reported metrics are reproducible.
np.random.seed(42)

manual_forest_reg = manual_random_forest(X_train_reg.to_numpy(), y_train_reg.to_numpy(), n_estimators=10, max_depth=10)

# Predict on the test data: convert the test frame once, then average the
# per-tree predictions.
X_test_reg_values = X_test_reg.to_numpy()
manual_preds_reg = np.mean([tree.predict(X_test_reg_values) for tree in manual_forest_reg], axis=0)

print("\n=== Регрессия: Самописный случайный лес ===")
print("MSE:", mean_squared_error(y_test_reg, manual_preds_reg))
print("R2 Score:", r2_score(y_test_reg, manual_preds_reg))



=== Регрессия: Самописный случайный лес ===
MSE: 494091588682.0969
R2 Score: 0.9916656405145607
