In [None]:
# 1. Baseline Implementation
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
import pandas as pd
# Load and preprocess classification data
classification_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/jobDs.csv")
classification_data = classification_data.dropna()
X_text = classification_data['job_title']
y_class = classification_data['category']

# Transform text data to numeric (Bag of Words)
vectorizer = CountVectorizer()
X_class = vectorizer.fit_transform(X_text)

# Split data
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

In [None]:
# Load and preprocess regression data
regression_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/Delhi_v2.csv")
X_reg = regression_data.drop(columns=["price", "Address", "desc"])
y_reg = regression_data["price"]

# Handle missing values only for numeric columns
numeric_columns = X_reg.select_dtypes(include=['float64', 'int64']).columns
X_reg[numeric_columns] = SimpleImputer(strategy='mean').fit_transform(X_reg[numeric_columns])

# One-hot encode categorical data (for non-numeric columns)
X_reg = pd.get_dummies(X_reg, drop_first=True)

# Now, continue with train-test split
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)


In [None]:
# Classification Baseline
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_class, y_train_class)
y_pred_class = clf.predict(X_test_class)

print("Classification Baseline:")
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
print(classification_report(y_test_class, y_pred_class))

Classification Baseline:
Accuracy: 0.8132427843803056
                                        precision    recall  f1-score   support

                            Accounting       0.50      0.22      0.31         9
       Administration & Office Support       0.79      0.89      0.84       436
             Advertising, Arts & Media       0.33      0.08      0.13        12
          Banking & Financial Services       0.68      0.75      0.71       208
              CEO & General Management       0.70      0.70      0.70        10
        Call Centre & Customer Service       0.62      0.37      0.46        35
                          Construction       0.85      0.78      0.81        85
                 Consulting & Strategy       0.56      0.42      0.48        24
                 Design & Architecture       0.76      0.76      0.76        17
                           Engineering       0.25      0.33      0.29         3
                  Healthcare & Medical       0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Регресс

In [None]:
# Regression Baseline
reg = GradientBoostingRegressor(random_state=42)
reg.fit(X_train_reg, y_train_reg)
y_pred_reg = reg.predict(X_test_reg)

print("Regression Baseline:")
print("Mean Squared Error:", mean_squared_error(y_test_reg, y_pred_reg))
print("Mean Absolute Error:", mean_absolute_error(y_test_reg, y_pred_reg))
print("R2 Score:", r2_score(y_test_reg, y_pred_reg))

Regression Baseline:
Mean Squared Error: 402883666920.3741
Mean Absolute Error: 270645.76738881896
R2 Score: 0.9932041399047439


Улучшение классификатора

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Использование TfidfVectorizer для ускорения работы и повышения точности
print("\nImproving Classification Baseline:")
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Ограничиваем количество признаков
X_class_tfidf = tfidf_vectorizer.fit_transform(X_text)

X_class_tfidf_dense = X_class_tfidf.toarray()

X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class_tfidf_dense, y_class, test_size=0.2, random_state=42
)

param_dist_class = {
    'max_iter': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5]
}

clf_rs = RandomizedSearchCV(HistGradientBoostingClassifier(random_state=42),
                            param_distributions=param_dist_class, n_iter=5, cv=2, random_state=42, n_jobs=-1)
clf_rs.fit(X_train_class, y_train_class)

print("Best Parameters for Classification:", clf_rs.best_params_)
y_pred_class = clf_rs.predict(X_test_class)
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
print(classification_report(y_test_class, y_pred_class))



Improving Classification Baseline:




Best Parameters for Classification: {'max_iter': 100, 'max_depth': 3, 'learning_rate': 0.05}
Accuracy: 0.7775891341256367
                                        precision    recall  f1-score   support

                            Accounting       0.33      0.11      0.17         9
       Administration & Office Support       0.74      0.88      0.81       436
             Advertising, Arts & Media       0.00      0.00      0.00        12
          Banking & Financial Services       0.60      0.75      0.67       208
              CEO & General Management       0.67      0.60      0.63        10
        Call Centre & Customer Service       0.53      0.51      0.52        35
                          Construction       0.87      0.69      0.77        85
                 Consulting & Strategy       0.67      0.17      0.27        24
                 Design & Architecture       0.38      0.18      0.24        17
                           Engineering       0.50      0.33      0.40        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Улучшение реггрессора

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

print("\nImproving Regression Baseline:")

# Масштабирование данных
scaler = StandardScaler()
X_train_reg_scaled = scaler.fit_transform(X_train_reg)
X_test_reg_scaled = scaler.transform(X_test_reg)

# Сокращаем параметры для RandomizedSearchCV
param_dist_reg = {
    'n_estimators': [50, 100],  # Меньше количество деревьев
    'learning_rate': [0.05, 0.1],  # Меньше вариантов для learning_rate
    'max_depth': [3, 5]  # Меньше вариантов для max_depth
}

# Используем RandomizedSearchCV для ускорения поиска
reg_rs = RandomizedSearchCV(GradientBoostingRegressor(random_state=42),
                             param_distributions=param_dist_reg, n_iter=5, cv=2, random_state=42, n_jobs=-1)
reg_rs.fit(X_train_reg_scaled, y_train_reg)

print("Best Parameters for Regression:", reg_rs.best_params_)
y_pred_reg = reg_rs.predict(X_test_reg_scaled)

print("Mean Squared Error:", mean_squared_error(y_test_reg, y_pred_reg))
print("R2 Score:", r2_score(y_test_reg, y_pred_reg))



Improving Regression Baseline:


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Regression: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}
Mean Squared Error: 407908432921.0759
R2 Score: 0.9931193819223387


In [None]:
class CustomGradientBoosting:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        self.init_prediction = None

    def fit(self, X, y):
        self.init_prediction = np.mean(y)
        residuals = y - self.init_prediction

        for _ in range(self.n_estimators):
            model = GradientBoostingRegressor(
                n_estimators=1, learning_rate=self.learning_rate, max_depth=self.max_depth, random_state=42
            )
            model.fit(X, residuals)
            self.models.append(model)
            residuals -= self.learning_rate * model.predict(X)

    def predict(self, X):
        predictions = np.full(X.shape[0], self.init_prediction)
        for model in self.models:
            predictions += self.learning_rate * model.predict(X)
        return predictions

In [None]:
# Train and evaluate custom gradient boosting for regression
custom_gb = CustomGradientBoosting(n_estimators=50, learning_rate=0.1, max_depth=3)
custom_gb.fit(X_train_reg, y_train_reg)
y_pred_custom = custom_gb.predict(X_test_reg)

print("Custom Gradient Boosting for Regression:")
print("Mean Squared Error:", mean_squared_error(y_test_reg, y_pred_custom))
print("Mean Absolute Error:", mean_absolute_error(y_test_reg, y_pred_custom))
print("R2 Score:", r2_score(y_test_reg, y_pred_custom))

Custom Gradient Boosting for Regression:
Mean Squared Error: 27474434588463.695
Mean Absolute Error: 3039431.512257809
R2 Score: 0.5365599824716487


In [None]:
from sklearn.preprocessing import LabelEncoder

# Обработка целевых меток для классификации
label_encoder = LabelEncoder()
y_class_encoded = label_encoder.fit_transform(y_class)

# Разделение данных на тренировочные и тестовые
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class_encoded, test_size=0.2, random_state=42
)

# Обучение и оценка пользовательской модели градиентного бустинга
custom_gb_clf = CustomGradientBoosting(n_estimators=50, learning_rate=0.1, max_depth=3)
custom_gb_clf.fit(X_train_class.toarray(), y_train_class.astype(float))
y_pred_custom_class = custom_gb_clf.predict(X_test_class.toarray())

# Преобразование прогнозов в метки классов
threshold = 0.5
custom_class_labels = np.where(y_pred_custom_class > threshold, 1, 0)

print("Custom Gradient Boosting for Classification:")
print("Accuracy:", accuracy_score(y_test_class, custom_class_labels))
print(classification_report(y_test_class, custom_class_labels))


Custom Gradient Boosting for Classification:
Accuracy: 0.24674589700056593
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.25      1.00      0.40       436
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00       208
           4       0.00      0.00      0.00        10
           5       0.00      0.00      0.00        35
           6       0.00      0.00      0.00        85
           7       0.00      0.00      0.00        24
           8       0.00      0.00      0.00        17
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         3
          13       0.00      0.00      0.00       366
          14       0.00      0.00      0.00       149
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00        25
          17       0.00      0.00      0.00       165
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
