In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Загрузка данных для регрессии
regression_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/Delhi_v2.csv")

# Удаление ненужных столбцов
X_reg = regression_data.drop(columns=["price", "Address", "desc"])  # Удаляем целевую переменную и текстовые столбцы
y_reg = regression_data["price"]  # Целевая переменная

# Преобразуем категориальные данные в числовые
X_reg = pd.get_dummies(X_reg, drop_first=True)

# Обработка пропущенных значений
imputer = SimpleImputer(strategy="mean")  # Замена NaN на среднее значение
X_reg = imputer.fit_transform(X_reg)

# Разделение данных на обучающую и тестовую выборки
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Загрузка данных для классификации
classification_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/jobDs.csv")

# Предварительная обработка данных
classification_data = classification_data.dropna()  # Удаление строк с пропущенными значениями
X_text = classification_data['job_title']  # Признаки (текстовые данные)
y_class = classification_data['category']  # Целевая переменная

# Преобразование текстовых данных в числовые (Bag of Words)
vectorizer = CountVectorizer()
X_class = vectorizer.fit_transform(X_text)

# Разделение данных на обучающую и тестовую выборки
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Обучение линейной модели классификации
log_reg = LogisticRegression(max_iter=5, random_state=42)
log_reg.fit(X_train_class, y_train_class)

# Предсказания
y_pred_class = log_reg.predict(X_test_class)

# Оценка качества
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
print(classification_report(y_test_class, y_pred_class))


Accuracy: 0.6644029428409735
                                        precision    recall  f1-score   support

                            Accounting       0.00      0.00      0.00         9
       Administration & Office Support       0.66      0.89      0.76       436
             Advertising, Arts & Media       0.00      0.00      0.00        12
          Banking & Financial Services       0.36      0.71      0.48       208
              CEO & General Management       0.00      0.00      0.00        10
        Call Centre & Customer Service       0.00      0.00      0.00        35
                          Construction       0.73      0.51      0.60        85
                 Consulting & Strategy       0.00      0.00      0.00        24
                 Design & Architecture       0.00      0.00      0.00        17
                           Engineering       0.00      0.00      0.00         3
                  Healthcare & Medical       0.00      0.00      0.00         3
         H

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Обучение линейной модели регрессии
lin_reg = LinearRegression()
lin_reg.fit(X_train_reg, y_train_reg)

# Предсказания
y_pred_reg = lin_reg.predict(X_test_reg)

# Оценка качества
print("MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
print("MSE:", mean_squared_error(y_test_reg, y_pred_reg))
print("R^2:", r2_score(y_test_reg, y_pred_reg))


MAE: 730607341542.4445
MSE: 7.44655400559215e+26
R^2: -12560881308641.113


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Параметры
pipeline_class = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),  # Отключаем центрирование
    ('log_reg', LogisticRegression(max_iter=5, random_state=42, n_jobs=-1))  # Параллельная обработка
])

# Подбор гиперпараметров
param_distributions = {
    'log_reg__C': loguniform(0.01, 100),  # Логарифмическое распределение
    'log_reg__solver': ['liblinear', 'saga'],  # Решатели
    'log_reg__penalty': ['l2', 'elasticnet'],  # Виды регуляризации
    'log_reg__max_iter': [1, 2, 3]  # Большее число итераций
}

# Разбиение
skf = StratifiedKFold(n_splits=3)

# Поиск
random_search_class = RandomizedSearchCV(
    pipeline_class,
    param_distributions=param_distributions,
    n_iter=5,  # Увеличьте количество итераций
    cv=skf,
    scoring='accuracy',  # Для точности
    random_state=42,
    n_jobs=-1  # Использование всех ядер
)

# Обучение и предсказания
random_search_class.fit(X_train_class, y_train_class)
y_pred_class_improved = random_search_class.best_estimator_.predict(X_test_class)

# Оценка
print("Improved Accuracy:", accuracy_score(y_test_class, y_pred_class_improved))
print(classification_report(y_test_class, y_pred_class_improved))

6 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kw

Improved Accuracy: 0.8160724391624222
                                        precision    recall  f1-score   support

                            Accounting       0.00      0.00      0.00         9
       Administration & Office Support       0.83      0.86      0.85       436
             Advertising, Arts & Media       0.50      0.08      0.14        12
          Banking & Financial Services       0.81      0.78      0.80       208
              CEO & General Management       0.80      0.40      0.53        10
        Call Centre & Customer Service       0.50      0.54      0.52        35
                          Construction       0.81      0.80      0.80        85
                 Consulting & Strategy       0.47      0.29      0.36        24
                 Design & Architecture       0.78      0.82      0.80        17
                           Engineering       0.50      0.33      0.40         3
                  Healthcare & Medical       0.00      0.00      0.00         3
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Загрузка данных для регрессии
regression_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AiDatasets/Delhi_v2.csv")

# Удаление ненужных столбцов
X_reg = regression_data.drop(columns=["price", "Address", "desc"])  # Удаляем целевую переменную и текстовые столбцы
y_reg = regression_data["price"]  # Целевая переменная

# Преобразуем категориальные данные в числовые
X_reg = pd.get_dummies(X_reg, drop_first=True)

# Обработка пропущенных значений
imputer = SimpleImputer(strategy="mean")  # Замена NaN на среднее значение
X_reg = imputer.fit_transform(X_reg)

# Разделение данных на обучающую и тестовую выборки
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Создание пайплайна для масштабирования данных и применения модели
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Масштабирование данных
    ('regressor', LinearRegression())  # Линейная регрессия
])

# Настройка гиперпараметров для линейной регрессии (пока без изменений)
param_grid = {
    'regressor': [LinearRegression(), RandomForestRegressor(n_estimators=100, random_state=42)]  # Пробуем линейную регрессию и случайный лес
}

# Поиск по сетке с кросс-валидацией для выбора лучшего алгоритма
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели
grid_search.fit(X_train_reg, y_train_reg)

# Предсказания
y_pred_reg = grid_search.best_estimator_.predict(X_test_reg)

# Оценка качества модели
print("Best Model:", grid_search.best_estimator_)
print("MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
print("MSE:", mean_squared_error(y_test_reg, y_pred_reg))
print("R^2:", r2_score(y_test_reg, y_pred_reg))



1 fits failed out of a total of 6.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwa

Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor', RandomForestRegressor(random_state=42))])
MAE: 130127.39018087856
MSE: 363042385555.55554
R^2: 0.9938761844585494


In [None]:
class CustomLinearClassifier:
    def __init__(self, lr=0.01, n_iter=1000):
        self.lr = lr
        self.n_iter = n_iter

    def fit(self, X, y):
        self.theta = np.zeros(X.shape[1])
        for _ in range(self.n_iter):
            gradients = -np.dot(X.T, (y - self.predict_proba(X))) / len(y)
            self.theta -= self.lr * gradients

    def predict_proba(self, X):
        return 1 / (1 + np.exp(-np.dot(X, self.theta)))

    def predict(self, X):
        return (self.predict_proba(X) > 0.5).astype(int)


In [None]:
class CustomLinearRegressor:
    def __init__(self, lr=0.01, n_iter=1000):
        self.lr = lr
        self.n_iter = n_iter

    def fit(self, X, y):
        self.theta = np.zeros(X.shape[1])
        for _ in range(self.n_iter):
            gradients = -2 * np.dot(X.T, (y - self.predict(X))) / len(y)
            self.theta -= self.lr * gradients

    def predict(self, X):
        return np.dot(X, self.theta)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Преобразование строковых меток в числовые
label_encoder = LabelEncoder()
y_train_class = label_encoder.fit_transform(y_train_class)  # Преобразуем метки в числовые значения
y_test_class = label_encoder.transform(y_test_class)

# Применение кастомного классификатора
custom_classifier = CustomLinearClassifier(lr=0.01, n_iter=5)
custom_classifier.fit(X_train_class_array, y_train_class)  # Обучение модели

# Предсказания
y_pred_class = custom_classifier.predict(X_test_class_array)

# Оценка качества
accuracy = accuracy_score(y_test_class, y_pred_class)
print("Accuracy (Custom Classifier):", accuracy)

Accuracy (Custom Classifier): 0.24674589700056593


In [None]:
custom_regressor = CustomLinearRegressor(lr=0.01, n_iter=5)
custom_regressor.fit(X_train_reg, y_train_reg)  # X_reg в виде массива

# Предсказания
y_pred_reg = custom_regressor.predict(X_test_reg)

# Оценка качества
print("MAE (Custom Regressor):", mean_absolute_error(y_test_reg, y_pred_reg))
print("MSE (Custom Regressor):", mean_squared_error(y_test_reg, y_pred_reg))
print("R^2 (Custom Regressor):", r2_score(y_test_reg, y_pred_reg))

MAE (Custom Regressor): 1.203808642225425e+37
MSE (Custom Regressor): 1.644749951908677e+74
R^2 (Custom Regressor): -2.7743717312470994e+60
