## Задача 1

Реализовать класс для работы с линейной регрессией

In [33]:
import pandas as pd
import numpy as np

class MyLinearRegression:
    """
    Parameters
    ----------
    regularization : {None, 'l1', 'l2', 'l1l2'}, default=None
        Какую регуляризацию добавить к модели. Если значение `None`, то без регуляризации.

    weight_calc : {'matrix', 'gd', 'sgd'}, default='matrix'
        Каким образом вычислять вектор весов: матрично ('matrix'), градиентным спуском ('gd') или стохастическим градиентным спуском ('sgd'). При этом, при 'l1' или 'l1l2' нельзя использовать параметр 'matrix'.

    Attributes
    ----------
    coefs_ : Вектор коэффициентов размера (p, 1), где p — количество признаков.
    intercept_ : Значение коэффициента, отвечающего за смещение
    """

    def __init__(self, regularization=None, weight_calc='matrix', lambda_1=None, lambda_2=None, batch_size=20, random_state=42, learning_rate=0.01):
        if regularization not in [None, 'l1', 'l2', 'l1l2']:
            raise TypeError(f"Параметр regularization не может принимать значение '{regularization}'")
        if weight_calc not in ['matrix', 'gd', 'sgd']:
            raise TypeError(f"Параметр weight_calc не может принимать значение '{weight_calc}'")
        if regularization in ['l1', 'l1l2'] and lambda_1 is None:
            raise TypeError(f"Значение коэффициента регулризации l1 не задано")
        if regularization in ['l2', 'l1l2'] and lambda_2 is None:
            raise TypeError(f"Значение коэффициента регулризации l2 не задано")
        
        self.regularization = regularization
        self.weight_calc = weight_calc
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.batch_size = batch_size
        self.random_state = random_state
        self.learning_rate = learning_rate


        self.coefs_ = None
        self.intercept_ = None

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
      X = X.values
      y = y.values
      X_with_intercept = np.concatenate(([[1]]*X.shape[0], X), axis=1)
      np.random.seed(self.random_state)
      theta = np.random.uniform(-0.01, 0.01, X_with_intercept.shape[1])

      if self.weight_calc == 'matrix':
        if self.regularization == 'l2':
          indentity = np.eye(X_with_intercept.shape[1])
          indentity[0, 0] = 0
          theta = np.linalg.inv(X_with_intercept.T @ X_with_intercept + self.lambda_2*indentity) @ X_with_intercept.T @ y
        else:
          theta = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ X_with_intercept.T @ y

      else:
        for i in range(10000):
          if self.weight_calc == 'gd':
            gradient = -(2/X_with_intercept.shape[0])*X_with_intercept.T @ (y - X_with_intercept @ theta)
            if self.regularization in ['l1', 'l1l2']:
              gradient[1:] += self.lambda_1*np.sign(theta[1:])
            if self.regularization in ['l2', 'l1l2']:
              gradient[1:] += 2*self.lambda_2*theta[1:]
            if np.linalg.norm(gradient) < 1e-4:
              break
            theta -= self.learning_rate*gradient

          elif self.weight_calc == 'sgd':
            stop = False
            indices = np.random.permutation(X_with_intercept.shape[0])
            X_shuffled = X_with_intercept[indices]
            y_shuffled = y[indices]
            for j in range(0, X_with_intercept.shape[0]-self.batch_size, self.batch_size):
              gradient = -(2/self.batch_size)*X_shuffled[j:j+self.batch_size].T @ (y_shuffled[j:j+self.batch_size] - X_shuffled[j:j+self.batch_size] @ theta)
              if self.regularization in ['l1', 'l1l2']:
                gradient[1:] += self.lambda_1*np.sign(theta[1:])
              if self.regularization in ['l2', 'l1l2']:
                gradient[1:] += 2*self.lambda_2*theta[1:]
              if np.linalg.norm(gradient) < 1e-4:
                stop = True
                break
              theta -= self.learning_rate*gradient
            if stop:
              break

      self.intercept_ = theta[0]
      self.coefs_ = theta[1:]
      return self


    def predict(self, X: np.array, ss=True):
      if self.coefs_ is None:
        raise ValueError("Модель должна быть обучена перед предсказанием")
      return X @ self.coefs_ + self.intercept_


    def score(self, X: np.array, y: np.array):
      ss_res = sum((y - self.predict(X))**2)
      ss_tot = sum((y - np.mean(y))**2)
      return 1 - (ss_res/ss_tot)

Используя датасет про автомобили (целевой признак — price), сравнить (качество, скорость обучения и предсказания, важность признаков) модели `MyLinearRegression` с различными гиперпараметрами, сделать выводы. На этом же датасете сравнить модель `MyLinearRegression` с библиотечной реализацией из `sklearn`, составить таблицу(ы) (графики) результатов сравнения (качество, скорость обучения и предсказания, важность признаков).

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
from sklearn.metrics import mean_absolute_error

In [35]:
df = pd.read_csv('Used_fiat_500_in_Italy_dataset.csv')
df

Unnamed: 0,model,engine_power,transmission,age_in_days,km,previous_owners,lat,lon,price
0,pop,69,manual,4474,56779,2,45.071079,7.46403,4490
1,lounge,69,manual,2708,160000,1,45.069679,7.70492,4500
2,lounge,69,automatic,3470,170000,2,45.514599,9.28434,4500
3,sport,69,manual,3288,132000,2,41.903221,12.49565,4700
4,sport,69,manual,3712,124490,2,45.532661,9.03892,4790
...,...,...,...,...,...,...,...,...,...
375,lounge,69,manual,4474,55976,2,45.610050,9.24234,5500
376,lounge,69,manual,4200,134717,1,44.102020,9.82024,5500
377,lounge,69,manual,3470,113344,1,41.003799,16.87294,5500
378,pop,69,automatic,3712,130000,1,45.810501,8.96474,5500


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   model            380 non-null    object 
 1   engine_power     380 non-null    int64  
 2   transmission     380 non-null    object 
 3   age_in_days      380 non-null    int64  
 4   km               380 non-null    int64  
 5   previous_owners  380 non-null    int64  
 6   lat              380 non-null    float64
 7   lon              380 non-null    float64
 8   price            380 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 26.8+ KB


In [37]:
# Обработка нечисловых признаков:
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,engine_power,age_in_days,km,previous_owners,lat,lon,price,model_pop,model_sport,model_star,transmission_manual
0,69,4474,56779,2,45.071079,7.46403,4490,True,False,False,True
1,69,2708,160000,1,45.069679,7.70492,4500,False,False,False,True
2,69,3470,170000,2,45.514599,9.28434,4500,False,False,False,False
3,69,3288,132000,2,41.903221,12.49565,4700,False,True,False,True
4,69,3712,124490,2,45.532661,9.03892,4790,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
375,69,4474,55976,2,45.610050,9.24234,5500,False,False,False,True
376,69,4200,134717,1,44.102020,9.82024,5500,False,False,False,True
377,69,3470,113344,1,41.003799,16.87294,5500,False,False,False,True
378,69,3712,130000,1,45.810501,8.96474,5500,True,False,False,False


In [38]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['price'], axis=1), df['price'], test_size=0.2, random_state=42)

In [39]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

### MyLinearRegression

In [40]:
res = dict()
importance_of_features = dict()

params = [{'name': 'MyLR matrix', 'weight_calc':'matrix'},
              {'name': 'MyLR_gd', 'weight_calc':'gd'},
              {'name': 'MyLR_matrix l2', 'weight_calc': 'matrix', 'regularization': 'l2', 'lambda_2': 10},
              {'name': 'MyLR_gd_l1', 'weight_calc':'gd', 'regularization': 'l1', 'lambda_1': 10},
              {'name': 'MyLR_gd_l2', 'weight_calc':'gd', 'regularization': 'l2', 'lambda_2': 0.1},
              {'name': 'MyLR_gd_l1l2', 'weight_calc':'gd', 'regularization': 'l1l2', 'lambda_1': 0.1, 'lambda_2': 0.1},
              {'name': 'MyLR_sgd_l1', 'weight_calc':'sgd', 'regularization': 'l1', 'lambda_1': 1},
              {'name': 'MyLR_sgd_l2', 'weight_calc':'sgd', 'regularization': 'l2', 'lambda_2': 0.01},
              {'name': 'MyLR_sgd_l1l2', 'weight_calc':'sgd', 'regularization': 'l1l2', 'lambda_1': 0.1, 'lambda_2': 0.01}]

for param in params:
  name_model = param['name']
  args = {k: v for k,v in param.items() if k!='name'}

  start_t_fit = time.time()
  MyLR = MyLinearRegression(**args).fit(X_train, y_train)
  end_t_fit = time.time()

  start_t_pred = time.time()
  y_pred = MyLR.predict(X_test.values)
  end_t_pred = time.time()

  res[name_model] = [mean_absolute_error(y_test, y_pred), end_t_fit-start_t_fit, end_t_pred-start_t_pred]
  importance_of_features[name_model] = MyLR.coefs_

### Готовая библиотека sklearn

In [41]:
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso, SGDRegressor
from sklearn.metrics import r2_score
import time

start_t_fit = time.time()
sklearn_LR = LinearRegression().fit(X_train, y_train)
end_t_fit = time.time()

start_t_pred = time.time()
y_pred = sklearn_LR.predict(X_test)
end_t_pred = time.time()

res['sklearn LinReg'] = [mean_absolute_error(y_test, y_pred), end_t_fit-start_t_fit, end_t_pred-start_t_pred]
importance_of_features['sklearn LR'] = sklearn_LR.coef_

In [42]:
res_df = pd.DataFrame.from_dict(res, orient='index', columns=['MAE', 'train_time', 'pred_time'])
res_df

Unnamed: 0,MAE,train_time,pred_time
MyLR matrix,559.22439,0.000255,1e-05
MyLR_gd,559.224401,0.023873,2.2e-05
MyLR_matrix l2,561.871535,0.00013,5e-06
MyLR_gd_l1,560.155268,0.090621,1.6e-05
MyLR_gd_l2,575.200759,0.017808,1.5e-05
MyLR_gd_l1l2,575.23842,0.021459,1.8e-05
MyLR_sgd_l1,559.231976,1.225037,1.5e-05
MyLR_sgd_l2,559.287599,1.07464,1.3e-05
MyLR_sgd_l1l2,559.305299,1.237942,1.4e-05
sklearn LinReg,559.22439,0.000954,0.000343


## Задача 2

[Соревнование на Kaggle](https://kaggle.com/competitions/yadro-regression-2025)

Решение: https://colab.research.google.com/drive/1rjnG2mJcKHybON0hPAlzPd8LqRChFVP8?usp=sharing