Горностаев Александр, ML-11

In [1]:
# at first, mount the drive

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from scipy.stats import rv_discrete

from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import make_column_transformer
import numpy as np
from sklearn.decomposition import PCA

In [4]:
class DiscreteDistributionImputer():
  """Impute NaNs by sampling from each column's empirical discrete distribution.

  ``fit`` records, for every column, the observed (non-NaN) values together
  with their relative frequencies. ``transform`` replaces every NaN with a
  random draw from the distribution of its column. Only ``np.nan`` is treated
  as a missing value.

  Parameters
  ----------
  random_state : int or None, default 42
    Seed for the sampling. Every ``transform`` call starts from this seed, so
    repeated calls on the same data return the same result.
  """

  def __init__(self, random_state=42):
    # one frozen rv_discrete per fitted column, in column order
    self.distributions = []
    # fixes the randomness so results can be re-checked
    self.random_state = random_state

  @staticmethod
  def _to_float_matrix(X):
    """Return a float copy of X (DataFrame or ndarray) as a 2-D array."""
    if isinstance(X, pd.DataFrame):
      values = X.values
    elif isinstance(X, np.ndarray):
      values = X
    else:
      raise TypeError("given X is neither DataFrame nor ndarray")

    # cast to float because only floats support np.nan; np.array copies,
    # so the caller's data is never modified
    matrix = np.array(values, dtype=float)
    if matrix.ndim != 2:
      raise ValueError("expected 2d input, got {}d".format(matrix.ndim))
    return matrix

  def fit(self, X, y=None):
    """Estimate the value distribution of every column of X.

    Raises TypeError for unsupported containers and ValueError when the
    input is not 2-D or a column contains no observed value at all.
    """
    matrix = self._to_float_matrix(X)

    distributions = []
    for position, col in enumerate(matrix.T):
      # must consider only not nans
      observed = col[~np.isnan(col)]
      if observed.size == 0:
        raise ValueError(
            "column {} has no observed values to sample from".format(position))
      unique_vals, counts = np.unique(observed, return_counts=True)
      probabilities = counts / counts.sum()
      distributions.append(rv_discrete(values=(unique_vals, probabilities)))

    # replace the previous state only after every column succeeded
    self.distributions = distributions
    return self

  def fit_transform(self, X, y=None):
    """Fit on X, then return X with its NaNs imputed."""
    return self.fit(X, y).transform(X)

  def transform(self, X):
    """Return a float ndarray copy of X whose NaNs are replaced by samples."""
    matrix = self._to_float_matrix(X)
    if matrix.shape[1] != len(self.distributions):
      raise ValueError(
          "X has {} columns but the imputer was fitted on {}".format(
              matrix.shape[1], len(self.distributions)))

    rng = np.random.RandomState(self.random_state)
    # rows of matrix.T are views, so writing into col updates matrix
    for col, distribution in zip(matrix.T, self.distributions):
      nan_cells_mask = np.isnan(col)
      missing = int(nan_cells_mask.sum())
      if missing:
        col[nan_cells_mask] = distribution.rvs(size=missing, random_state=rng)

    return matrix

  def get_params(self, deep=True):
    """Return the constructor parameters (scikit-learn estimator protocol)."""
    return {'random_state': self.random_state}

  def set_params(self, **kwargs):
    """Set constructor parameters; raises ValueError for unknown names."""
    for key, value in kwargs.items():
      if key != 'random_state':
        raise ValueError("invalid parameter {!r}".format(key))
      self.random_state = value
    return self

In [5]:
# For every position in idx, find the closest value among the feature's unique
# values (labels) and overwrite the raw imputer output with it, in place.
# On a tie the label that comes first in `labels` wins. Returns nothing.
def dist(data, labels, idx):
  for row in idx:
    nearest = min(range(len(labels)), key=lambda pos: abs(data[row] - labels[pos]))
    data[row] = labels[nearest]

In [None]:
from sklearn.impute import KNNImputer
class NaNImputer(BaseEstimator, TransformerMixin):
    """Impute NaNs with KNN, then snap discrete features to observed values.

    KNNImputer averages the neighbours, so on discrete or binary features it
    can produce in-between values. After imputation, every row that had a NaN
    gets its discrete columns replaced by the closest value actually observed
    in that column.

    Parameters
    ----------
    *discrete_columns_with_nans : str
        Names of the discrete columns to snap. When none are given, the
        columns ``g_lift``, ``build_tech`` and ``metro_dist`` are used.

    NOTE(review): scikit-learn's ``BaseEstimator.get_params`` rejects varargs
    constructors, so ``clone`` and grid search are likely to fail with this
    signature - confirm before using the class in such tooling.
    """

    DEFAULT_DISCRETE_COLUMNS = ('g_lift', 'build_tech', 'metro_dist')

    def __init__(self, *discrete_columns_with_nans):
        self.columns_with_nans = discrete_columns_with_nans

    def fit(self, X, y=None):
        """Nothing is learned here; the KNN imputer is fitted inside transform."""
        return self

    def transform(self, X):
        """Return a DataFrame like X (same columns and index) without NaNs."""
        discrete_columns = self.columns_with_nans or self.DEFAULT_DISCRETE_COLUMNS

        # positions (not index labels) of the rows containing NaNs, so that the
        # lookup also works when X does not carry a default RangeIndex
        rows_with_nans = np.flatnonzero(X.isnull().any(axis=1).to_numpy())

        # the number of neighbours was chosen empirically
        imputer = KNNImputer(n_neighbors=3, weights='uniform', metric='nan_euclidean')
        filled_df = pd.DataFrame(
            imputer.fit_transform(X.values), columns=X.columns, index=X.index)

        for name in discrete_columns:
            # unique observed values, NaN excluded, in order of appearance
            observed = X[name].dropna().unique()
            if rows_with_nans.size == 0 or observed.size == 0:
                continue
            column = filled_df[name].to_numpy(copy=True)
            raw = column[rows_with_nans]
            # closest observed value per row; ties go to the first value
            nearest = np.abs(raw[:, np.newaxis] - observed[np.newaxis, :]).argmin(axis=1)
            column[rows_with_nans] = observed[nearest]
            filled_df[name] = column

        return filled_df

In [None]:
class DatePreprocessor():
  """Replace every date of a 2-D input by its month number (1-12).

  The transformer is stateless: ``fit`` learns nothing. ``random_state`` is
  accepted but not used; ``min_dates`` is kept as an (empty) attribute.
  """

  def __init__(self, random_state=42):
    self.min_dates = []

  def fit(self, X, y=None):
    """No-op, present for the scikit-learn transformer protocol."""
    return self

  def fit_transform(self, X, y=None):
    """Equivalent to ``transform(X)``."""
    return self.fit(X, y).transform(X)

  def transform(self, X):
    """Return an int ndarray with the shape of X holding the month of each date.

    Raises TypeError when X is neither a DataFrame nor an ndarray and
    ValueError when it is not 2-D.
    """
    if isinstance(X, pd.DataFrame):
      values = X.values
    elif isinstance(X, np.ndarray):
      values = X
    else:
      raise TypeError("given X is neither DataFrame nor ndarray")

    if values.ndim != 2:
      raise ValueError("1d array is not supported, expected 2d")

    months = np.ones(values.shape, dtype=int)
    for index, col in enumerate(values.T):
      # pd.Timestamp accepts datetime64 values, datetimes and date strings
      months[:, index] = [pd.Timestamp(date).month for date in col]

    return months

  def get_params(self, deep=True):
    """The transformer has no tunable parameters."""
    return {}

  def set_params(self, **kwargs):
    """Accepted for protocol compatibility; nothing is stored."""
    return self

In [None]:
class CategoricalFeaturesToIntPreprocessor(BaseEstimator, TransformerMixin):
  """Cast every passed column to int.

  A DataFrame comes back as a DataFrame (same columns and index); any other
  array-like comes back as an ndarray. Raises ValueError when a value cannot
  be represented as int, e.g. when a NaN is encountered.
  """

  def __init__(self):
    pass

  def fit(self, X, y=None):
    """No-op, the transformer is stateless."""
    return self

  def fit_transform(self, X, y=None):
    """Equivalent to ``transform(X)``."""
    return self.fit(X, y).transform(X)

  def transform(self, X):
    """Return X with all values converted to int."""
    if isinstance(X, pd.DataFrame):
      # pandas refuses to cast NaN/inf to int and raises a ValueError subclass
      return X.astype(int)

    values = np.asarray(X)
    # numpy would turn NaN into an arbitrary integer, so reject it explicitly
    if pd.isnull(values).any():
      raise ValueError("cannot cast NaN to int")
    return values.astype(int)

In [None]:
class NanDropper(BaseEstimator, TransformerMixin):
  """Transformer that drops nans (``X.dropna()``)."""

  def __init__(self):
    pass

  def fit(self, X, y=None):
    # stateless: there is nothing to learn
    return self

  def fit_transform(self, X, y=None):
    # fit is a no-op and returns self, so this is transform(X)
    return self.fit(X, y).transform(X)

  def transform(self, X, *y):
    # extra positional arguments are accepted and ignored
    return X.dropna()

In [None]:
def pandarizer(columns, *placed_to_first, **options):
    """Build a FunctionTransformer that wraps its input into a DataFrame.

    columns         -- column labels for the resulting frame
    placed_to_first -- labels moved to the front of ``columns`` (in the given
                       order); use it to mirror the column order produced by a
                       preceding ColumnTransformer
    options         -- ``toint``: iterable of column labels cast to int after
                       the frame is built; other options are ignored
    """
    if placed_to_first:
        columns = names_at_start(columns, *placed_to_first)

    # a missing 'toint' simply means that nothing is cast
    int_columns = list(options.get('toint', ()))

    def framerize(x):
        frame = pd.DataFrame(x, columns=columns)
        for int_column in int_columns:
            frame[int_column] = frame[int_column].astype(int)
        return frame

    return FunctionTransformer(framerize)

def names_at_start(columns, *placed_to_first):
    """Return ``columns`` as a new list with ``placed_to_first`` moved to the front.

    The moved names keep the order in which they were passed; the remaining
    names keep their relative order. ``columns`` may be a list, tuple, pandas
    Index or ndarray and is never modified.

    Raises ValueError when a requested name is not among the columns.
    """
    ordered = columns.tolist() if hasattr(columns, 'tolist') else list(columns)

    # insert in reverse so that the first requested name ends up first
    for name in reversed(placed_to_first):
        if name not in ordered:
            raise ValueError("{} not in list".format(name))
        ordered.remove(name)
        ordered.insert(0, name)
    return ordered

def without(columns, *without_names):
    """Return ``columns`` as a new list with every name in ``without_names`` removed.

    ``columns`` may be a list, tuple, pandas Index or ndarray and is never
    modified. Only the first occurrence of each name is removed.

    Raises ValueError when a name to remove is not among the columns.
    """
    remaining = columns.tolist() if hasattr(columns, 'tolist') else list(columns)

    for name in without_names:
        if name not in remaining:
            raise ValueError("{} not in list".format(name))
        remaining.remove(name)
    return remaining

## Подгрузить данные



In [None]:
# Data locations: Google Drive paths (for Colab) and local copies.
gdrive_path_train = "/content/drive/MyDrive/Технопарк 2021 ML/lecture05/Train.csv"
gdrive_path_test = "/content/drive/MyDrive/Технопарк 2021 ML/lecture05/Test.csv"
local_path_train = "Train.csv"
local_path_test = "Test.csv"

# Training data: parse the date column and drop the first column of the file.
data_train = pd.read_csv(local_path_train, index_col=None)
data_train['date'] = pd.to_datetime(data_train['date'])
data_train.drop(columns=data_train.columns[[0]], inplace=True)

# Test data: same treatment.
data_test = pd.read_csv(local_path_test, index_col=None)
data_test['date'] = pd.to_datetime(data_test['date'])
data_test.drop(columns=data_test.columns[[0]], inplace=True)

In [None]:
# Preview the first rows of the training frame.
data_train.head()

## Поиск модели для предсказания стоимости квартиры

In [None]:
# Hold-out split without shuffling: the last column of data_train is the target,
# all other columns are features.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(data_train.iloc[:, :-1], data_train.iloc[:, -1], shuffle=False)

Относительная погрешность


In [None]:
def relative_error(y_true, y_pred):
  """Return 1 minus the mean absolute relative error of the predictions.

  The error of every sample is taken relative to its own true value, so the
  result is 1.0 for perfect predictions and decreases as the error grows.
  """
  return 1 - (np.abs((y_true - y_pred) / y_true).mean())

### CatBoost 

In [None]:
from catboost import Pool, CatBoostRegressor

In [None]:
# Preprocessing used with CatBoost: dates become month numbers, the discrete
# columns are imputed by sampling, everything else by SimpleImputer; the result
# is turned back into a DataFrame with integer-typed categorical columns.
preprocess_model = make_pipeline(
    make_column_transformer(
        # convert date to months
        (DatePreprocessor(), ['date']),
        # fill nans for columns using their distributions
        (DiscreteDistributionImputer(), ['build_tech', 'g_lift']),
        # fill left nans simply
        remainder=SimpleImputer()
    ),
    # The column transformer returns a bare array whose columns are ordered
    # date, build_tech, g_lift, <remaining columns>; restore the labels in that
    # order and cast the integer-valued features.
    # The column labels come from the training frame (X_labels was never defined).
    pandarizer(
        X_train_train.columns,
        'date', 'build_tech', 'g_lift',
        toint=['date', 'build_tech', 'g_lift', 'street_id', 'floor', 'area', 'rooms', 'balcon', 'n_photos', 'kw2']
    ),
    'passthrough'
)

In [None]:
# Sanity check: fit the preprocessing on the training part and display the result.
preprocess_model.fit_transform(X_train_train)

In [None]:
# CatBoost regressor trained on GPU; MAE is the reported evaluation metric.
model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.5,
    depth=10,
    eval_metric='MAE',
    task_type='GPU',
)

In [None]:
# Full pipeline: preprocessing followed by the regressor.
pipeline_model = make_pipeline(preprocess_model, model)

In [None]:
# Train preprocessing and regressor on the training part of the split.
pipeline_model.fit(X_train_train, y_train_train)

In [None]:
# Predict the held-out part of the split.
y_train_test_pred = pipeline_model.predict(X_train_test)

In [None]:
# Hold-out score of the CatBoost pipeline according to relative_error.
relative_error(y_train_test, y_train_test_pred)

In [None]:
# Hold-out mean absolute error of the CatBoost pipeline.
mean_absolute_error(y_train_test,y_train_test_pred )

Пока качество не очень. Ну что ж, сравню хотя бы, как отрабатывают эта модель и SimpleSolution на тестовых данных в Kaggle.

### LightGBM

In [None]:
# Display the preprocessed training part once more.
# NOTE(review): at this point of the notebook preprocess_model is presumably still
# the pipeline defined for CatBoost - it is redefined in a later cell; confirm.
preprocess_model.fit_transform(X_train_train)

In [None]:
!pip install lightgbm

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Preprocessing for LightGBM: only the date is converted, NaNs are kept as they are.
# Disabled experiments that used to live here as commented-out code: imputing
# build_tech/g_lift with DiscreteDistributionImputer, NanDropper, dropping the
# highly imbalanced kw columns and casting categorical columns to int.
preprocess_model = make_pipeline(
    make_column_transformer(
        # convert date to months
        (DatePreprocessor(), ['date']),
        # every other column is passed through untouched (no imputation)
        remainder='passthrough'
    ),
    # back to a DataFrame; 'date' goes first because the column transformer
    # puts its output before the passed-through columns
    pandarizer(X_train_train.columns, 'date'),
    'passthrough'
)

In [None]:
# Variant of the preprocessing with imputation: sampled values for the discrete
# columns, SimpleImputer for the remaining ones.
# Disabled experiments that used to live here as commented-out code: dropping the
# highly imbalanced kw columns and casting categorical columns to int.
preprocess_model_normal = make_pipeline(
    make_column_transformer(
        # convert date to months
        (DatePreprocessor(), ['date']),
        # fill nans for columns using their distributions
        (DiscreteDistributionImputer(), ['build_tech', 'g_lift']),
        # fill left nans simply
        remainder=SimpleImputer()
    ),
    # back to a DataFrame; date, build_tech and g_lift labels go first after
    # the column transformer conversion
    pandarizer(X_train_train.columns, 'date', 'build_tech', 'g_lift'),
    'passthrough'
)

In [None]:
# Show the feature names of the training part.
X_train_train.columns

In [None]:
# Preprocess the training part; the copy keeps later edits away from the pipeline output.
data_train_prep = preprocess_model.fit_transform(X_train_train).copy()

In [None]:
# Append the target as the last column so that rows containing NaNs are dropped
# from features and target together.
# The Series is converted with to_numpy() first: multi-dimensional indexing such
# as series[:, np.newaxis] is not supported by recent pandas versions.
data_train_prep = pd.DataFrame(np.hstack([data_train_prep, y_train_train.to_numpy()[:, np.newaxis]]))
data_train_prep = data_train_prep.dropna()

In [None]:
# LightGBM with default hyper-parameters; the last column of data_train_prep is
# the target, all other columns are features.
lgbm_model = LGBMRegressor()
lgbm_model.fit(data_train_prep.iloc[:, :-1], data_train_prep.iloc[:, -1])

In [None]:
# Apply the already fitted preprocessing to the hold-out part: validation data
# must only be transformed, never used to re-fit the preprocessing.
predicts = lgbm_model.predict(preprocess_model.transform(X_train_test))

In [None]:
# Hold-out score of LightGBM according to relative_error.
relative_error(y_train_test, predicts)

In [None]:
# Hold-out mean absolute error of LightGBM.
mean_absolute_error(y_train_test, predicts)

## Использование выбранной модели на тесте

In [None]:
# Predict prices for the test set (the frame without the target column).
# NOTE: `pipeline` is built and fitted in the cells at the end of the notebook,
# so those cells have to be executed before this one.
y_test_pred = pipeline.predict(data_test)

In [None]:
# Two-column array (id, price); ids start at 100000 and there is one id per
# prediction, so the shapes always match.
answers = np.hstack((
    np.arange(100000, 100000 + len(y_test_pred))[:, np.newaxis],
    y_test_pred[:, np.newaxis],
))

In [None]:
# Submission frame; ids became floats inside the numpy array, so cast them back.
answers_pd = pd.DataFrame(answers, columns=["id", "price"])
answers_pd['id'] = answers_pd['id'].astype(int)
# set_index returns a new frame: this only displays it, answers_pd is unchanged
answers_pd.set_index('id')

In [None]:
# Write the submission file to Google Drive without the DataFrame index.
answers_pd.to_csv("/content/drive/MyDrive/Технопарк 2021 ML/lecture05/submission.csv", index=False)

In [None]:
# Reload the data from Google Drive: parse the date column and drop the first
# column of each file.
data_train = pd.read_csv("/content/drive/MyDrive/Технопарк 2021 ML/lecture05/Train.csv", index_col=None)
data_train['date'] = pd.to_datetime(data_train['date'])
data_train.drop(columns=data_train.columns[[0]], inplace=True)

data_test = pd.read_csv("/content/drive/MyDrive/Технопарк 2021 ML/lecture05/Test.csv", index_col=None)
data_test['date'] = pd.to_datetime(data_test['date'])
data_test.drop(columns=data_test.columns[[0]], inplace=True)

In [None]:
# Final CatBoost model: optimizes RMSE and reports MAE while training.
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.6,
    depth=6,
    l2_leaf_reg=1,
    loss_function='RMSE',
    eval_metric='MAE',
    verbose=True,
)

In [None]:
# Final pipeline: drop the date, median-impute the three columns that have
# missing values, pass everything else through, then fit the model.
pipeline = Pipeline([
    ('preproc', ColumnTransformer(
        transformers=[
            ('drop', 'drop', ['date']),
            ('impute', SimpleImputer(strategy='median'), ['metro_dist', 'build_tech', 'g_lift']),
        ],
        remainder='passthrough',
    )),
    ('model', model),
])

In [None]:
# Train the final pipeline on the whole training set (last column is the target).
pipeline.fit(data_train.iloc[:, :-1], data_train.iloc[:, -1])

In [None]:
# Predict the hold-out part with the final pipeline.
# NOTE(review): the pipeline was fitted on all of data_train, which presumably
# contains these rows, so the resulting score is optimistic - confirm.
predicts = pipeline.predict(X_train_test)

In [None]:
# `predicts` was computed for X_train_test, so it has to be compared with the
# matching targets (true values first, predictions second).
relative_error(y_train_test, predicts)