In [2]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import ClassifierMixin
import sklearn.utils.validation as uv
from sklearn.metrics import mean_squared_error, balanced_accuracy_score
from sklearn.utils.multiclass import unique_labels
from scipy.stats import mode
from sklearn.base import RegressorMixin
from sklearn.model_selection import train_test_split


In [3]:
class MeanRegressor(RegressorMixin):
    """Baseline regressor that always predicts the mean of the training target."""

    def __init__(self):
        # Filled in by fit(); stays None until then.
        self.mean_ = None

    def fit(self, X=None, y=None):
        """Remember the mean of ``y``.

        ``X`` is accepted for API symmetry but is not read.
        Returns the fitted instance.
        """
        target_mean = y.mean()
        self.mean_ = target_mean
        return self

    def predict(self, X=None):
        """Return a list with the stored mean repeated ``len(X)`` times."""
        n_rows = len(X)
        return [self.mean_ for _ in range(n_rows)]

In [4]:
class MostFrequentClassifier(ClassifierMixin):
    """Baseline classifier that always predicts the most frequent training label."""

    def __init__(self):
        # Filled in by fit(); stays None until then.
        self.mode_ = None

    def fit(self, X=None, y=None):
        """Remember the most frequent value of ``y``.

        ``X`` is accepted for API symmetry but is not read.
        Returns the fitted instance.
        """
        # Depending on the SciPy release / keepdims default, `mode(...).mode`
        # is either a scalar or a length-1 array. Normalise to a scalar so that
        # predict() always yields a flat list of labels.
        self.mode_ = np.asarray(mode(y).mode).ravel()[0]
        return self

    def predict(self, X=None):
        """Return a list with the stored label repeated ``len(X)`` times."""
        return [self.mode_] * len(X)

In [5]:
class CityMeanRegressor(RegressorMixin):
    """Baseline regressor that predicts the mean training target of the row's city.

    The per-city means are learned for every value found in ``X['city']``.
    A city that was not present during ``fit`` falls back to the overall
    training mean instead of being treated as ``'spb'``.
    """

    def __init__(self):
        # Kept for backward compatibility with code that reads these directly.
        self.mean_msk = None
        self.mean_spb = None
        # city -> mean target; filled in by fit().
        self.city_means_ = None
        # Mean over the whole training target; used for unseen cities.
        self.global_mean_ = None

    def fit(self, X=None, y=None):
        """Learn the mean of ``y`` for each distinct value of ``X['city']``.

        ``y`` is grouped by ``X['city']`` (aligned on index when both are
        pandas objects). Returns the fitted instance.
        """
        target = pd.Series(y)
        per_city = target.groupby(X['city']).mean()
        self.city_means_ = {city: per_city.loc[city] for city in per_city.index}
        self.global_mean_ = target.mean()
        # `.get` leaves the attribute as None when the city is absent from the
        # training data rather than raising KeyError.
        self.mean_msk = self.city_means_.get('msk')
        self.mean_spb = self.city_means_.get('spb')
        return self

    def predict(self, X=None):
        """Return a list with one prediction per row of ``X``, in row order."""
        return [self.city_means_.get(city, self.global_mean_) for city in X['city']]

In [6]:
class RubricCityMedianClassifier(ClassifierMixin):
    """Predicts the median training target of the row's (city, modified_rubrics) group.

    The group medians are computed once in ``fit``; ``predict`` only joins
    them onto the rows of ``X``. A (city, modified_rubrics) pair that did not
    occur in the training data yields NaN.
    """

    def __init__(self):
        # Training frame: 'city', 'modified_rubrics' and the target as last column.
        self.mrubs_city_df = None
        # Series of medians indexed by (city, modified_rubrics); set by fit().
        self.medians_ = None

    def fit(self, X=None, y=None):
        """Store the training frame and pre-compute the per-group medians.

        Reads the 'city' and 'modified_rubrics' columns of ``X`` and uses ``y``
        as the target. Returns the fitted instance.
        """
        table = pd.concat([X[['city', 'modified_rubrics']], y], axis=1)
        self.mrubs_city_df = table
        # The target is whatever ended up as the last column after the concat.
        self.medians_ = (
            table.iloc[:, -1]
            .groupby([table['city'], table['modified_rubrics']])
            .median()
        )
        return self

    def f(self, row):
        """Return the stored median for a single row, or NaN for an unseen group."""
        key = (row.loc['city'], row.loc['modified_rubrics'])
        try:
            return self.medians_.loc[key]
        except KeyError:
            return np.nan

    def predict(self, X=None):
        """Return a Series of group medians indexed like ``X``."""
        keys = X[['city', 'modified_rubrics']]
        lookup = self.medians_.rename('_median').reset_index()
        # A left join keeps one output row per input row, in the input order,
        # because the lookup table has one row per (city, modified_rubrics).
        merged = keys.merge(lookup, how='left', on=['city', 'modified_rubrics'])
        return pd.Series(merged['_median'].to_numpy(), index=X.index)

In [7]:
# Directory with the input CSV files, relative to the notebook
# (the Windows-style alternative would be '.\\').
base = './'
data = pd.read_csv(f'{base}organisations.csv')
features = pd.read_csv(f'{base}features.csv')
rubrics = pd.read_csv(f'{base}rubrics.csv')

# Keep only organisations with a known average bill that does not exceed 2500.
clean_data = (
    data
    .dropna(subset=['average_bill'])
    .loc[lambda frame: frame['average_bill'] <= 2500]
)

# Stratify on the target so both parts share the same bill distribution.
clean_data_train, clean_data_test = train_test_split(
    clean_data,
    test_size=0.33,
    random_state=42,
    stratify=clean_data['average_bill'],
)

In [8]:
# Split the cleaned data into the target and everything else.
y = clean_data['average_bill']
X = clean_data.drop(columns='average_bill')

# Mean average bill per city, as a one-column frame.
table = y.to_frame().groupby(X['city']).mean()
print(table)

# Row for 'msk'; shown as the cell output.
mean_msk = table.loc['msk']
mean_msk

      average_bill
city              
msk     792.887230
spb     676.449662


average_bill    792.88723
Name: msk, dtype: float64

In [9]:
from collections import Counter

# Normalise each training row's rubric combination so that the same set of
# ids always produces the same key, regardless of the order in the CSV.
# The ids are sorted as strings (lexicographically): this is the ordering the
# `func` helper below uses as well, so the keys of both stay in sync.
train_rubrics = [sorted(rub.split(' ')) for rub in clean_data_train['rubrics_id']]
train_rubrics_strs = [' '.join(ids) for ids in train_rubrics]

# Number of training organisations per normalised rubric combination.
rubrics_counter = Counter(train_rubrics_strs)


In [10]:
# Same order as `train_rubrics_strs`: keep a combination that occurs at least
# 100 times in the training data, otherwise collapse it into 'other'.
modified_rubrics = pd.Series(
    [rubs if rubrics_counter[rubs] >= 100 else 'other' for rubs in train_rubrics_strs],
    name='modified_rubrics',
)

In [11]:
def func(row):
    """Return the normalised rubric combination of ``row`` or ``'other'``.

    ``row['rubrics_id']`` is split on single spaces, the ids are sorted as
    strings and joined back with spaces. The result is returned when
    ``rubrics_counter`` holds a count of at least 100 for it; combinations
    that are missing from the counter or rarer than that map to ``'other'``.
    """
    rubrics_str = ' '.join(sorted(row['rubrics_id'].split(' ')))
    # A missing key counts as zero occurrences.
    if rubrics_counter.get(rubrics_str, 0) >= 100:
        return rubrics_str
    return 'other'

# Add the collapsed rubric column to both parts; the counts behind `func`
# come from `rubrics_counter` in either case.
for _split in (clean_data_train, clean_data_test):
    _split['modified_rubrics'] = _split.apply(func, axis=1)

clean_data_test

Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id,modified_rubrics
65841,14385912302763770021,spb,1000.0,4.748444,30776 30770 31401,11177 3501618484 10462 3501481355 1509 1416 20...,other
48882,16695436192794975203,msk,500.0,3.793758,30771,3501744275 273469383 3501513153 11617 10462 11...,30771
33711,11841431940065207518,msk,500.0,3.606557,30771 30777,3501773763 3501744275 3501773764 3501618484 15...,other
33544,16028521499441205186,msk,2000.0,4.683841,30776,3501618484 20422 1082283206 11704 11629 21247 ...,30776
35293,12477116204055673498,spb,500.0,4.165394,30776 31401 30770,1524 246 11704 1018 3501618484 2020795524 2124...,other
...,...,...,...,...,...,...,...
55337,9041226080397910513,msk,2500.0,4.408108,30776,11629 11704 10462 11617 3501744275 20424 35017...,30776
64048,14998683880343589209,msk,1000.0,3.555556,30776,273469383 20424 20422 246 1416 11867 11629 104...,30776
22010,1621254442333414922,msk,2000.0,4.402516,30776,273469383 21247 11867 1082283206 20422 246 101...,30776
40089,5620614742257813954,msk,500.0,,30771,11704 1018 273469383 10462 20422,30771


In [12]:
# Constant baselines: only the target is needed.
reg = MeanRegressor().fit(y=clean_data_train['average_bill'])
clf = MostFrequentClassifier().fit(y=clean_data_train['average_bill'])

# Group-based baselines: every column except the target is passed as X.
city_reg = CityMeanRegressor().fit(
    X=clean_data_train.drop(columns='average_bill'),
    y=clean_data_train['average_bill'],
)

rcmc = RubricCityMedianClassifier()
rcmc.fit(
    X=clean_data_train.drop(columns='average_bill'),
    y=clean_data_train['average_bill'],
)

<__main__.RubricCityMedianClassifier at 0x188f2947310>

In [13]:
# RMSE of every baseline on the held-out part.
reg_rmse = np.sqrt(mean_squared_error(clean_data_test['average_bill'], reg.predict(clean_data_test)))
clf_rmse = np.sqrt(mean_squared_error(clean_data_test['average_bill'], clf.predict(clean_data_test)))
reg_city_rmse = np.sqrt(mean_squared_error(clean_data_test['average_bill'], city_reg.predict(clean_data_test.drop(columns='average_bill'))))
rcmc_rmse = np.sqrt(mean_squared_error(clean_data_test['average_bill'], rcmc.predict(clean_data_test.loc[:,clean_data_test.columns != 'average_bill'])))

# balanced_accuracy_score expects (y_true, y_pred). The metric averages recall
# over the classes of y_true, so swapping the arguments changes the result.
clf_bas = balanced_accuracy_score(clean_data_test['average_bill'], clf.predict(clean_data_test))
#print(clf_bas)



In [14]:
# One RMSE per line: mean, most-frequent, per-city mean, per-(city, rubric) median.
print(reg_rmse, clf_rmse, reg_city_rmse, rcmc_rmse, sep='\n')

448.7143889551622
514.7517402382093
445.1063281403263
390.1788175506869


Нужно составить список заведений в обучающей выборке

Далее найти количество заведений из обучающей выборки для каждой комбинации рубрик (значения ``rubrics_id``)

Если это количество меньше 100, добавить other в modified_rubrics. Иначе перенести значение из rubrics_id

In [15]:
# Median average bill for every (city, modified_rubrics) pair of the training data.
table = (
    clean_data_train
    .groupby(['city', 'modified_rubrics'])['average_bill']
    .median()
)
# Spot check of a single group.
table.loc[('msk', '30519')]

np.float64(500.0)

Покручинившись немного, возьмём на вооружение другую идею. Давайте использовать типы заведений!

Но с типами есть некоторая проблема: в столбце ``rubrics_id`` не всегда один идентификатор, часто их несколько, и всего комбинаций довольно много. Чтобы не возиться с малочисленными типами, давайте сольём их в один безликий ``other``.

Итак, добавьте в обучающие и тестовые данные столбец ``modified_rubrics``, в котором будет то же, что и в ``rubrics_id``, если соответствующая комбинация рубрик содержит хотя бы 100 заведений из обучающей (!) выборки, и строка ``other`` в противном случае.

Здесь вам поможет контейнер ``Counter`` из библиотеки ``collections``.

In [16]:
# Preview the first rows of the training part, including the new column.
clean_data_train.head(n=5)

Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id,modified_rubrics
45769,3276960721840719260,msk,500.0,4.5,30770,11704 20422 1018 11177 1416 11867 10462,30770
39061,8452997364765928283,msk,1500.0,4.442623,30774 30776,1415 3501481355 1416 11629 10462 1524 20422 11...,30774 30776
59281,14240408259222214074,spb,1000.0,4.018868,30776 30774,3502045032 11741 3502045016 10462 11704 350177...,30774 30776
51225,15114069072602161053,msk,1500.0,4.364742,31401 30776,3501513153 3501779478 3491142672 273469383 350...,other
29587,2730337118800634815,msk,1000.0,4.698718,30770,21247 10896 3491142672 11629 3501481353 350148...,30770
