# kaggle Study 40일차(Mercari)
코드출처 : https://www.kaggle.com/rumbok/ridge-lb-0-41944

In [1]:
import multiprocessing as mp
import pandas as pd
from time import time
from scipy.sparse import csr_matrix
import os
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import gc
from sklearn.base import BaseEstimator, TransformerMixin
import re
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

In [2]:
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['JOBLIB_START_METHOD'] = 'forkserver'

INPUT_PATH = r'C:/Users/이동훈/Desktop/github/kaggle/kagglestudy/Data/Mercari/'

In [3]:
def dameraulevenshtein(seq1, seq2):
    """
    시퀀스 간 거리를 다메라우-레벤쉬틴으로 계산합니다.

    이 메서드는 원본에서 수정되지 않았습니다.
    출처: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/

    이 거리는 추가, 삭제, 대체 횟수입니다.
    첫 번째 시퀀스를 다음 순서로 변환하는 데 필요한 전환 작업입니다.
    둘째 일반적으로 문자열과 함께 사용되지만 다음과 같은 시퀀스가 있습니다.
    유사한 개체가 작동합니다.

    전환은 *연속*자 교환이며, 기타 모든 문자를 교환합니다.
    운영은 스스로 수행해야 합니다.

    이 구현은 O(N*M) 시간복잡도와 O(M) 공간복잡도를 가집니다. N과 M의 경우
    두 시퀀스의 길이입니다.

    >>  dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    임의 시퀀스에서도 작동합니다.
    >> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # 코드 조각:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # 개념적으로, 이것은 len(seq1) + 1 * len(seq2) + 1 행렬을 기반으로 합니다.
    # 단, 현재 행과 이전 행 2개만 동시에 필요합니다.
    # 그래서 그것들만 저장합니다.
    
    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        # 파이썬 리스트는 네거티브 지수를 중심으로 정리되니까, 그 값을 매기세요.
        # 목록의 *끝*에 있는 가장 왼쪽 열입니다. 이것은 와 일치합니다.
        # 영점 조정 문자열과 추가 계산을 저장합니다.
        twoago, oneago, thisrow = (oneago, thisrow, [0] * len(seq2) + [x + 1])
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # 이 블록은 전환을 처리합니다.
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                    and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]


In [4]:
class SymSpell:
    def __init__(self, max_edit_distance=3, verbose=0):
        self.max_edit_distance = max_edit_distance
        self.verbose = verbose
        # 0: 최상위 제안입니다.
        # 1: 편집 거리가 가장 작은 모든 제안입니다.
        # 2: 모든 제안 = max_edit_distance(초기 종료 없음)

        self.dictionary = {}
        self.longest_word_length = 0

    def get_deletes_list(self, w):
        """"단어가 주어지면 최대 max_edit_distance 문자가 삭제된 문자열을 파생합니다."""

        deletes = []
        queue = [w]
        for d in range(self.max_edit_distance):
            temp_queue = []
            for word in queue:
                if len(word) > 1:
                    for c in range(len(word)):  # character index
                        word_minus_c = word[:c] + word[c + 1:]
                        if word_minus_c not in deletes:
                            deletes.append(word_minus_c)
                        if word_minus_c not in temp_queue:
                            temp_queue.append(word_minus_c)
            queue = temp_queue

        return deletes

    def create_dictionary_entry(self, w):
        '''단어와 단어에서 파생된 삭제 내용을 사전에 추가합니다.'''
        # 단어가 사전에 이미 있는지 확인합니다.
        # 사전 항목은 다음과 같은 형식입니다: (제안된 수정 목록,말뭉치의 빈도)
        
        new_real_word_added = False
        if w in self.dictionary:
            # 말뭉치의 단어 증가 개수입니다.
            self.dictionary[w] = (self.dictionary[w][0], self.dictionary[w][1] + 1)
        else:
            self.dictionary[w] = ([], 1)
            self.longest_word_length = max(self.longest_word_length, len(w))

        if self.dictionary[w][1] == 1:
           # 말뭉치에서 단어가 처음 등장합니다.
            # n.b. 단어는 파생 단어로 이미 사전에 있을 수 있습니다.
            # (실제 단어에서 문자를 추출합니다)
            # 그러나 말뭉치에서 단어의 빈도수는 증가하지 않습니다.
            # 이 경우)
            new_real_word_added = True
            deletes = self.get_deletes_list(w)
            for item in deletes:
                if item in self.dictionary:
                    # 삭제의 수정 목록에 단어를 추가합니다(수정).
                    self.dictionary[item][0].append(w)
                else:
                    # 말뭉치의 단어 빈도는 증가하지 않습니다.
                    self.dictionary[item] = ([w], 0)

        return new_real_word_added

    def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):
        total_word_count = 0
        unique_word_count = 0

        for line in arr:
            # 단어로 구분합니다. 비문자로 구분합니다.
            words = re.findall(token_pattern, line.lower())
            for word in words:
                total_word_count += 1
                if self.create_dictionary_entry(word):
                    unique_word_count += 1

        print("total words processed: %i" % total_word_count)
        print("total unique words in corpus: %i" % unique_word_count)
        print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))
        print("  edit distance for deletions: %i" % self.max_edit_distance)
        print("  length of longest word in corpus: %i" % self.longest_word_length)
        return self.dictionary

    def create_dictionary(self, fname):
        total_word_count = 0
        unique_word_count = 0

        with open(fname) as file:
            for line in file:
                # 단어로 구분합니다. 비문자로 구분합니다.
                words = re.findall('[a-z]+', line.lower())
                for word in words:
                    total_word_count += 1
                    if self.create_dictionary_entry(word):
                        unique_word_count += 1

        print("total words processed: %i" % total_word_count)
        print("total unique words in corpus: %i" % unique_word_count)
        print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))
        print("  edit distance for deletions: %i" % self.max_edit_distance)
        print("  length of longest word in corpus: %i" % self.longest_word_length)
        return self.dictionary

    def get_suggestions(self, string, silent=False):
        """철자가 잘못되었을 수 있는 단어에 대해 제안된 수정 목록을 반환합니다."""
        if (len(string) - self.longest_word_length) > self.max_edit_distance:
            if not silent:
                print("no items in dictionary within maximum edit distance")
            return []

        suggest_dict = {}
        min_suggest_len = float('inf')

        queue = [string]
        q_dictionary = {}  # 저희가 확인한 문자열 이외의 항목입니다.

        while len(queue) > 0:
            q_item = queue[0]  # pop
            queue = queue[1:]

            # 조기 종료
            if ((self.verbose < 2) and (len(suggest_dict) > 0) and
                    ((len(string) - len(q_item)) > min_suggest_len)):
                break

            # 공정대기열항목
            if (q_item in self.dictionary) and (q_item not in suggest_dict):
                if self.dictionary[q_item][1] > 0:
                    # 단어는 사전에 있고, 말뭉치에서 온 단어입니다.
                    # 제안 목록에 아직 없습니다. 제안서에 추가하세요.
                    # 사전, 값이 있는 단어에 의해 색인화됩니다(빈도 입력).
                    # 말뭉치, 거리 편집)
                    # 입력 문자열이 아닌 note q_tword가 더 짧습니다.
                    #은(는) 삭제만 추가되었으므로 입력 문자열보다 큽니다(비공개).
                    # 수동사전 수정사항 추가)
                    assert len(string) >= len(q_item)
                    suggest_dict[q_item] = (self.dictionary[q_item][1],
                                            len(string) - len(q_item))
                    # 조기 종료
                    if (self.verbose < 2) and (len(string) == len(q_item)):
                        break
                    elif (len(string) - len(q_item)) < min_suggest_len:
                        min_suggest_len = len(string) - len(q_item)

                # 사전에 저장된 q_item에 대해 제안된 수정사항(q_item 자체가 유효한 단어인지 아니면 단순히 삭제인지 여부)은 유효한 수정사항이 될 수 있습니다.
                for sc_item in self.dictionary[q_item][0]:
                    if sc_item not in suggest_dict:

                        # 편집 거리를 계산합니다.
                        # 제안 항목은 항상 더 길어야 합니다.
                        # (수동 수정이 추가되지 않는 경우)
                        assert len(sc_item) > len(q_item)

                        # q_items은 더 짧아야 합니다.
                        # 원래 문자열보다 더 많습니다.
                        # (수동 수정이 추가되지 않은 경우)
                        assert len(q_item) <= len(string)

                        if len(q_item) == len(string):
                            assert q_item == string
                            item_dist = len(sc_item) - len(q_item)

                        # 제안 목록의 항목은 문자열 자체와 동일하면 안 됩니다.
                        assert sc_item != string

                        
                        item_dist = dameraulevenshtein(sc_item, string)

                        # 예를 들어, 다메라우-레벤슈틴 거리 같은 공식을 사용하여 편집 거리를 계산합니다.
                        if (self.verbose < 2) and (item_dist > min_suggest_len):
                            pass
                        elif item_dist <= self.max_edit_distance:
                            assert sc_item in self.dictionary # 제안 목록에 있는 경우는 이미 사전에 있어야 합니다.
                            suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
                            if item_dist < min_suggest_len:
                                min_suggest_len = item_dist

                       # 주문단어에 따라 처리되는 단어도 있습니다.
                        # 편집 거리가 다른 경우 에 입력할 수 있습니다.
                        # 제안; 상세할 경우 제안 사전을 잘라냅니다.
                        # 켜지지 않습니다.
                        if self.verbose < 2:
                            suggest_dict = {k: v for k, v in suggest_dict.items() if v[1] <= min_suggest_len}

            # 이제 삭제(예: 문자열의 하위 문자열 또는 삭제)를 생성합니다.
            # 큐 항목에서 가져옵니다.
            # 체크할 추가 항목으로 -- 큐 끝에 추가합니다.
            assert len(string) >= len(q_item)

            # 상세 설정이 켜져 있지 않으면 편집 거리가 더 큰 단어를 추가하지 않습니다.
            if (self.verbose < 2) and ((len(string) - len(q_item)) > min_suggest_len):
                pass
            elif (len(string) - len(q_item)) < self.max_edit_distance and len(q_item) > 1:
                for c in range(len(q_item)):  # character index
                    word_minus_c = q_item[:c] + q_item[c + 1:]
                    if word_minus_c not in q_dictionary:
                        queue.append(word_minus_c)
                        q_dictionary[word_minus_c] = None  # arbitrary value, just to identify we checked this

        # 대기열이 비어 있습니다. 사전의 제안을 출력할 목록으로 변환합니다.
        if not silent and self.verbose != 0:
            print("number of possible corrections: %i" % len(suggest_dict))
            print("  edit distance for deletions: %i" % self.max_edit_distance)

        # 출력 옵션 1입니다.
        # 편집거리와 내림차순으로 결과를 정렬합니다.
        # 빈도 순서 및 제안된 단어 수정 목록만 반환합니다.
        # sorted(suggest_dict, key = lambda x: (suggest_dict[x][1], -suggest_dict[x][0])) 를 반환홥니다.

        # 출력 옵션 2
        # 제안 목록을 (correction,frequency in corpus, edit distance)과 함께 반환합니다.:
        as_list = suggest_dict.items()
        # outlist = sorted(as_list, key=lambda (term, (freq, dist)): (dist, -freq))
        outlist = sorted(as_list, key=lambda x: (x[1][1], -x[1][0]))

        if self.verbose == 0:
            return outlist[0]
        else:
            return outlist

        '''
        Option 1:
        ['file', 'five', 'fire', 'fine', ...]

        Option 2:
        [('file', (5, 0)),
         ('five', (67, 1)),
         ('fire', (54, 1)),
         ('fine', (17, 1))...]  
        '''

    def best_word(self, s, silent=False):
        try:
            return self.get_suggestions(s, silent)[0]
        except:
            return None


In [5]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field, start_time=time()):
        self.field = field
        self.start_time = start_time

    def fit(self, x, y=None):
        return self

    def transform(self, dataframe):
        print(f'[{time()-self.start_time}] select {self.field}')
        dt = dataframe[self.field].dtype
        if is_categorical_dtype(dt):
            return dataframe[self.field].cat.codes[:, None]
        elif is_numeric_dtype(dt):
            return dataframe[self.field][:, None]
        else:
            return dataframe[self.field]

In [7]:
class DropColumnsByDf(BaseEstimator, TransformerMixin):
    def __init__(self, min_df=1, max_df=1.0):
        self.min_df = min_df
        self.max_df = max_df

    def fit(self, X, y=None):
        m = X.tocsc()
        self.nnz_cols = ((m != 0).sum(axis=0) >= self.min_df).A1
        if self.max_df < 1.0:
            max_df = m.shape[0] * self.max_df
            self.nnz_cols = self.nnz_cols & ((m != 0).sum(axis=0) <= max_df).A1
        return self

    def transform(self, X, y=None):
        m = X.tocsc()
        return m[:, self.nnz_cols]

In [6]:
def get_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))


def split_cat(text):
    try:
        cats = text.split("/")
        return cats[0], cats[1], cats[2], cats[0] + '/' + cats[1]
    except:
        print("no category")
        return 'other', 'other', 'other', 'other/other'


def brands_filling(dataset):
    vc = dataset['brand_name'].value_counts()
    brands = vc[vc > 0].index
    brand_word = r"[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+"

    many_w_brands = brands[brands.str.contains(' ')]
    one_w_brands = brands[~brands.str.contains(' ')]

    ss2 = SymSpell(max_edit_distance=0)
    ss2.create_dictionary_from_arr(many_w_brands, token_pattern=r'.+')

    ss1 = SymSpell(max_edit_distance=0)
    ss1.create_dictionary_from_arr(one_w_brands, token_pattern=r'.+')

    two_words_re = re.compile(r"(?=(\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+))")

    def find_in_str_ss2(row):
        for doc_word in two_words_re.finditer(row):
            print(doc_word)
            suggestion = ss2.best_word(doc_word.group(1), silent=True)
            if suggestion is not None:
                return doc_word.group(1)
        return ''

    def find_in_list_ss1(list):
        for doc_word in list:
            suggestion = ss1.best_word(doc_word, silent=True)
            if suggestion is not None:
                return doc_word
        return ''

    def find_in_list_ss2(list):
        for doc_word in list:
            suggestion = ss2.best_word(doc_word, silent=True)
            if suggestion is not None:
                return doc_word
        return ''

    print(f"Before empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")

    n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(
        pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+")
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_name]

    n_desc = dataset[dataset['brand_name'] == '']['item_description'].str.findall(
        pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+")
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_desc]

    n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(pat=brand_word)
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in n_name]

    desc_lower = dataset[dataset['brand_name'] == '']['item_description'].str.findall(pat=brand_word)
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in desc_lower]

    print(f"After empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")

    del ss1, ss2
    gc.collect()


In [8]:
def preprocess_regex(dataset, start_time=time()):
    karats_regex = r'(\d)([\s-]?)(karat|karats|carat|carats|kt)([^\w])'
    karats_repl = r'\1k\4'

    unit_regex = r'(\d+)[\s-]([a-z]{2})(\s)'
    unit_repl = r'\1\2\3'

    dataset['name'] = dataset['name'].str.replace(karats_regex, karats_repl)
    dataset['item_description'] = dataset['item_description'].str.replace(karats_regex, karats_repl)
    print(f'[{time() - start_time}] Karats normalized.')

    dataset['name'] = dataset['name'].str.replace(unit_regex, unit_repl)
    dataset['item_description'] = dataset['item_description'].str.replace(unit_regex, unit_repl)
    print(f'[{time() - start_time}] Units glued.')


def preprocess_pandas(train, test, start_time=time()):
    train = train[train.price > 0.0].reset_index(drop=True)
    print('Train shape without zero price: ', train.shape)

    nrow_train = train.shape[0]
    y_train = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])

    del train
    del test
    gc.collect()

    merge['has_category'] = (merge['category_name'].notnull()).astype('category')
    print(f'[{time() - start_time}] Has_category filled.')

    merge['category_name'] = merge['category_name'] \
        .fillna('other/other/other') \
        .str.lower() \
        .astype(str)
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'], merge['gen_subcat1'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    print(f'[{time() - start_time}] Split categories completed.')

    merge['has_brand'] = (merge['brand_name'].notnull()).astype('category')
    print(f'[{time() - start_time}] Has_brand filled.')

    merge['gencat_cond'] = merge['general_cat'].map(str) + '_' + merge['item_condition_id'].astype(str)
    merge['subcat_1_cond'] = merge['subcat_1'].map(str) + '_' + merge['item_condition_id'].astype(str)
    merge['subcat_2_cond'] = merge['subcat_2'].map(str) + '_' + merge['item_condition_id'].astype(str)
    print(f'[{time() - start_time}] Categories and item_condition_id concancenated.')

    merge['name'] = merge['name'] \
        .fillna('') \
        .str.lower() \
        .astype(str)
    merge['brand_name'] = merge['brand_name'] \
        .fillna('') \
        .str.lower() \
        .astype(str)
    merge['item_description'] = merge['item_description'] \
        .fillna('') \
        .str.lower() \
        .replace(to_replace='No description yet', value='')
    print(f'[{time() - start_time}] Missing filled.')

    preprocess_regex(merge, start_time)

    brands_filling(merge)
    print(f'[{time() - start_time}] Brand name filled.')

    merge['name'] = merge['name'] + ' ' + merge['brand_name']
    print(f'[{time() - start_time}] Name concancenated.')

    merge['item_description'] = merge['item_description'] \
                                + ' ' + merge['name'] \
                                + ' ' + merge['subcat_1'] \
                                + ' ' + merge['subcat_2'] \
                                + ' ' + merge['general_cat'] \
                                + ' ' + merge['brand_name']
    print(f'[{time() - start_time}] Item description concatenated.')

    merge.drop(['price', 'test_id', 'train_id'], axis=1, inplace=True)

    return merge, y_train, nrow_train


In [9]:
def intersect_drop_columns(train: csr_matrix, valid: csr_matrix, min_df=0):
    t = train.tocsc()
    v = valid.tocsc()
    nnz_train = ((t != 0).sum(axis=0) >= min_df).A1
    nnz_valid = ((v != 0).sum(axis=0) >= min_df).A1
    nnz_cols = nnz_train & nnz_valid
    res = t[:, nnz_cols], v[:, nnz_cols]
    return res

In [11]:
if __name__ == '__main__':
    #mp.set_start_method('forkserver', True)

    start_time = time()

    train = pd.read_table(os.path.join(INPUT_PATH, 'train.tsv'),
                          engine='c',
                          dtype={'item_condition_id': 'category',
                                 'shipping': 'category'}
                          )
    test = pd.read_table(os.path.join(INPUT_PATH, 'test.tsv'),
                         engine='c',
                         dtype={'item_condition_id': 'category',
                                'shipping': 'category'}
                         )
    print(f'[{time() - start_time}] Finished to load data')
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    submission: pd.DataFrame = test[['test_id']]

    merge, y_train, nrow_train = preprocess_pandas(train, test, start_time)

    meta_params = {'name_ngram': (1, 2),
                   'name_max_f': 75000,
                   'name_min_df': 10,

                   'category_ngram': (2, 3),
                   'category_token': '.+',
                   'category_min_df': 10,

                   'brand_min_df': 10,

                   'desc_ngram': (1, 3),
                   'desc_max_f': 150000,
                   'desc_max_df': 0.5,
                   'desc_min_df': 10}

    stopwords = frozenset(['the', 'a', 'an', 'is', 'it', 'this', ])
    # 'i', 'so', 'its', 'am', 'are'])

    vectorizer = FeatureUnion([
        ('name', Pipeline([
            ('select', ItemSelector('name', start_time=start_time)),
            ('transform', HashingVectorizer(
                ngram_range=(1, 2),
                n_features=2 ** 27,
                norm='l2',
                lowercase=False,
                stop_words=stopwords
            )),
            ('drop_cols', DropColumnsByDf(min_df=2))
        ])),
        ('category_name', Pipeline([
            ('select', ItemSelector('category_name', start_time=start_time)),
            ('transform', HashingVectorizer(
                ngram_range=(1, 1),
                token_pattern='.+',
                tokenizer=split_cat,
                n_features=2 ** 27,
                norm='l2',
                lowercase=False
            )),
            ('drop_cols', DropColumnsByDf(min_df=2))
        ])),
        ('brand_name', Pipeline([
            ('select', ItemSelector('brand_name', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('gencat_cond', Pipeline([
            ('select', ItemSelector('gencat_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('subcat_1_cond', Pipeline([
            ('select', ItemSelector('subcat_1_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('subcat_2_cond', Pipeline([
            ('select', ItemSelector('subcat_2_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('has_brand', Pipeline([
            ('select', ItemSelector('has_brand', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('shipping', Pipeline([
            ('select', ItemSelector('shipping', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('item_condition_id', Pipeline([
            ('select', ItemSelector('item_condition_id', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('item_description', Pipeline([
            ('select', ItemSelector('item_description', start_time=start_time)),
            ('hash', HashingVectorizer(
                ngram_range=(1, 3),
                n_features=2 ** 27,
                dtype=np.float32,
                norm='l2',
                lowercase=False,
                stop_words=stopwords
            )),
            ('drop_cols', DropColumnsByDf(min_df=2)),
        ]))
    ], n_jobs=1)

    sparse_merge = vectorizer.fit_transform(merge)
    print(f'[{time() - start_time}] Merge vectorized')
    print(sparse_merge.shape)

    tfidf_transformer = TfidfTransformer()

    X = tfidf_transformer.fit_transform(sparse_merge)
    print(f'[{time() - start_time}] TF/IDF completed')

    X_train = X[:nrow_train]
    print(X_train.shape)

    X_test = X[nrow_train:]
    del merge
    del sparse_merge
    del vectorizer
    del tfidf_transformer
    gc.collect()

    X_train, X_test = intersect_drop_columns(X_train, X_test, min_df=1)
    print(f'[{time() - start_time}] Drop only in train or test cols: {X_train.shape[1]}')
    gc.collect()

    ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200, normalize=False, tol=0.01)
    ridge.fit(X_train, y_train)
    print(f'[{time() - start_time}] Train Ridge completed. Iterations: {ridge.n_iter_}')

    predsR = ridge.predict(X_test)
    print(f'[{time() - start_time}] Predict Ridge completed.')

    submission.loc[:, 'price'] = np.expm1(predsR)
    submission.loc[submission['price'] < 0.0, 'price'] = 0.0
    submission.to_csv("submission_ridge.csv", index=False)

[9.203696489334106] Finished to load data
Train shape:  (1482535, 8)
Test shape:  (693359, 7)
Train shape without zero price:  (1481661, 8)
[11.358418226242065] Has_category filled.
[25.330225706100464] Split categories completed.
[25.587501287460327] Has_brand filled.
[30.067643880844116] Categories and item_condition_id concancenated.
[46.19376564025879] Missing filled.
[59.42327094078064] Karats normalized.
[78.89431381225586] Units glued.
total words processed: 2671
total unique words in corpus: 2671
total items in dictionary (corpus words and deletions): 2671
  edit distance for deletions: 0
  length of longest word in corpus: 39
total words processed: 2616
total unique words in corpus: 2616
total items in dictionary (corpus words and deletions): 2616
  edit distance for deletions: 0
  length of longest word in corpus: 15
Before empty brand_name: 927861
After empty brand_name: 252719
[229.68189477920532] Brand name filled.
[233.8591697216034] Name concancenated.
[265.3134882450104



[389.9109752178192] select brand_name
[413.12118554115295] select gencat_cond
[423.3947026729584] select subcat_1_cond
[436.4934585094452] select subcat_2_cond
[447.0983245372772] select has_brand


  return dataframe[self.field].cat.codes[:, None]


[447.4007132053375] select shipping


  return dataframe[self.field].cat.codes[:, None]


[447.62462306022644] select item_condition_id


  return dataframe[self.field].cat.codes[:, None]


[447.8530125617981] select item_description


MemoryError: Unable to allocate 733. MiB for an array with shape (192137448,) and data type int32