# Ridge (LB 0.41943)
https://www.kaggle.com/rumbok/ridge-lb-0-41944

In [1]:
import multiprocessing as mp
import pandas as pd
from time import time
from scipy.sparse import csr_matrix
import os
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import gc
from sklearn.base import BaseEstimator, TransformerMixin
import re
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

In [2]:
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['JOBLIN_START_METHOD'] = 'forkserver'

In [3]:
INPUT_PATH = r'./input'

In [4]:
def dameraulevenshtein(seq1, seq2):
    """
    시퀀스 간 Damerau-Levenshtein 거리 계산
    
    이 거리는 추가, 삭제, 대체 횟수입니다.
    첫 번째 시퀀스를 두 번째로 변환하는 데 필요한 전환 작업입니다.
    일반적으로 문자열과 함께 사용되지만 비교할 수 있는 개체의 시퀀스는 모두 작동합니다.
    
    대체는 '연속'글자를 교환하는 것이며, 다른 모든 작업은 자체 설명이 가능합니다.
    
    이 구현은 O(N*M) 시간과 O(M) 공간입니다. N과 M의 경우 두 시퀀스의 길이입니다.
    
    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    
    임의 시퀀스에서도 작동합니다.
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # 개념적으로 len(seq1) + len(seq2) + 1 matrix을 기초로 합니다.
    # 그러나 현재와, 두 개의 이전 열만 한 번에 필요합니다.
    # 따라서 이들을 저장합니다.
    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        # 파이썬 리스트는 음수 인덱스를 감싸므로, 목록의 '끝'에 맨 왼쪽 열을 배치합니다.
        # 이렇게 하면 인덱싱되지 않은 문자열과 일치하고 추가 계산이 저장됩니다.
        twoago, oneago, thisrow = (oneago, thisrow, [0] * len(seq2) + [x+1])
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneage[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # 이 블록은 대체를 처리합니다.
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]

In [5]:
class SymSpell:
    def __init__(self, max_edit_distance=3, verbose=0):
        self.max_edit_distance = max_edit_distance
        self.verbose = verbose
        # 0: 최고의 제안
        # 1: 가장 작은 거리의 모든 제안
        # 2: 모든 제안 <= max_edit_distance (느림, 조기 대체 없음)
        self.dictionary = {}
        self.longest_word_length = 0
        
    def get_deletes_list(self, w):
        """단어가 주어지면 max_edit_distance 문자가 삭제된 문자열을 파생합니다."""
        
        deletes = []
        queue = [w]
        for d in range(self.max_edit_distance):
            temp_queue = []
            for word in queue:
                if len(word) > 1:
                    for c in range(len(word)): # 문자 인덱스
                        word_minus_c = word[:c] + word[c + 1:]
                        if word_minus_c not in deletes:
                            deletes.append(word_minus_c)
                        if word_minus_c not in temp_queue:
                            temp_queue.append(word_minus_c)
            queue = temp_queue
        return deletes
    
    def create_dictionary_entry(self, w):
        """단어와 단어에서 파생된 삭제 내용을 딕셔너리에 추가합니다."""
        # 단어가 딕셔너리에 이미 존재하는지 체크합니다.
        # 딕셔너리 항목은 다음과 같은 형식으로 이루어져 있습니다.
        # (제안된 수정 목록, 말뭉치의 단어 빈도)
        new_real_word_added = False
        if w in self.dictionary:
            # 단어가 말뭉치 내에 있으면 카운트 증가
            self.dictionary[w] = (self.dictionary[w][0], self.dictionary[w][1] + 1)
        else:
            self.dictionary[w] = ([], 1)
            self.longest_word_length = max(self.longest_word_length, len(w))

        if self.dictionary[w][1] == 1:
            # 말뭉치에서 단어의 첫 등장입니다.
            # n.b 단어는 파생된 단어로 이미 딕셔너리에 있을 수 있습니다.
            # (실제 단어에서 문자를 추출합니다.)
            # 그러나 말뭉치에서 단어의 빈도수는 증가하지 않았습니다(이 경우)
            new_real_word_added = True
            deletes = self.get_deletes_list(w)
            for item in deletes:
                if item in self.dictionary:
                    # 삭제가 제안된 수정 목록에 단어를 추가합니다.
                    self.dictionary[item][0].append(w)
                else:
                    # 말뭉치의 단어 빈도는 증가하지 않습니다.
                    self.dictionary[item] = ([w], 0)
        return new_real_word_added

    def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):
        total_word_count = 0
        unique_word_count = 0

        for line in arr:
            # 단어로, 비문자로 구분합니다.
            words = re.findall(token_pattern, line.lower())
            for word in words:
                total_word_count += 1
                if self.create_dictionary_entry(word):
                    unique_word_count += 1
        print('total words processed: %i' % total_word_count)
        print('total unique words in corpus: %i' % unique_word_count)
        print('total items in dictionary (corpus words and deletions): %i' 
              % len(self.dictionary))
        print('    edit distance for deletions: %i' % self.max_edit_distance)
        print('    length of longest word in corpus: %i' % self.longest_word_length)
        return self.dictionary

    def create_dictionary(self, fname):
        total_word_count = 0
        unique_word_count = 0
        
        with open(fname) as file:
            for line in file:
                # 비문자, 단어로 구분
                words = re.findall('[a-z]+', line.lower())
                for word in words:
                    total_word_count += 1
                    if self.create_dictionary_entry(word):
                        unique_word_count += 1
                        
        print('total words processed: %i' % total_word_count)
        print('total unique words in corpus: %i' % unique_word_count)
        print('total items in dictionary (corpus words and deletions): %i' 
              % len(self.dictionary))
        print('    edit distance for deletions: %i' % self.max_edit_distance)
        print('    length of longest word in corpus: %i' % self.longest_word_length)
        return self.dictionary
    
    def get_suggestions(self, string, silent=False):
        """철자가 잘못되었을 가능성이 있는 단어에 대해 제안된 수정 목록을 반환합니다."""
        if (len(string) - self.longest_word_length) > self.max_edit_distance:
            if not silent:
                print('no items in dictionary within maximum edit distance')
            return []
        
        suggest_dict = {}
        min_suggest_len = float('inf')
        
        queue = [string]
        q_dictionary = {} # 체크한 문자열 이외의 항목입니다.
        
        while len(queue) > 0:
            q_item = queue[0] # pop
            queue = queue[1:]
            
            # early exit
            if ((self.verbose < 2) and (len(suggest_dict) > 0) and
                ((len(string) - len(q_item)) > min_suggest_len)):
                break
                
            # process queue item
            if (q_item in self.dictionary) and (q_item not in suggest_dict):
                if self.dictionary[q_item][1] > 0:
                    # 단어는 딕셔너리에 있고, 말뭉치에서 온 단어이고, 아직 제안 목록에
                    # 없는 단어이므로 제안 딕셔너리에 값을 가진 단어에 의해
                    # 색인화(말뭉치 빈도, 편집 거리)됩니다.
                    # 삭제만 추가되므로(수동 딕셔너리 수정이 추가되지 않는 한) 입력 문자열이 아닌
                    #  q_items는 입력 문자열보다 짧습니다.
                    assert len(string) >= len(q_item)
                    suggest_dict[q_item] = (self.dictionary[q_item][1],
                                            len(string) - len(q_item))
                    
                    # early exit
                    if (self.verbose < 2) and (len(string) == len(q_item)):
                        break
                    elif (len(string) - len(q_item)) < min_suggest_len:
                        min_suggest_len = len(string) - len(q_item)
                        
                # 딕셔너리에 저장된 q_item에 대해 제안된 수정 사항(q_item 자체가
                # 유효한 단어인지 아니면 단순히 삭제인지 여부)은 유효한
                # 수정 사항이 될 수 있습니다.
                for sc_item in self.dictionary[q_item][0]:
                    if sc_item not in suggest_dict:
                        
                        # 계산된 편집 거리 제안 항목은 항상 더 길어야 합니다.
                        # (수동 수정이 추가되지 않는 한)
                        assert len(sc_item) > len(q_item)
                        
                        # 입력하지 않은 q_item은 원래 문자열보다 짧아야 합니다.
                        # (수동 수정이 추가되지 않는 경우)
                        assert len(q_item) <= len(string)
                        
                        if len(q_item) <= len(string):
                            assert q_item == string
                            item_dist = len(sc_item) - len(q_item)
                            
                        # 제안 목록의 항목은 문자열 자체와 동일하면 안됩니다.
                        assert sc_item != string
                        
                        # 예를 들어, Damerau-Levenshtein 거리를 사용하여 편집 거리를 계산합니다.
                        item_dist = dameraulevenshtein(sc_item, string)
                        
                        # verbose가 켜져 있지 않으면 편집 거리가 더 큰 단어를 추가하지 마십시오
                        if (self.verbose < 2) and (item_dist > min_suggest_len):
                            pass
                        elif item_dist <= self.max_edit_distance:
                            # 제안 목록에 있는 경우 이미 사전에 있어야 합니다.
                            assert sc_item in self.dictionary 
                            suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
                            if item_dist < min_suggest_len:
                                min_suggest_len = item_dist
                        # 단어가 처리되는 순서에 따라 다른 편집 거리의 단어들이 제안으로
                        # 입력될 수 있습니다. verbose가 설정되지 않은 경우
                        # 제안 딕셔너리를 트리밍 합니다.
                        if self.verbose < 2:
                            suggest_dict = {k: v for k, v in suggest_dict.items()
                                            if v[1] <= min_suggest_len}
                            
            # 이제 대기열 항목에서 삭제(예: 문자열의 하위 문자열 또는 삭제)를 확인할 추가 항목
            # 으로 생성합니다. - 대기열 끝에 추가
            assert len(string) >= len(q_item)

            # verbose가 켜져 있지 않은 경우 더 큰 편집 거리 단어를 추가하지 마십시오
            if (self.verbose < 2) and ((len(string) - len(q_item)) > min_suggest_len):
                pass
            elif (len(string) - len(q_item)) < self.max_edit_distance and len(q_item) > 1:
                for c in range(len(q_item)): # 문자 인덱스
                    if word_minus_c not in q_dictionary:
                        queue.append(word_minus_c)
                        q_dictionary[word_minus_c] = None # 임의의 값,단지 확인했다는 것을 증명
        
        # 이제 대기열이 비어 있습니다. 딕셔너리의 제안을 출력할 목록으로 변환합니다.
        if not silent and self.verbose != 0:
            print('num. of possible corrections: %i' % len(suggest_dict))
            print('  edit distance for deletions: %i' % self.max_edit_distance)
            
        # output option 1
        # 결과를 편집거리의 오름차순으과 빈도의 내림차 순으로 정렬
        # 수정된 단어의 제안 목록 반환:
        # return sorted(suggest_dict, key = lambda x:
        #               (suggest_dict[x][1], -suggest_dict[x][0]))

        # output option 2
        # return list of suggestions with (correction,
        #                                  (frequency in corpus, edit distance)):
        as_list = suggest_dict.items()
        outlist = sorted(as_list, key=lambda x: (x[1][1], -x[1][0]))
        
        if self.verbose == 0:
            return outlist[0]
        else:
            return outlist
        
        '''
        Option 1:
        ['file', 'five', 'fire', 'fine', ...]

        Option 2:
        [('file', (5, 0)),
         ('five', (67, 1)),
         ('fire', (54, 1)),
         ('fine', (17, 1))...]  
        '''
    
    def best_word(self, s, silent=False):
        try:
            return self.get_suggestions(s, silent)[0]
        except:
            return None
                    
                    
                            

In [6]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field, start_time=time()):
        self.field = field
        self.start_time = start_time
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, dataframe):
        print(f'[{time()-self.start_time}] select {self.field}')
        dt = dataframe[self.field].dtype
        if is_categorical_dtype(dt):
            return dataframe[self.field].cat.codes[:,None]
        elif is_numeric_dtype(dt):
            return dataframe[self.field][:,None]
        else:
            return dataframe[self.field]

In [15]:
class DropColumnsByDf(BaseEstimator, TransformerMixin):
    def __init__(self, min_df=1, max_df=1.0):
        self.min_df = min_df
        self.max_df = max_df
        
    def fit(self, X, y=None):
        m = X.tocsc()
        self.nnz_cols = ((m != 0).sum(axis=0) >= self.min_df).A1
        if self.max_df < 1.0:
            max_df = m.shape[0] * self.max_df
            self.nnz_cols = self.nnz_cols & ((m != 0).sum(axis=0) <= max_df).A1
        return self
    
    def transform(self, X, y=None):
        m = X.tocsc()
        return m[:, self.nnz_cols]

In [8]:
def get_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))

In [9]:
def split_cat(text):
    try:
        cats = text.split('/')
        return cats[0], cats[1], cats[2], cats[0] + '/' + cats[1]
    except:
        print('no category')
        return 'other', 'other', 'other', 'other/other'

In [10]:
def brands_filling(dataset):
    vc = dataset['brand_name'].value_counts()
    brands = vc[vc > 0].index
    brand_word = r"[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+"
    
    many_w_brands = brands[brands.str.contains(' ')]
    one_w_brands = brands[~brands.str.contains(' ')]
    
    ss2 = SymSpell(max_edit_distance=0)
    ss2.create_dictionary_from_arr(many_w_brands, token_pattern=r'.+')
    
    ss1 = SymSpell(max_edit_distance=0)
    ss1.create_dictionary_from_arr(one_w_brands, token_pattern=r'.+')
    
    two_words_re = re.compile(r"(?=(\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+))")
    
    def find_in_str_ss2(row):
        for doc_word in two_words_re.finditer(row):
            print(doc_word)
            suggestion = ss2.best_word(doc_word.group(1), silent=True)
            if suggestion is not None:
                return doc_word.group(1)
        return ''
    
    def find_in_list_ss1(list):
        for doc_word in list:
            suggestion = ss1.best_word(doc_word, silent=True)
            if suggestion is not None:
                return doc_word
        return ''
    
    def find_in_list_ss2(list):
        for doc_word in list:
            suggestion = ss2.best_word(doc_word, silent=True)
            if suggestion is not None:
                return doc_word
        return ''
    
    print(f"Before empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")
    
    n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(
        pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+")
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_name]

    n_desc = dataset[dataset['brand_name'] == '']['item_description'].str.findall(
        pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+")
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_desc]

    n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(pat=brand_word)
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in n_name]

    desc_lower = dataset[dataset['brand_name'] == '']['item_description'].str.findall(pat=brand_word)
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in desc_lower]

    print(f"After empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")
    
    del ss1, ss2
    gc.collect()

In [11]:
def preprocess_regex(dataset, start_time=time()):
    karats_regex = r'(\d)([\s-]?)(karat|karats|carat|carats|kt)([^\w])'
    karats_repl = r'\1k\4'
    
    unit_regex = r'(\d+)[\s-]([a-z]{2})(\s)'
    unit_repl = r'\1\2\3'
    
    dataset['name'] = dataset['name'].str.replace(karats_regex, karats_repl)
    dataset['item_description'] = dataset['item_description'].str.replace(karats_regex, karats_repl)
    print(f'[{time() - start_time}] Karats normalized.')

    dataset['name'] = dataset['name'].str.replace(unit_regex, unit_repl)
    dataset['item_description'] = dataset['item_description'].str.replace(unit_regex, unit_repl)
    print(f'[{time() - start_time}] Units glued.')
    
    

In [12]:
def preprocess_pandas(train, test, start_time=time()):
    train = train[train.price > 0.0].reset_index(drop=True)
    print('Train shape without zero price: ', train.shape)
    
    nrow_train = train.shape[0]
    y_train = np.log1p(train['price'])
    merge: pd.DataFrame = pd.concat([train, test])
        
    del train
    del test
    gc.collect()
    
    merge['has_category'] = (merge['category_name'].notnull()).astype('category')
    print(f'[{time() - start_time}] Has_category filled.')
    
    merge['category_name'] = merge['category_name'] \
        .fillna('other/other/other') \
        .str.lower() \
        .astype(str)
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'], merge['gen_subcat1'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    print(f'[{time() - start_time}] Split categories completed.')

    merge['has_brand'] = (merge['brand_name'].notnull()).astype('category')
    print(f'[{time() - start_time}] Has_brand filled.')

    merge['gencat_cond'] = merge['general_cat'].map(str) + '_' + merge['item_condition_id'].astype(str)
    merge['subcat_1_cond'] = merge['subcat_1'].map(str) + '_' + merge['item_condition_id'].astype(str)
    merge['subcat_2_cond'] = merge['subcat_2'].map(str) + '_' + merge['item_condition_id'].astype(str)
    print(f'[{time() - start_time}] Categories and item_condition_id concancenated.')

    merge['name'] = merge['name'] \
        .fillna('') \
        .str.lower() \
        .astype(str)
    merge['brand_name'] = merge['brand_name'] \
        .fillna('') \
        .str.lower() \
        .astype(str)
    merge['item_description'] = merge['item_description'] \
        .fillna('') \
        .str.lower() \
        .replace(to_replace='No description yet', value='')
    print(f'[{time() - start_time}] Missing filled.')

    preprocess_regex(merge, start_time)

    brands_filling(merge)
    print(f'[{time() - start_time}] Brand name filled.')

    merge['name'] = merge['name'] + ' ' + merge['brand_name']
    print(f'[{time() - start_time}] Name concancenated.')

    merge['item_description'] = merge['item_description'] \
                                + ' ' + merge['name'] \
                                + ' ' + merge['subcat_1'] \
                                + ' ' + merge['subcat_2'] \
                                + ' ' + merge['general_cat'] \
                                + ' ' + merge['brand_name']
    print(f'[{time() - start_time}] Item description concatenated.')

    merge.drop(['price', 'test_id', 'train_id'], axis=1, inplace=True)

    return merge, y_train, nrow_train

In [13]:
def intersect_drop_columns(train: csr_matrix, valid: csr_matrix, min_df=0):
    t = train.tocsc()
    v = valid.tocsc()
    nnz_train = ((t != 0).sum(axis=0) >= min_df).A1
    nnz_valid = ((v != 0).sum(axis=0) >= min_df).A1
    nnz_cols = nnz_train & nnz_valid
    res = t[:, nnz_cols], v[:, nnz_cols]
    return res

In [17]:
if __name__ == '__main__':
    
    
    start_time = time()
    
    train = pd.read_table(os.path.join(INPUT_PATH, 'train.tsv'),
                          engine='c',
                          dtype={'item_condition_id': 'category',
                                 'shipping': 'category'})
    test = pd.read_table(os.path.join(INPUT_PATH, 'test.tsv'),
                          engine='c',
                          dtype={'item_condition_id': 'category',
                                 'shipping': 'category'})
    print(f'[{time() - start_time}] Finished to load data')
    print('Train shape:', train.shape)
    print('Test shape:', test.shape)
    
    submission: pd.DataFrame = test[['test_id']]
        
    merge, y_train, nrow_train = preprocess_pandas(train, test, start_time)
    
    meta_params = {'name_ngram': (1, 2),
                   'name_max_f': 75000,
                   'name_min_df': 10,
                  
                   'category_ngram': (2, 3),
                   'category_token': '.+',
                   'category_min_df': 10,
                  
                   'brand_min_df': 10,
                  
                   'desc_ngram': (1, 3),
                   'desc_max_f': 150000,
                   'desc_max_df': 0.5,
                   'desc_min_df': 10}
    
    stopwords = frozenset(['the', 'a', 'an', 'is', 'it', 'this'])
    
    vectorizer = FeatureUnion([
        ('name', Pipeline([
            ('select', ItemSelector('name', start_time=start_time)),
            ('transform', HashingVectorizer(
                ngram_range=(1, 2),
                n_features=2 ** 27,
                norm='l2',
                lowercase=False,
                stop_words=stopwords
            )),
            ('drop_cols', DropColumnsByDf(min_df=2))
        ])),
        ('category_name', Pipeline([
            ('select', ItemSelector('category_name', start_time=start_time)),
            ('transform', HashingVectorizer(
                ngram_range=(1, 1),
                token_pattern='.+',
                tokenizer=split_cat,
                n_features=2 ** 27,
                norm='l2',
                lowercase=False
            )),
            ('drop_cols', DropColumnsByDf(min_df=2))
        ])),
        ('brand_name', Pipeline([
            ('select', ItemSelector('brand_name', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('gencat_cond', Pipeline([
            ('select', ItemSelector('gencat_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('subcat_1_cond', Pipeline([
            ('select', ItemSelector('subcat_1_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('subcat_2_cond', Pipeline([
            ('select', ItemSelector('subcat_2_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('has_brand', Pipeline([
            ('select', ItemSelector('has_brand', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('shipping', Pipeline([
            ('select', ItemSelector('shipping', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('item_condition_id', Pipeline([
            ('select', ItemSelector('item_condition_id', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('item_description', Pipeline([
            ('select', ItemSelector('item_description', start_time=start_time)),
            ('hash', HashingVectorizer(
                ngram_range=(1, 3),
                n_features=2 ** 27,
                dtype=np.float32,
                norm='l2',
                lowercase=False,
                stop_words=stopwords
            )),
            ('drop_cols', DropColumnsByDf(min_df=2)),
        ]))
    ], n_jobs=1)
    
    sparse_merge = vectorizer.fit_transform(merge)
    print(f'[{time() - start_time}] Merge vectorized')
    print(sparse_merge.shape)
    
    tfidf_transformer = TfidfTransformer()
    
    X = tfidf_transformer.fit_transform(sparse_merge)
    print(f'[{time() - start_time}] TF/IDF completed')
    
    X_train = X[:nrow_train]
    print(X_train.shape)
    
    X_test = X[nrow_train:]
    del merge
    del sparse_merge
    del vectorizer
    del tfidf_transformer
    gc.collect()
    
    X_train, X_test = intersect_drop_columns(X_train, X_test, min_df=1)
    print(f'[{time() - start_time}] Drop only in train or test cols: {X_train.shape[1]}')
    gc.collect()
    
    ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200,
                  normalize=False, tol=0.01)
    ridge.fit(X_train, y_train)
    print(f'[{time() - start_time}] Train Ridge completed. Iterations: {ridge.n_iter_}')
    
    predsR = ridge.predict(X_test)
    print(f'[{time() - start_time}] Predict Ridge completed.')
    
    submission.loc[:, 'price'] = np.expm1(predsR)
    submission.loc[submission['price'] < 0.0, 'price'] = 0.0
    submission.to_csv('submission_ridge.csv', index=False)

[7.369909048080444] Finished to load data
Train shape: (1482535, 8)
Test shape: (693359, 7)
Train shape without zero price:  (1481661, 8)
[8.373831510543823] Has_category filled.
[15.09694504737854] Split categories completed.
[15.187026977539062] Has_brand filled.
[17.884315729141235] Categories and item_condition_id concancenated.
[20.90005850791931] Missing filled.


  
  if __name__ == '__main__':


[30.556979417800903] Karats normalized.


  if sys.path[0] == '':
  del sys.path[0]


[45.33271527290344] Units glued.
total words processed: 2671
total unique words in corpus: 2671
total items in dictionary (corpus words and deletions): 2671
    edit distance for deletions: 0
    length of longest word in corpus: 39
total words processed: 2616
total unique words in corpus: 2616
total items in dictionary (corpus words and deletions): 2616
    edit distance for deletions: 0
    length of longest word in corpus: 15
Before empty brand_name: 927861
After empty brand_name: 252719
[85.92554974555969] Brand name filled.
[86.42800664901733] Name concancenated.
[91.64425301551819] Item description concatenated.
[92.28683829307556] select name
[115.38197994232178] select category_name




[123.6528103351593] select brand_name
[128.11686968803406] select gencat_cond
[132.60345339775085] select subcat_1_cond
[137.1613736152649] select subcat_2_cond
[141.752051115036] select has_brand
[141.84513592720032] select shipping
[141.93822121620178] select item_condition_id


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


[142.04631900787354] select item_description
[315.00609278678894] Merge vectorized
(2175020, 8961796)
[365.3662791252136] TF/IDF completed
(1481661, 8961796)
[390.54031229019165] Drop only in train or test cols: 5976503
[1087.6648752689362] Train Ridge completed. Iterations: None
[1087.9040923118591] Predict Ridge completed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [18]:
submission

Unnamed: 0,test_id,price
0,0,7.786888
1,1,9.161285
2,2,44.922333
3,3,13.125871
4,4,9.971759
...,...,...
693354,693354,19.485424
693355,693355,25.444056
693356,693356,7.504093
693357,693357,13.236767
