- competition/dataset : [https://www.kaggle.com/c/mercari-price-suggestion-challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge)
- date : 2021/03/19
- original : [https://www.kaggle.com/rumbok/ridge-lb-0-41944](https://www.kaggle.com/rumbok/ridge-lb-0-41944)

## Ridge (LB 0.41943)

**✏ 필사 1회** 

In [64]:
import multiprocessing as mp
import pandas as pd
from time import time
from scipy.sparse import csr_matrix
import os
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import gc
from sklearn.base import BaseEstimator, TransformerMixin
import re
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

**os.environ**  
문자열 환경을 나타내는 매핑 객체  
환경을 조회하는 것뿐만 아니라 환경을 수정하는 데도 사용될 수 있음

In [65]:
os.environ['MKL_NUM_THREDS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['JOBLIB_START_METHOD'] = 'forkserver'

INPUT_PATH = 'data'

In [66]:
def Damerau_Levenshtein(seq1, seq2):
    '''
    두 시퀀스 사이의 Damerau-Levenshtein 거리를 계산하는 함수
    Source: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    
    이 거리는 첫 번째 시퀀스를 두 번째 시퀀스로 변환하는데 필요한 추가, 삭제, 대체,
    전환 횟수를 의미함.
    일반적으로 문자열과 함께 사용되긴 하나, 유사한 객체들의 시퀀스에도 작동됨.
    
    전환은 연속적인 문자열의 교환이며, 다른 것들은 자체적으로 수행됨.
    
    이 함수는 실행하는데 O(N*M)의 시간복잡도와 O(M)의 공간복잡도를 가짐.
    (N과 M은 두 시퀀스의 길이)
    
    >>> Demerau_Levenshtein('ba', 'abc')
    2
    >>> Demerau_Levenshtein('fee', 'deed')
    2
    
    * 임의의 순서에도 작동함
    >>> Demerau_Levenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    '''
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # len(seq1) + len(seq2) + 1 matrix에 기초함
    # 현재 행과 이전 두 행이 동시에 필요함
    oneago = None
    thisrow = list(range(1, len(seq2)+1)) + [0]
    for x in range(len(seq1)):
        twoago, oneago, thisrow = (oneago, thisrow, [0] * len(seq2) + [x+1])
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y-1] + 1
            subcost = oneago[y-1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # transposition(전환)
            if(x > 0 and y > 0 and seq1[x] == seq2[y-1]
               and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y-2] + 1)
    
    return thisrow[len(seq2) - 1]

In [67]:
class SymSpell:
    def __init__(self, max_edit_distance=3, verbose=0):
        self.max_edit_distance = max_edit_distance
        self.verbose = verbose
        # 0: top suggestion
        # 1: all sugestion of smallest edit distance
        # 2: all suggestions <= max_edit_distance (slower, no early termination)
        
        self.dictionary = {}
        self.longest_word_length = 0
    
    def get_deletes_list(self, w):
        '''
        단어를 입력하면 max_edit_distance 문자가 삭제된 문자열 리턴
        '''
        deletes = []
        queue = [w]
        for d in range(self.max_edit_distance):
            temp_queue = []
            for word in queue:
                if len(word) > 1:
                    for c in range(len(word)):
                        word_minus_c = word[:c] + word[c+1:]
                        if word_minus_c not in deletes:
                            deletes.append(word_minus_c)
                        if word_minus_c not in temp_queue:
                            temp.queue.append(word_minus_c)
            queue = temp_queue
        
        return deletes
    
    def create_dictionary_entry(self, w):
        '''
        단어 편집 및 딕셔너리에 삭제 실행
        '''
        # 단어가 딕셔너리에 있는지 확인
        # 딕셔너리 형태: (suggested correction 목록, 말뭉치 내 단어 빈도수)
        new_real_word_added = False
        if w in self.dictionary:
            # 말뭉치에서 단어의 증가 수
            self.dictionary[w] = (self.dictionary[w][0], self.dictionary[w][1] + 1)
        else:
            self.dictionary[w] = ([], 1)
            self.longest_word_length = max(self.longest_word_length, len(w))
        
        if self.dictionary[w][1] == 1:
            # 말뭉치에서 처음 등장하는 단어
            # 이미 파생된 단어로 딕셔너리에 들어있을 수 있음
            # (실제 단어에서 문자 삭제)
            # 말뭉치에서 단어의 빈도 수는 증가하지 않음
            new_real_word_added = True
            deletes = self.get_deletes_list(w)
            for item in deletes:
                if item in self.dictionary:
                    # delete의 suggested correction 목록에 추가
                    self.dictionary[item][0].append(w)
                else:
                    self.dictionary[item] = ([w], 0)
        
        return new_real_word_added
    
    def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):
        total_word_count = 0
        unique_word_count = 0
        
        for line in arr:
            # 알파벳이 아닌 문자와 분리
            words = re.findall(token_pattern, line.lower())
            for word in words:
                total_word_count += 1
                if self.create_dictionary_entry(word):
                    unique_word_count += 1
        
        print('total words processed: %i'%total_word_count)
        print('total unique words in corpus: %i'%unique_word_count)
        print('total items in dictionary (corpus words and deletions): %i'%len(self.dictionary))
        print('  edit distance for deletions: %i'%self.max_edit_distance)
        print('  length of longest word in corpus: %i'%self.longest_word_length)
        
        return self.dictionary
    
    def create_dictionary(self, fname):
        total_word_count = 0
        unique_word_count = 0
        
        with open(fname) as file:
            for line in file:
                # 알파벳이 아닌 문자와 분리
                words = re.findall('[a-z]+', line.lower())
                for word in words:
                    total_word_count += 1
                    if self.create_dictionary_entry(word):
                        unique_word_count += 1
        
        print('total words processed: %i'%total_word_count)
        print('total unique words in corpus: %i'%unique_word_count)
        print('total items in dictinary (corpus words and deletions): %i'%len(self.dictionary))
        print('  edit distance for deletions: %i'%self.max_edit_distance)
        print('  length of longest word in corpus: %i'%self.longest_word_length)
        
        return self.dictionary
    
    def get_suggestions(self, string, silent=False):
        '''
        철자가 잘못되었을 수 있는 단어에 대한 suggested correction 리스트 리턴
        '''
        if (len(string) - self.longest_word_length) > self.max_edit_distance:
            if not silent:
                print('no items in dictionary within maximum edit distance')
            return []
        
        suggest_dict = {}
        min_suggest_len = float('inf')
        
        queue = [string]
        q_dictionary = {}  # 체크한 문자열 이외의 항목
        
        while len(queue) > 0:
            q_item = queue[0]
            queue = queue[1:]
            
            # early exit
            if ((self.verbose < 2) and (len(suggest_dict) > 0)) and\
                ((len(string) - len(q_item) > min_suggest_len)):
                    break
            
            # process queue item
            if (q_item in self.dictionary) and (q_item not in suggest_dict):
                if self.dictionary[q_item][1] > 0:
                    # 말뭉치에서 온 단어가 딕셔너리에는 있고 제안 리스트에는 없으면
                    # 단어를 value (말뭉치에서의 빈도수, 편집거리)와 함께 인덱싱
                    # 입력 문자열이 아닌 q_item은 삭제가 추가되었으므로
                    # 입력 문자열보다 길이가 짧아야 함
                    assert len(string) >= len(q_item)
                    suggest_dict[q_item] = (self.dictionary[q_item][1], len(string) - len(q_item))
                    
                    # early exit
                    if (self.verbose < 2) and (len(string) == len(q_item)):
                        break
                    elif (len(string) - len(q_item)) < min_suggest_len:
                        min_suggest_len = len(string) - len(q_item)
                
                # 딕셔너리에 저장된 q_item의 suggested correction은
                # valid correction으로 사용 가능
                for sc_item in self.dictionary[q_item][0]:
                    if sc_item not in suggest_dict:
                        
                        # 편집 거리 계산
                        # suggested item은 항상 더 길어야함
                        assert len(sc_item) > len(q_item)
                        
                        # 입력이 아닌 q_item은 원래 문자열보다 짧아야 함
                        assert len(q_item) <= len(string)
                        
                        if len(q_item) == len(string):
                            assert q_item == string
                            item_dist = len(sc_item) - len(q_item)
                        
                        # suggestion 리스트에 있는 아이템은 자신의 문자열과
                        # 길이가 같아서는 안됨
                        assert sc_item != string
                        
                        # Damerau-Levenshtine 거리를 이용하여 편집거리 계산
                        item_dist = Damerau-Levenshtein(sc_item, string)
                        
                        # verbose가 설정되지 않은 경우 편집거리가 더 큰 문자는 추가x
                        if (self.verbose < 2) and (item_dist > min_suggest_len):
                            pass
                        elif item_dist <= self.max_edit_distance:
                            # suggestion 리스트에 있으면 딕셔너리에도 있어야 함
                            assert sc_item in self.dicitionary
                            suggest_dict[sc_item] = (self(dictionary[sc_item][1], item_dist))
                            if item_dist < min_suggest_len:
                                min_suggest_len = item_dist
                            
                        # 단어가 처리된 순서에 따라 편집거리가 다른 일부 단어들은
                        # suggestion에 들어감
                        # verbose가 설정되지 않은 경우 suggestion 딕셔너리 절삭
                        if self.verbose < 2:
                            suggest_dict = {k: v for k, v in suggest_dict.items()
                                            if v[1] <= min_suggest_len}
                        
            # queue item으로부터 체크를 위한 추가적인 item으로 delete 생성
            assert len(string) >= len(q_item)
            
            # verbose가 설정되지 않은 경우 더 큰 편집거리를 가진 단어 추가x
            if (self.verbose < 2) and ((len(string) - len(q_item)) > min_suggest_len):
                pass
            elif (len(string) - len(q_item)) < self.max_edit_distance and len(q_item) > 1:
                for c in range(len(q_item)):
                    word_minus_c = q_item[:c] + q_item[c+1:]
                    if word_minus_c not in q_dictionary:
                        queue.append(word_minus_c)
                        q_dictionary[word_minus_c] = None
        # queue는 이제 빈 상태: suggestion 딕셔너리를 출력할 리스트로 변환
        if not silent and self.verbose != 0:
            print('number of possible corrections: %i'%len(suggest_dict))
            print('  edit distance for deletions: %i'%self.max_edit_distance)
        
        # output option 1
        # 편집거리에 대해 오름차순 정렬, 빈도수에 대해 내림차순 정렬
        # suggested word correction만 리턴
        # return sorted(suggest_dict, key=lambda x: (suggest_dict[x][1], -suggest_dict[x][0]))
        
        # output option 2
        # (correction, 말뭉치 빈도수, 편집거리) 형태의 suggestion 리스트 리턴
        as_list = suggest_dict.items()
        # outlist = sorted(as_list, key=lambda (term, (freq, dist)): (dist, -freq))
        outlist = sorted(as_list, key=lambda x: (x[1][1], -x[1][0]))
        
        if self.verbose == 0:
            return outlist[0]
        else:
            return outlist
        
        '''
        option 1:
        ['file', 'five', 'fire', 'fine', ...]
        
        option 2:
        [('file', (5, 0)),
         ('five', (67, 1)),
         ('fire', (54, 1)),
         ('fine', (17, 1)), ...]
        '''
    
    def best_word(self, s, silent=False):
        try:
            return self.get_suggestions(s, silent)[0]
        except:
            return None

In [68]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field, start_time=time()):
        self.field = field
        self.start_time = start_time
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, dataframe):
        print(f'[{time() - self.start_time}] select {self.field}')
        dt = dataframe[self.field].dtype
        if is_categorical_dtype(dt):
            return dataframe[self.field].cat.codes[:, None]
        elif is_numeric_dtype(dt):
            return dataframe[self.field][:, None]
        else:
            return dataframe[self.field]

In [69]:
class DropColumnsByDf(BaseEstimator, TransformerMixin):
    def __init__(self, min_df=1, max_df=1.0):
        self.min_df = min_df
        self.max_df = max_df
    
    def fit(self, x, y=None):
        m = x.tocsc()
        self.nnz_cols = ((m != 0).sum(axis=0) >= self.min_df).A1
        if self.max_df < 1.0:
            max_df = m.shape[0] * self.max_df
            self.nnz_cols = self.nnz_cols & ((m != 0).sum(axis=0) <= max_df).A1
        return self
    
    def transform(self, x, y=None):
        m = x.tocsc()
        return m[:, self.nnz_cols]

In [70]:
def get_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))

In [71]:
def split_cat(text):
    try:
        cats = text.split('/')
        return cats[0], cats[1], cats[2], cats[0] + '/' + cats[1]
    except:
        print('no category')
        return 'other', 'other', 'other', 'other/other'

In [72]:
def brands_filling(dataset):
    vc = dataset['brand_name'].value_counts()
    brands = vc[vc > 0].index
    brand_word = r"[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+"
    
    many_w_brands = brands[brands.str.contains(' ')]
    one_w_brands = brands[~brands.str.contains(' ')]
    
    ss2 = SymSpell(max_edit_distance=0)
    ss2.create_dictionary_from_arr(many_w_brands, token_pattern=r'.+')
    
    ss1 = SymSpell(max_edit_distance=0)
    ss1.create_dictionary_from_arr(one_w_brands, token_pattern=r'.+')
    
    two_words_re = re.compile(r"(?=(\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+))")
    
    def find_in_str_ss2(row):
        for doc_word in two_words_re.finditer(row):
            print(doc_word)
            suggestion = ss2.best_word(doc_word.group(1), silent=True)
            if suggestion is not None:
                return doc_word.group(1)
        return ''
    
    def find_in_list_ss1(list):
        for doc_word in list:
            suggestion = ss1.best_word(doc_word, silent=True)
            if suggestion is not None:
                return doc_word
        return ''
    
    def find_in_list_ss2(list):
        for doc_word in list:
            suggestion = ss2.best_word(doc_word, silent=True)
            if suggestion is not None:
                return doc_word
        return ''
    
    print(f"Before empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")
    
    n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(
        pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+"
    )
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_name]
    
    n_desc = dataset[dataset['brand_name'] == '']['item_description'].str.findall(
        pat=r"^[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+\s[a-z0-9*/+\-'’?!.,|&%®™ôèéü]+"
    )
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss2(row) for row in n_desc]
    
    n_name = dataset[dataset['brand_name'] == '']['name'].str.findall(pat=brand_word)
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in n_name]

    desc_lower = dataset[dataset['brand_name'] == '']['item_description'].str.findall(pat=brand_word)
    dataset.loc[dataset['brand_name'] == '', 'brand_name'] = [find_in_list_ss1(row) for row in desc_lower]
    
    print(f"After empty brand_name: {len(dataset[dataset['brand_name'] == ''].index)}")
    
    del ss1, ss2
    gc.collect()

In [73]:
def preprocess_regex(dataset, start_time=time()):
    karats_regex = r'(\d)([\s-]?)(karat|karats|carat|carats|kt)([^\w])'
    karats_repl = r'\1k\4'
    
    unit_regex = r'(\d+)[\s-]([a-z]{2})(\s)'
    unit_repl = r'\1\2\3'
    
    dataset['name'] = dataset['name'].str.replace(karats_regex, karats_repl)
    dataset['item_description'] = dataset['item_description'].str.replace(karats_regex, karats_repl)
    print(f'[{time() - start_time}] Karats normalized.')
    
    dataset['name'] = dataset['name'].str.replace(unit_regex, unit_repl)
    dataset['item_description'] = dataset['item_description'].str.replace(unit_regex, unit_repl)
    print(f'[{time() - start_time}] Units glued.')

In [74]:
def preprocess_pandas(train, test, start_time=time()):
    train = train[train['price'] > 0.0].reset_index(drop=True)
    print('Train shape without zero price:', train.shape)
    
    nrow_train = train.shape[0]
    y_train = np.log1p(train['price'])
    merge: pd.DataFrame = pd.concat([train, test])
    
    del train, test
    gc.collect()
    
    merge['has_category'] = (merge['category_name'].notnull()).astype('category')
    print(f'[{time() - start_time}] Has_category filled.')
    
    merge['category_name'] = merge['category_name'].fillna('other/other/other').str.lower()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'], merge['gen_subcat1'] =\
    zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    print(f'[{time() - start_time}] Split categories completed.')
    
    merge['has_brand'] = (merge['brand_name'].notnull()).astype('category')
    print(f'[{time() - start_time}] Has_brand filled.')
    
    merge['gencat_cond'] = merge['general_cat'].map(str) + '_' + merge['item_condition_id'].astype(str)
    merge['subcat_1_cond'] = merge['subcat_1'].map(str) + '_' + merge['item_condition_id'].astype(str)
    merge['subcat_2_cond'] = merge['subcat_2'].map(str) + '_' + merge['item_condition_id'].astype(str)
    print(f'[{time() - start_time}] Categories and item_condition_id concatenated.')
    
    merge['name'] = merge['name'].fillna('').str.lower()
    merge['brand_name'] = merge['brand_name'].fillna('').str.lower()
    merge['item_description'] = merge['item_description'].fillna('').str.lower().replace(to_replace='No description yet', value='')
    print(f'[{time() - start_time}] Missing filled.')
    
    preprocess_regex(merge, start_time)
    
    brands_filling(merge)
    print(f'[{time() - start_time}] Brand name filled.')
    
    merge['name'] = merge['name'] + ' ' + merge['brand_name']
    print(f'[{time() - start_time}] Name concatenated')
    
    merge['item_description'] = merge['item_description']\
                                + ' ' + merge['name']\
                                + ' ' + merge['subcat_1']\
                                + ' ' + merge['subcat_2']\
                                + ' ' + merge['general_cat']\
                                + ' ' + merge['brand_name']
    print(f'[{time() - start_time}] Item description concatenated.')
    
    merge.drop(['price', 'test_id', 'train_id'], axis=1, inplace=True)
    
    return merge, y_train, nrow_train

In [75]:
def intersect_drop_columns(train: csr_matrix, valid: csr_matrix, min_df=0):
    t = train.tocsc()
    v = valid.tocsc()
    nnz_train = ((t != 0).sum(axis=0) >= min_df).A1
    nnz_valid = ((v != 0).sum(axis=0) >= min_df).A1
    nnz_cols = nnz_train & nnz_valid
    res = t[:, nnz_cols], v[:, nnz_cols]
    return res

In [76]:
if __name__ == '__main__':
    start_time = time()
    
    train = pd.read_table(os.path.join(INPUT_PATH, 'train.tsv'),
                          engine='c',
                          dtype={'item_condition_id':'category',
                                 'shipping':'category'})
    test = pd.read_table(os.path.join(INPUT_PATH, 'test.tsv'),
                          engine='c',
                          dtype={'item_condition_id':'category',
                                 'shipping':'category'})
    print(f'[{time() - start_time}] Finished to load data')
    print('Train shape:', train.shape)
    print('Test shape:', test.shape)
    
    submission: pd.DataFrame = test[['test_id']]
    
    merge, y_train, nrow_train = preprocess_pandas(train, test, start_time)
    
    meta_params = {
        'name_ngram':(1, 2),
        'name_max_f':75000,
        'name_min_df':10,
        
        'category_ngram':(2, 3),
        'category_token':'.+',
        'category_min_df':10,
        
        'brand_min_df':10,
        
        'desc_ngram':(1, 3),
        'desc_max_f':150000,
        'desc_max_df':0.5,
        'desc_min_df':10
    }
    
    stopwords = frozenset(['the', 'a', 'an', 'is', 'it', 'this'])
    # 'i', 'so', 'its', 'am', 'are'
    
    vectorizer = FeatureUnion([
        ('name',
         Pipeline([
             ('select', ItemSelector('name', start_time=start_time)),
             ('transform', HashingVectorizer(
                 ngram_range=(1, 2),
                 n_features=2**27,
                 norm='l2',
                 lowercase=False,
                 stop_words=stopwords
             )),
             ('drop_cols', DropColumnsByDf(min_df=2))
         ])),
        ('category_name',
         Pipeline([
             ('select', ItemSelector('category_name', start_time=start_time)),
             ('transform', HashingVectorizer(
                 ngram_range=(1, 1),
                 token_pattern='.+',
                 tokenizer=split_cat,
                 n_features=2**27,
                 norm='l2',
                 lowercase=False
             )),
             ('drop_cols', DropColumnsByDf(min_df=2))
         ])),
        ('brand_name',
         Pipeline([
             ('select', ItemSelector('brand_name', start_time=start_time)),
             ('transform', CountVectorizer(
                 token_pattern='.+',
                 min_df=2,
                 lowercase=False
             )),
         ])),
        ('gencat_cond',
          Pipeline([
            ('select', ItemSelector('gencat_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('subcat_1_cond',
          Pipeline([
            ('select', ItemSelector('subcat_1_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('subcat_2_cond',
          Pipeline([
            ('select', ItemSelector('subcat_2_cond', start_time=start_time)),
            ('transform', CountVectorizer(
                token_pattern='.+',
                min_df=2,
                lowercase=False
            )),
        ])),
        ('has_brand',
          Pipeline([
            ('select', ItemSelector('has_brand', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('shipping',
          Pipeline([
            ('select', ItemSelector('shipping', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('item_condition_id',
          Pipeline([
            ('select', ItemSelector('item_condition_id', start_time=start_time)),
            ('ohe', OneHotEncoder())
        ])),
        ('item_description',
          Pipeline([
            ('select', ItemSelector('item_description', start_time=start_time)),
            ('hash', HashingVectorizer(
                ngram_range=(1, 3),
                n_features=2 ** 27,
                dtype=np.float32,
                norm='l2',
                lowercase=False,
                stop_words=stopwords
            )),
            ('drop_cols', DropColumnsByDf(min_df=2)),
        ]))
    ], n_jobs=1)
    
    sparse_merge = vectorizer.fit_transform(merge)
    print(f'[{time() - start_time}] Merge vectorized')
    print(sparse_merge.shape)
    
    tfidf_transformer = TfidfTransformer()
    x = tfidf_transformer.fit_transform(sparse_merge)
    print(f'[{time() - start_time}] TF/IDF completed')
    
    x_train = x[:nrow_train]
    print(x_train.shape)
    
    x_test = x[nrow_train:]
    
    del merge, sparse_merge, vectorizer, tfidf_transformer
    gc.collect()
    
    x_train, x_test = intersect_drop_columns(x_train, x_test, min_df=1)
    print(f'[{time() - start_time}] Drop only in train or test cols: {x_train.shape[1]}')
    gc.collect()
    
    ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200,
                  normalize=False, tol=0.01)
    ridge.fit(x_train, y_train)
    print(f'[{time() - start_time}] Train Ridge completed. Iterations: {ridge.n_iter_}')
    
    predsR = ridge.predict(x_test)
    print(f'[{time() - start_time}] Predict Ridge completed.')
    
    submission.loc[:, 'price'] = np.expm1(predsR)
    submission.loc[submission['price'] < 0.0, 'price'] = 0.0
    submission.to_csv('data/submission_3_submission_ridge.csv', index=False)

[10.101563930511475] Finished to load data
Train shape: (1482535, 8)
Test shape: (693359, 7)
Train shape without zero price: (1481661, 8)
[11.673523664474487] Has_category filled.
[18.598693132400513] Split categories completed.
[18.722391605377197] Has_brand filled.
[21.615187883377075] Categories and item_condition_id concatenated.
[24.93429160118103] Missing filled.


  
  if __name__ == '__main__':


[41.0000057220459] Karats normalized.


  if sys.path[0] == '':
  del sys.path[0]


[62.87273335456848] Units glued.
total words processed: 2671
total unique words in corpus: 2671
total items in dictionary (corpus words and deletions): 2671
  edit distance for deletions: 0
  length of longest word in corpus: 39
total words processed: 2616
total unique words in corpus: 2616
total items in dictionary (corpus words and deletions): 2616
  edit distance for deletions: 0
  length of longest word in corpus: 15
Before empty brand_name: 927861
After empty brand_name: 252719
[125.69549107551575] Brand name filled.
[126.28608417510986] Name concatenated
[133.23039627075195] Item description concatenated.
[133.92292404174805] select name
[167.14917182922363] select category_name




[178.5758183002472] select brand_name
[184.75419211387634] select gencat_cond
[190.9366328716278] select subcat_1_cond
[197.15698981285095] select subcat_2_cond
[203.48507714271545] select has_brand
[203.61310505867004] select shipping


  del sys.path[0]
  del sys.path[0]


[203.74375653266907] select item_condition_id
[203.89841198921204] select item_description


  del sys.path[0]


[488.6556258201599] Merge vectorized
(2175020, 8961796)
[578.784182548523] TF/IDF completed
(1481661, 8961796)
[627.6966650485992] Drop only in train or test cols: 5976503
[1606.3656115531921] Train Ridge completed. Iterations: None
[1606.8883068561554] Predict Ridge completed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
