- Day 41 : 21/03/22

# LGB and FM

In [4]:
import pandas as pd
import numpy as np
import time
import gc
import string
import re

from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection.univariate_selection import SelectKBest, f_regression
from sklearn.preprocessing import LabelBinarizer

#import wordbatch
#from wordbatch.extractors import WordBag
#from wordbatch.models import FM_FTRL

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb

In [5]:
def rmse(predicted, actual):
    """Return the root-mean-squared error between predictions and targets.

    Both arguments must support element-wise subtraction and expose a
    ``.mean()`` method on the result (e.g. NumPy arrays or pandas Series).
    """
    residual = predicted - actual
    mean_squared_error = (residual ** 2).mean()
    return np.sqrt(mean_squared_error)

In [6]:
def split_cat(text):
    """Split a '/'-delimited category path into its levels.

    Args:
        text: The category string, e.g. ``'Men/Tops/T-shirts'``.

    Returns:
        The list produced by ``text.split('/')``. When ``text`` cannot be
        split that way (for example a missing value such as ``None`` or
        ``NaN``), a 3-tuple of ``'No Label'`` placeholders is returned.
    """
    try:
        return text.split('/')
    except (AttributeError, TypeError):
        # Only "this value is not a splittable string" is treated as a
        # missing label; anything else (e.g. KeyboardInterrupt) propagates.
        return ('No Label', 'No Label', 'No Label')

In [7]:
class TargetEncoder:
    """Smoothed mean-target encoder for categorical columns.

    Each category is replaced by a blend of the category's mean target and
    the global target mean (the prior). The blend weight is a sigmoid of the
    category's sample count, so categories with few samples are pulled
    towards the prior.
    """

    def __repr__(self):
        return 'TargetEncoder'

    def __init__(self, cols, smoothing=1, min_samples_leaf=1, noise_level=0, keep_original=False):
        """Store the encoder configuration.

        Args:
            cols: Names of the columns to encode.
            smoothing: Divisor inside the sigmoid; larger values flatten the
                transition between prior and category mean.
            min_samples_leaf: Sample count at which the category mean and the
                prior are weighted equally.
            noise_level: Scale of the multiplicative Gaussian noise applied to
                the encoded values (0 disables noise).
            keep_original: When true, write encodings to ``<col>_te`` and leave
                the source column untouched; otherwise overwrite the column.
        """
        self.cols = cols
        self.smoothing = smoothing
        self.min_samples_leaf = min_samples_leaf
        self.noise_level = noise_level
        self.keep_original = keep_original

    @staticmethod
    def add_noise(series, noise_level):
        """Scale each value by ``1 + noise_level * N(0, 1)``."""
        return series * (1 + noise_level * np.random.randn(len(series)))

    def encode(self, train, test, target):
        """Encode every configured column of ``train`` and ``test``.

        Both frames are modified in place and also returned.

        Args:
            train: Training frame containing the columns in ``self.cols``.
            test: Test frame containing the same columns.
            target: Named target series aligned with ``train``.

        Returns:
            The ``(train, test)`` pair with encoded columns written.
        """
        for col in self.cols:
            encoded_trn, encoded_tst = self.encode_column(train[col], test[col], target)
            # Either add a new '<col>_te' column or replace the original one.
            out_col = col + '_te' if self.keep_original else col
            train[out_col] = encoded_trn
            test[out_col] = encoded_tst
        return train, test

    @staticmethod
    def _apply_lookup(series, lookup, prior, name):
        """Map ``series`` through ``lookup``; unseen categories get ``prior``."""
        encoded = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left',
        )['average'].rename(name).fillna(prior)
        # pd.merge does not keep the left index, so restore it.
        encoded.index = series.index
        return encoded

    def encode_column(self, trn_series, tst_series, target):
        """Compute the smoothed target encoding for one column.

        Args:
            trn_series: Named training column used to fit the encoding.
            tst_series: Test column with the same name as ``trn_series``.
            target: Named target series aligned with ``trn_series``.

        Returns:
            A ``(train_encoded, test_encoded)`` pair of series indexed like
            their inputs, with noise applied according to ``noise_level``.
        """
        temp = pd.concat([trn_series, target], axis=1)
        # Per-category target mean and sample count.
        averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean', 'count'])
        # Sigmoid weight: approaches 1 as the category count grows.
        weight = 1 / (1 + np.exp(-(averages['count'] - self.min_samples_leaf) / self.smoothing))
        # Global target mean, used as the prior.
        prior = target.mean()
        # The larger the count, the less the prior contributes.
        blended = prior * (1 - weight) + averages['mean'] * weight
        # Two-column table: category value -> 'average'.
        lookup = blended.rename('average').reset_index()

        encoded_name = trn_series.name + '_mean'
        ft_trn_series = self._apply_lookup(trn_series, lookup, prior, encoded_name)
        ft_tst_series = self._apply_lookup(tst_series, lookup, prior, encoded_name)

        return (
            self.add_noise(ft_trn_series, self.noise_level),
            self.add_noise(ft_tst_series, self.noise_level),
        )

In [8]:
def to_number(x):
    """Convert a digit-only token to an int capped at 100.

    Args:
        x: The token to convert, normally a string.

    Returns:
        ``int(x)`` clamped to at most 100 when ``x`` consists solely of
        digits; 0 for anything else, including non-string input.
    """
    try:
        if not x.isdigit():
            return 0
        # isdigit() also accepts characters such as superscript digits that
        # int() rejects; those raise ValueError and fall through to 0.
        return min(int(x), 100)
    except (AttributeError, TypeError, ValueError):
        return 0

In [9]:
def sum_numbers(desc):
    """Sum the numeric tokens found in a whitespace-separated description.

    Args:
        desc: The description text.

    Returns:
        The sum of ``to_number(token)`` over every whitespace-separated token
        of ``desc``, or 0 when ``desc`` is not a string.
    """
    if not isinstance(desc, str):
        return 0
    # No try/except is needed: str.split() cannot fail on a str and
    # to_number() already maps every unconvertible token to 0.
    return sum(to_number(token) for token in desc.split())