<a href="https://colab.research.google.com/github/JINYUHOON/JINYUHOON/blob/main/%EC%98%A8%EB%9D%BC%EC%9D%B8%EC%87%BC%ED%95%91%EB%AA%B0%EA%B0%80%EA%B2%A9%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os,sys
from google.colab import drive
# Mount Google Drive so the notebook can read files stored there.
drive.mount('/content/drive')
my_path = '/content/notebooks'
# os.symlink raises FileExistsError when the link is already present, so only
# create it on the first run; this keeps the cell safe to re-execute.
if not os.path.lexists(my_path):
    os.symlink('/content/drive/My Drive/Colab Notebooks',my_path)
# Avoid stacking duplicate entries on sys.path when the cell is run again.
if my_path not in sys.path:
    sys.path.insert(0,my_path)

아래 사항을 고려하여 대형 온라인 쇼핑몰 제품 가격 예측을 수행하세요
- 제공되는 데이터 세트 mercari_train.tsv는 제품의 여러 속성 및 제품 설명 등의 텍스트 데이터로 구성된다.
- 데이터 전처리
    - Null 데이터는 적절한 문자열로 치환
    - 데이터 분포도를 확인하고 정규성 확보
    - item_description이 'No description yet' 인 경우 Null과 마찬가지로 적절한 값으로 변경
    - category_name이 Null이 아닌 경우 대 중 소 분류를 분리

- 피처 인코딩과 피처 벡터화
- 릿지 회귀 모델 구축 및 평가
- LightGBM 회귀 모델 구축과 앙상블을 이용한 최종 예측 평가

In [None]:
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np


# Load the tab-separated Mercari training set from the mounted Drive folder.
train_tsv_path = '/content/drive/MyDrive/cakd3 colab/textmining/dataset/mercari_train.tsv'
mercari_df = pd.read_csv(train_tsv_path, sep='\t')

# Report the (rows, columns) size, then preview the first rows as the cell output.
print(mercari_df.shape)
mercari_df.head()

In [None]:
# Print the column dtypes and non-null counts to spot columns with missing values.
mercari_df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Inspect the distribution of the raw target before any transformation.
y_train_df = mercari_df['price']
plt.figure(figsize=(6,4))
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the documented replacement. bins=50 mirrors the cap that
# distplot applied, which keeps the long-tailed price column quick to draw.
sns.histplot(y_train_df, bins=50, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('price distribution');

In [None]:
# Derive the log-transformed target from the DataFrame column instead of from
# y_train_df itself, so re-running this cell does not apply log1p twice.
y_train_df = np.log1p(mercari_df['price'])
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the documented replacement (bins=50 mirrors distplot's cap).
sns.histplot(y_train_df, bins=50, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('log1p(price) distribution');

In [None]:
# Log-transform the target in place. The untouched prices are kept once in
# 'price_raw' so that re-running this cell always transforms the original
# values instead of applying log1p to an already transformed column.
if 'price_raw' not in mercari_df.columns:
    mercari_df['price_raw'] = mercari_df['price']
mercari_df['price'] = np.log1p(mercari_df['price_raw'])

In [None]:
# Flag the rows whose description is the site's placeholder text, then count them.
boolean_cond = mercari_df['item_description'] == 'No description yet'
mercari_df.loc[boolean_cond, 'item_description'].count()

In [None]:
def split_cat(category_name):
    """Split a '/'-delimited category string into its levels.

    Parameters
    ----------
    category_name : str or missing value
        Category path such as 'Men/Tops/T-shirts'. Missing values (NaN, None)
        have no ``split`` method and fall back to the placeholder labels.

    Returns
    -------
    list of str
        The parts of ``category_name`` split on '/', or three 'Other_Null'
        placeholders when the value is not a string.
    """
    try:
        return category_name.split('/')
    except AttributeError:
        # Only the "not a string" case is expected here; any other error
        # should surface instead of being silently swallowed.
        return ['Other_Null', 'Other_Null', 'Other_Null']

# Split every category_name into its levels and store the first three levels
# (large / medium / small) in their own columns.
category_levels = mercari_df['category_name'].apply(split_cat)
mercari_df['cat_dae'], mercari_df['cat_jung'], mercari_df['cat_so'] = zip(*category_levels)

In [None]:
# Summarise the three category levels: value counts for the top level and the
# number of distinct values for the middle and bottom levels.
top_level_counts = mercari_df['cat_dae'].value_counts()
mid_level_count = mercari_df['cat_jung'].nunique()
low_level_count = mercari_df['cat_so'].nunique()

print('대분류 유형:\n', top_level_counts)
print('중분류 개수 : ', mid_level_count)
print('소분류 개수 : ', low_level_count)

In [None]:
# Replace missing values with one consistent placeholder string
# (the brand placeholder was previously misspelled as 'Othet_Null').
mercari_df['brand_name'] = mercari_df['brand_name'].fillna(value='Other_Null')
mercari_df['category_name'] = mercari_df['category_name'].fillna(value='Other_Null')
mercari_df['item_description'] = mercari_df['item_description'].fillna(value='Other_Null')
# The site's default text 'No description yet' carries no more information
# than a missing description, so map it to the same placeholder.
mercari_df['item_description'] = mercari_df['item_description'].replace('No description yet', 'Other_Null')
# Confirm how many nulls remain per column.
mercari_df.isnull().sum()

In [None]:
# Show how many distinct brands exist and which five occur most often.
brand_counts = mercari_df['brand_name'].value_counts()
print('brand_name의 유형건수 : ',mercari_df['brand_name'].nunique())
print('brand_name의 샘플 5건 :\n', brand_counts.head(5))

In [None]:
# Show how many distinct product names exist and preview the first ten.
name_column = mercari_df['name']
print('name의 종류 개수 :',name_column.nunique())
print('name sample 10건: \n', name_column.head(10))

In [None]:
# Widen the column display so long descriptions are not truncated.
pd.set_option('display.max_colwidth', 200)

# Report the mean description length in characters.
description_lengths = mercari_df['item_description'].str.len()
print('item_description 평균 문자열 크기: ', description_lengths.mean())

# Preview the first two descriptions as the cell output.
mercari_df['item_description'].head(2)

In [None]:
# Product names: plain token counts with the default vectorizer settings.
cnt_vec = CountVectorizer()
X_name = cnt_vec.fit_transform(mercari_df['name'])

# Item descriptions: TF-IDF over 1- to 3-grams, English stop words removed,
# vocabulary capped at 50,000 features.
tfidf_descp = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,3),
    stop_words='english',
)
X_descp = tfidf_descp.fit_transform(mercari_df['item_description'])

# Report the shape of each resulting matrix.
print('name vectorization shape :', X_name.shape)
print('item_description vetorization :', X_descp.shape)

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode each categorical column into a sparse matrix. A separate
# binarizer is kept per column so its fitted classes stay available.
# (The cat_dae binarizer used to be created and fitted twice; the redundant
# second pass has been removed.)
lb_brand_name = LabelBinarizer(sparse_output=True)
X_brand = lb_brand_name.fit_transform(mercari_df['brand_name'])

lb_item_cond_id = LabelBinarizer(sparse_output=True)
X_item_cond_id = lb_item_cond_id.fit_transform(mercari_df['item_condition_id'])

lb_shipping = LabelBinarizer(sparse_output=True)
X_shipping = lb_shipping.fit_transform(mercari_df['shipping'])

lb_cat_dae = LabelBinarizer(sparse_output=True)
X_cat_dae = lb_cat_dae.fit_transform(mercari_df['cat_dae'])

lb_cat_jung = LabelBinarizer(sparse_output=True)
X_cat_jung = lb_cat_jung.fit_transform(mercari_df['cat_jung'])

lb_cat_so = LabelBinarizer(sparse_output=True)
X_cat_so = lb_cat_so.fit_transform(mercari_df['cat_so'])

In [None]:
from scipy.sparse import hstack
import gc

# Sanity check: stack every feature matrix side by side and report the
# combined type and shape.
sparse_matrix_list = (
    X_name, X_descp, X_brand, X_item_cond_id,
    X_shipping, X_cat_dae, X_cat_jung, X_cat_so,
)
X_features_sparse = hstack(sparse_matrix_list).tocsr()
print(type(X_features_sparse), X_features_sparse.shape)

# The combined matrix was only needed for the check above; drop the name and
# run a garbage collection pass.
del X_features_sparse
gc.collect()

In [None]:
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between y and y_pred.

    Both inputs are passed through ``np.log1p`` before the squared
    differences are averaged and the square root is taken.
    """
    log_gap = np.log1p(y) - np.log1p(y_pred)
    mean_squared_gap = np.mean(np.square(log_gap))
    return np.sqrt(mean_squared_gap)

def evaluate_org_price(y_test, preds):
    """Score predictions with RMSLE after undoing the log1p transform.

    Both ``y_test`` and ``preds`` are passed through ``np.expm1`` and the
    results are handed to ``rmsle``.
    """
    actual_prices = np.expm1(y_test)
    predicted_prices = np.expm1(preds)
    return rmsle(actual_prices, predicted_prices)

In [None]:
import gc
from scipy.sparse import hstack

def model_train_predict(model, matrix_list, target=None, test_size=0.2, random_state=156):
    """Fit ``model`` on a train split of the stacked features and predict the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    matrix_list : sequence of sparse matrices
        Feature blocks that are stacked column-wise into one CSR matrix.
    target : array-like, optional
        Target values aligned with the rows of the stacked matrix. Defaults to
        the notebook-level ``mercari_df['price']`` so existing calls keep working.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.2).
    random_state : int, optional
        Seed for the train/test split (default 156).

    Returns
    -------
    tuple
        ``(preds, y_test)``: the model's predictions for the held-out rows and
        the matching true target values.
    """
    if target is None:
        target = mercari_df['price']

    X = hstack(matrix_list).tocsr()

    X_train, X_test, y_train, y_test = train_test_split(
        X, target, test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Drop the local references to the large matrices and run a collection pass.
    del X, X_train, y_train
    gc.collect()

    return preds, y_test

In [None]:
# Train the same Ridge model twice, once without and once with the item
# description features, and report the RMSLE of each run.
linear_model = Ridge(solver='lsqr', fit_intercept=False)

ridge_runs = (
    ('Item Description을 제외했을 때 rmsle 값 = ',
     (X_name, X_brand, X_item_cond_id, X_shipping, X_cat_dae, X_cat_jung, X_cat_so)),
    ('Item Description을 포함했을 때 rmsle 값 = ',
     (X_name, X_descp, X_brand, X_item_cond_id, X_shipping, X_cat_dae, X_cat_jung, X_cat_so)),
)

# After the loop, linear_preds / y_test hold the results of the final run
# (the one that includes the item description features).
for report_label, sparse_matrix_list in ridge_runs:
    linear_preds , y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)
    print(report_label, evaluate_org_price(y_test, linear_preds))

In [None]:
from lightgbm import LGBMRegressor

# Train LightGBM on the full feature set, item description included.
sparse_matrix_list = (X_name, X_descp, X_brand, X_item_cond_id, X_shipping, X_cat_dae, X_cat_jung, X_cat_so)

# The keyword was previously misspelled as 'nun_leaves', which LightGBM does not
# recognise, so the intended tree size of 125 leaves was never applied.
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125, random_state=156)
lgbm_preds, y_test = model_train_predict(model=lgbm_model, matrix_list=sparse_matrix_list)

print('LightGBM rmsle값 : ', evaluate_org_price(y_test,lgbm_preds))

In [None]:
# Blend the two models' log-space predictions with fixed weights
# (0.45 for LightGBM, 0.55 for Ridge) and score the blend.
lgbm_weight, ridge_weight = 0.45, 0.55
preds = lgbm_weight * lgbm_preds + ridge_weight * linear_preds

ensemble_rmsle = evaluate_org_price(y_test, preds)
print('LightGBM 과 Ridge를  ensemble한 최종 rmlse: ', ensemble_rmsle )