In [2]:
# Core libraries
import warnings
# Silences every warning for the rest of the session (this also hides
# deprecation / FutureWarning notices raised by pandas and scikit-learn).
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import statsmodels.api as sm
import re
from sklearn import set_config
%matplotlib inline

# Visualization and fonts
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic') # set the default plot font
plt.rc('axes', unicode_minus=False) # draw minus signs with the ASCII hyphen instead of the Unicode minus glyph
%config InlineBackend.figure_format='retina' # render inline figures at retina resolution for sharper text

# Preprocessing libraries
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer 
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SequentialFeatureSelector,SelectPercentile
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

# Training and evaluation libraries
from sklearn.model_selection import train_test_split, ShuffleSplit,cross_val_score, cross_validate, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score, precision_score, recall_score)

# Model libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb

- data load

In [3]:
# Load the training set and the submission (test) set, then stack them so the
# preprocessing below is applied to both at once.
df_train = pd.read_csv("./data/train.csv")  # training data
df_test = pd.read_csv("./data/submission.csv")  # test data (rows of the submission file)
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# the test rows reuse the labels 0..len(df_test)-1 of the training rows, and any
# later index-aligned assignment (df[col] = some_series) silently writes the
# values of the first training rows into the test rows.
df = pd.concat([df_train, df_test], ignore_index=True)
# To split the frame back after preprocessing, slice by position using the
# training row count (the original notes hard-coded it as 59299):
#   n_train = len(df_train)
#   df_train, df_test = df[:n_train], df[n_train:]

- customer_country

In [4]:
# Preprocessing: reduce each value to its country part, i.e. the text after the
# last '/' with surrounding whitespace removed. Missing values stay missing.
country_parts = df['customer_country'].str.rsplit('/', n=1)
df['customer_country'] = country_parts.str[-1].str.strip()

- customer_type

In [5]:
# Preprocessing: merge spelling variants of the same customer type into one label.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['customer_type']: that chained in-place call
# works on an intermediate Series and is not guaranteed to update df (it does
# nothing under pandas Copy-on-Write).
customer_type_aliases = {
    'End Customer': 'End-Customer',
    'End-user': 'End-Customer',
    'Commercial end-user': 'End-Customer',
    'Specifier / Influencer': 'Specifier/Influencer',
    'Software / Solution Provider': 'Software/Solution Provider',
    'Home Owner': 'Homeowner',
    'Etc.': 'Others',
    'Other': 'Others',
    # The original key ended with a tab character, so it only matched values
    # that contain that tab. The plain spelling is mapped as well.
    # NOTE(review): confirm 'Dealer/Distributor' should be merged into 'Distributor'.
    'Dealer/Distributor': 'Distributor',
    'Dealer/Distributor\t': 'Distributor',
}
df['customer_type'] = df['customer_type'].replace(customer_type_aliases)

- inquiry_type

In [6]:
# Preprocessing: merge duplicate spellings and typos of the same inquiry type.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['inquiry_type']: that chained in-place call
# works on an intermediate Series and is not guaranteed to update df (it does
# nothing under pandas Copy-on-Write).
inquiry_type_aliases = {
    'Quotation or purchase consultation': 'Quotation or Purchase Consultation',
    'quotation_or_purchase_consultation': 'Quotation or Purchase Consultation',
    'Quotation or Purchase consultation': 'Quotation or Purchase Consultation',
    'Purchase or Quotation': 'Quotation or Purchase Consultation',
    'Others': 'Other',
    'other_': 'Other',
    'other': 'Other',
    'ETC.': 'Other',
    'Etc.': 'Other',
    'others': 'Other',
    # The target label used to end with a tab character, which produced a
    # category distinct from the clean 'Technical Consultation' spelling.
    # (The original dict also listed the first key below twice.)
    'Usage or Technical Consultation': 'Technical Consultation',
    'usage or technical consultation': 'Technical Consultation',
    # Normalise labels that already carry the stray tab (e.g. written by an
    # earlier run of this cell in the same kernel).
    'Technical Consultation\t': 'Technical Consultation',
}
df['inquiry_type'] = df['inquiry_type'].replace(inquiry_type_aliases)

- product_modelname

In [7]:
# Preprocessing of product_modelname, written as one chain. The steps run in
# the order listed below.

# Region / country tags in parentheses that are stripped from the model name.
region_tag_pattern = r'\(NA\)|\(MEA\)|\(EU\)|\(EU/CIS\)|\(ASIA\)|\(INDIA\)|\(CIS\)|\(Colombia\)|\(SCA\)|\(EU Only\)|\(Brazil Only\)|\(LATAM\)|\(Japan\)'

# Values that contain both a model name and a model code -> keep the code only.
model_code_by_label = {
    'UltraWide Ergo(34WN780)': '34WN780',
    'UltraFine Ergo(32UN880)': '32UN880',
    'DualUp(28MQ780)': '28MQ780',
    'Ergo Dual(27QP88D)': '27QP88D',
    '65EP5G OLED Pro': '65EP5G',
}

# Different spellings that refer to the same thing.
model_aliases = {
    'B, 32HL512D': '32HL512D',
    'Diagnostic Monitors': 'Diagnostic Monitor',
    'SuperSign CMS': 'LG SuperSign CMS',
}

# Entries that are free text rather than a model name -> collapsed into 'Other'.
non_model_entries = [
    'Total Care Thru One-stop Service',
    'Architect , We are Meeting for Enqiry Generation ( This is not a Inquiry)',
    'Total Care Thru One',
    'Due to budget they have hold the requiement',
    'Required After 3 Months',
    'Want Split AC',
    'Only Installation Need',
    'Passed on to Fixxy distribution',
    'full',
    'This is being dealt with by LG Germany.',
    'SuperSign Media Editor',
    'SuperSign WB',
    'ALL Surgical',
    'Surgical',
    'diagnostic',
    'LGESL Export team is follow up the lead',
    'Video',
    'Inquiry forwarded to Shaker',
    'AI/Machine Learning | Antennas, Transmitters and Towers | Audience Measurement | Cameras and Lenses',
    'One:Quick',
    'Solution',
]

df['product_modelname'] = (
    df['product_modelname']
    .fillna('Anything')                               # missing values -> 'Anything'
    .str.replace(region_tag_pattern, '', regex=True)  # drop region tags
    .str.strip()
    .replace(model_code_by_label)
    .replace(model_aliases)
    .replace(non_model_entries, 'Other')
)

# Preprocessing: remove the space that follows a '-' (e.g. 'AB- C' -> 'AB-C')
def preprocess_modelname(modelname):
    """Return ``modelname`` with every '- ' collapsed to '-'.

    The hyphen itself is kept, so a spaced spelling such as '43HT3WJ- B'
    becomes '43HT3WJ-B'. (Replacing '- ' with an empty string would also
    delete the hyphen and yield '43HT3WJB'.)

    Parameters
    ----------
    modelname : str
        Raw model name.

    Returns
    -------
    str
        The model name without a space directly after any hyphen.
    """
    return modelname.replace('- ', '-')
# Run the hyphen clean-up helper on every model name (element-wise).
df['product_modelname'] = df['product_modelname'].map(preprocess_modelname)

- expected_timeline

In [80]:
# Load the sentiment-score table (it provides 'expected_timeline' and 'sentiment_score').
timeline_sentiment = pd.read_csv('./data/sentiment_df.csv', encoding='ISO-8859-1')
# Missing timelines become 0.
timeline_sentiment['expected_timeline'] = timeline_sentiment['expected_timeline'].fillna(0)
# Known period phrases -> numeric value (apparently a duration in months; ranges
# map to their midpoint, e.g. '3 months ~ 6 months' -> 4.5).
timeline_value_by_phrase = {
    '3 months': 3, '3 months ~ 6 months': 4.5, '3_months_~_6_months': 4.5,
    '45 days': 1.5, '6 months ~ 9 months': 7.5, '6_months_~_9_months': 7.5,
    '9 months - 1 year': 10.5, '9 months ~ 1 year': 10.5, '9_months_-_1_year': 10.5,
    'more than a year': 12, 'more then 3 months': 3, 'more_than_a_year': 12,
    'less than 3 months': 1.5, 'less than 5 months': 2.5, 'less than 6 months': 3,
    'less then 6 months': 3, 'less_than_3_months': 1.5,
}
timeline_sentiment['expected_timeline'] = timeline_sentiment['expected_timeline'].replace(timeline_value_by_phrase)
# Any value that is still text (a phrase not covered by the table above) is
# replaced by that row's sentiment score.
timeline_sentiment['expected_timeline'] = timeline_sentiment.apply(
    lambda row: row['sentiment_score'] if isinstance(row['expected_timeline'], str) else row['expected_timeline'],
    axis=1,
)
# Copy both columns into df.
if len(timeline_sentiment) == len(df):
    # Same number of rows: copy row-for-row by position. Assigning the Series
    # directly would align on index labels instead; when df's index contains
    # repeated labels (as after concatenating train and test without
    # ignore_index) the rows sharing a label would all receive the same
    # sentiment row.
    df['expected_timeline'] = timeline_sentiment['expected_timeline'].to_numpy()
    df['sentiment_score'] = timeline_sentiment['sentiment_score'].to_numpy()
else:
    # NOTE(review): row counts differ, so a positional copy is impossible; fall
    # back to the original label-aligned assignment. Confirm which rows of df
    # sentiment_df.csv is meant to cover.
    df['expected_timeline'] = timeline_sentiment['expected_timeline']
    df['sentiment_score'] = timeline_sentiment['sentiment_score']

- business_area

In [8]:
# Preprocessing: label rows whose business_area is missing as 'Others'.
# NOTE(review): the original comment described this as mode imputation, but the
# code fills the fixed label 'Others' - confirm that is the intended value.
area_is_missing = df['business_area'].isna()
df['business_area'] = df['business_area'].mask(area_is_missing, 'Others')

- business_subarea

In [None]:
# Preprocessing: label rows whose business_subarea is missing as 'Others'.
# NOTE(review): the original comment described this as mode imputation, but the
# code fills the fixed label 'Others' - confirm that is the intended value.
subarea_is_missing = df['business_subarea'].isna()
df['business_subarea'] = df['business_subarea'].mask(subarea_is_missing, 'Others')