In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from gensim.models import Word2Vec
import tqdm
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Training data first, then the test data (the rows of the submission file).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "submission.csv"))

In [None]:
# Lookup table: value of the 'response_corporate' column -> country name.
corp_to_country = {
    'LGEAF': 'Nigeria',
    'LGEAG': 'Austria',
    'LGEAP': 'Australia',
    'LGEAR': 'Argentina',
    'LGEAS': 'Algeria',
    'LGEBN': 'Netherlands',
    'LGEBT': 'Portugal',
    'LGECB': 'Colombia',
    'LGECH': 'China',
    'LGECI': 'Canada',
    'LGECL': 'Chile',
    'LGECZ': 'Czech',
    'LGEDG': 'Germany',
    'LGEIN': 'Indonesia',
    'LGEIR': 'Iran',
    'LGEIS': 'Italy',
    'LGEJP': 'Japan',
    'LGEKR': 'Korea',
    'LGELA': 'Latvia',
    'LGELF': 'Jordan',
    'LGEMC': 'Morocco',
    'LGEMK': 'Hungary',
    'LGEML': 'Malaysia',
    'LGEMS': 'Mexico',
    'LGEPH': 'Philippines',
    'LGEPL': 'Poland',
    'LGEPR': 'Peru',
    'LGEPS': 'Guatemala',
    'LGEPT': 'Portugal',
    'LGERA': 'Russia',
    'LGERO': 'Romania',
    'LGESA': 'South Africa',
    'LGESJ': 'Saudi Arabia',
    'LGESL': 'Singapore',
    'LGESP': 'Brazil',
    'LGETH': 'Thailand',
    'LGETK': 'Turkey',
    'LGETT': 'Taiwan',
    'LGEUK': 'United Kingdom',
    'LGEUR': 'Ukraine',
    'LGEUS': 'United States',
    'LGEVH': 'Vietnam',
    'LGEYK': 'Israel',
    'LGESW': 'Denmark',
    'LGEIL': 'India',
    'LGEGF': 'U.A.E',
    'LGEEG': 'Egypt',
    'LGEEF': 'Ethiopia',
    'LGEES': 'Spain',
    'LGEHK': 'Hong Kong',
    'LGEHS': 'Greece',
    'LGEFS': 'France',
    'LGEEB': 'Switzerland'
}

# New column 'response_corporate_1' holds the country name for each row;
# values of 'response_corporate' that are not keys of the table become NaN.
df_train['response_corporate_1'] = df_train['response_corporate'].map(corp_to_country)
df_test['response_corporate_1'] = df_test['response_corporate'].map(corp_to_country)

# Lookup table: country name (the values used in corp_to_country) -> continent.
country_to_continent = {
    'Nigeria': 'Africa',
    'Austria': 'Europe',
    'Australia': 'Oceania',
    'Argentina': 'South America',
    'Algeria': 'Africa',
    'Netherlands': 'Europe',
    'Portugal': 'Europe',
    'Colombia': 'South America',
    'China': 'Asia',
    'Canada': 'North America',
    'Chile': 'South America',
    'Czech': 'Europe',
    'Germany': 'Europe',
    'Indonesia': 'Asia',
    'Iran': 'Asia',
    'Italy': 'Europe',
    'Japan': 'Asia',
    'Korea': 'Asia',
    'Latvia': 'Europe',
    'Jordan': 'Asia',
    'Morocco': 'Africa',
    'Hungary': 'Europe',
    'Malaysia': 'Asia',
    'Mexico': 'North America',
    'Philippines': 'Asia',
    'Poland': 'Europe',
    'Peru': 'South America',
    'Guatemala': 'North America',
    'Russia': 'Europe',
    'Romania': 'Europe',
    'South Africa': 'Africa',
    'Saudi Arabia': 'Asia',
    'Singapore': 'Asia',
    'Brazil': 'South America',
    'Thailand': 'Asia',
    'Turkey': 'Asia',
    'Taiwan': 'Asia',
    'United Kingdom': 'Europe',
    'Ukraine': 'Europe',
    'United States': 'North America',
    'Vietnam': 'Asia',
    'Israel': 'Asia',
    'Denmark': 'Europe',
    'India': 'Asia',
    'U.A.E': 'Asia',
    'Egypt': 'Africa',
    'Ethiopia': 'Africa',
    'Spain': 'Europe',
    'Hong Kong': 'Asia',
    'Greece': 'Europe',
    'France': 'Europe',
    'Switzerland': 'Europe'
}
# Derive the continent from the country name stored in 'response_corporate_1'.
# country_to_continent is keyed by country name, so feeding it the raw
# 'response_corporate' codes (e.g. 'LGEAF') matches nothing and yields an
# all-NaN column.
df_train['customer_country.1'] = df_train['response_corporate_1'].map(country_to_continent)
df_test['customer_country.1'] = df_test['response_corporate_1'].map(country_to_continent)


# Impute missing 'business_area' / 'business_subarea' with the most frequent
# value among the rows that share the same 'bant_submit'. Each frame is
# imputed from its own modes.
# Series.mode() skips NaN and is empty for a group with no non-null value, in
# which case `.iloc[0]` would raise IndexError; such groups are left as NaN.
for frame in (df_train, df_test):
    for col in ('business_area', 'business_subarea'):
        group_mode = frame.groupby('bant_submit')[col].transform(
            lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
        )
        frame[col] = frame[col].fillna(group_mode)

# Frequency encoding: replace each key by the number of times it occurs in the
# TRAINING data. Keys without a training count (unseen in train, or NaN, which
# value_counts() drops) get 0. A vectorised `.map` replaces the row-wise
# `.apply(lambda x: counts[x])`, which raised KeyError on such keys.
customeridx_counts = df_train['customer_idx'].value_counts()
df_train['customer_idx_freq'] = df_train['customer_idx'].map(customeridx_counts).fillna(0).astype('int64')
df_test['customer_idx_freq'] = df_test['customer_idx'].map(customeridx_counts).fillna(0).astype('int64')

lead_owner_counts = df_train['lead_owner'].value_counts()
df_train['lead_owner_freq'] = df_train['lead_owner'].map(lead_owner_counts).fillna(0).astype('int64')
df_test['lead_owner_freq'] = df_test['lead_owner'].map(lead_owner_counts).fillna(0).astype('int64')

response_corporate_counts = df_train['response_corporate'].value_counts()
df_train['response_corporate_freq'] = df_train['response_corporate'].map(response_corporate_counts).fillna(0).astype('int64')
df_test['response_corporate_freq'] = df_test['response_corporate'].map(response_corporate_counts).fillna(0).astype('int64')

# Fill missing 'customer_country' in both frames with the most frequent
# country of the training data.
most_frequent = df_train['customer_country'].mode()[0]
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that
# form is a chained in-place call on a selected column, which under pandas
# Copy-on-Write modifies a temporary and leaves the frame unchanged.
df_train['customer_country'] = df_train['customer_country'].fillna(most_frequent)
df_test['customer_country'] = df_test['customer_country'].fillna(most_frequent)

def fillna_with_mode(x):
    """Return `x` with missing entries replaced by its most frequent value.

    `Series.mode()` skips NaN, so it is empty when `x` has no non-null value;
    in that case `x` is returned unchanged. When several values tie, the first
    one reported by `mode()` is used.
    """
    candidates = x.mode()
    return x if candidates.empty else x.fillna(candidates[0])

# Within each customer ('customer_idx'), fill missing 'customer_type' with the
# most frequent value observed for that customer in the same frame.
for frame in (df_train, df_test):
    frame['customer_type'] = frame.groupby('customer_idx')['customer_type'].transform(fillna_with_mode)

# Mean of 'historical_existing_cnt' for each 'enterprise' category, computed on the training data.
enterprise_mean = df_train.groupby('enterprise')['historical_existing_cnt'].mean()

# Missing-value handler (added).
def fillna_by_enterprise_mean(row):
    """Return the row's 'historical_existing_cnt', or its enterprise mean when that is missing.

    The mean is looked up in the module-level `enterprise_mean` by
    `row['enterprise']`.
    NOTE(review): no call to this function appears in the visible cells.
    """
    count = row['historical_existing_cnt']
    return enterprise_mean[row['enterprise']] if pd.isnull(count) else count

# Mark missing 'historical_existing_cnt' with the sentinel -1, applying the
# same transformation to the training and the test data.
# The result is assigned back: `df[col].fillna(..., inplace=True)` is a chained
# in-place call that leaves the frame unchanged under pandas Copy-on-Write.
df_train['historical_existing_cnt'] = df_train['historical_existing_cnt'].fillna(-1)
df_test['historical_existing_cnt'] = df_test['historical_existing_cnt'].fillna(-1)

def _impute_from_train_groups(train, test, value_col, group_col, stat):
    """Fill NaNs of `value_col` in both frames with a per-group statistic of the training data.

    Args:
        train: training frame; the statistic is computed from it before any filling.
        test: test frame; it receives the training statistic, not its own.
        value_col: column whose missing values are filled.
        group_col: column that defines the groups.
        stat: aggregation name handed to `GroupBy.agg` ('median' or 'mean' here).

    Returns:
        The per-group statistic, a Series indexed by the values of `group_col`.

    Rows whose group has no statistic (unknown or NaN group, or a group that is
    entirely NaN in the training data) stay NaN.
    """
    per_group = train.groupby(group_col)[value_col].agg(stat)
    for frame in (train, test):
        # Assign back rather than `frame[col].fillna(..., inplace=True)`: the
        # chained in-place form leaves the frame unchanged under pandas
        # Copy-on-Write.
        frame[value_col] = frame[value_col].fillna(frame[group_col].map(per_group))
    return per_group


# The order of the calls matters: a column imputed twice is first filled per
# 'business_unit', and only what is still missing is filled per 'bant_submit'
# (the second statistic is computed on the partially filled training column).
# The result names are kept from the original cell; note that the first three
# hold medians, not means.

# ver_win_rate_x: median per bant_submit.
mean_per_bu = _impute_from_train_groups(df_train, df_test, 'ver_win_rate_x', 'bant_submit', 'median')

# ver_win_ratio_per_bu: median per business_unit, then median per bant_submit.
mean_per_bu = _impute_from_train_groups(df_train, df_test, 'ver_win_ratio_per_bu', 'business_unit', 'median')
mean_per_bant_submit = _impute_from_train_groups(df_train, df_test, 'ver_win_ratio_per_bu', 'bant_submit', 'median')

# com_reg_ver_win_rate: mean per business_unit, then mean per bant_submit.
mean_per_bu = _impute_from_train_groups(df_train, df_test, 'com_reg_ver_win_rate', 'business_unit', 'mean')
mean_per_submit = _impute_from_train_groups(df_train, df_test, 'com_reg_ver_win_rate', 'bant_submit', 'mean')

def _share_of_customer_total(frame):
    """Return each row's 'historical_existing_cnt' divided by its customer's total.

    The total is the sum of 'historical_existing_cnt' over the rows with the
    same 'customer_idx'. When that total is 0 the division gives +/-inf (or NaN
    for 0/0); infinities are turned into NaN here so that they are treated as
    missing and do not turn the column mean used for imputation into inf/NaN.
    """
    customer_total = frame.groupby('customer_idx')['historical_existing_cnt'].transform('sum')
    share = frame['historical_existing_cnt'] / customer_total
    return share.replace([np.inf, -np.inf], np.nan)


df_train['historical_conversion_rate_per_customer'] = _share_of_customer_total(df_train)
df_test['historical_conversion_rate_per_customer'] = _share_of_customer_total(df_test)

# Fill what is still missing with the column mean of the same frame. The result
# is assigned back because a chained `df[col].fillna(..., inplace=True)` leaves
# the frame unchanged under pandas Copy-on-Write. (The original cell repeated
# this fill a second time, which could not change anything.)
mean_conversion_rate = df_train['historical_conversion_rate_per_customer'].mean()
df_train['historical_conversion_rate_per_customer'] = df_train['historical_conversion_rate_per_customer'].fillna(mean_conversion_rate)

mean_conversion_rate_ts = df_test['historical_conversion_rate_per_customer'].mean()
df_test['historical_conversion_rate_per_customer'] = df_test['historical_conversion_rate_per_customer'].fillna(mean_conversion_rate_ts)
# Impute missing product columns with the most frequent value of their group,
# each frame using its own modes:
#   'product_category'    grouped by 'business_unit'
#   'product_subcategory' grouped by 'bant_submit'
# Series.mode() skips NaN and is empty for a group with no non-null value, in
# which case `.iloc[0]` would raise IndexError; such groups are left as NaN.
for frame in (df_train, df_test):
    for col, group_col in (('product_category', 'business_unit'), ('product_subcategory', 'bant_submit')):
        group_mode = frame.groupby(group_col)[col].transform(
            lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
        )
        frame[col] = frame[col].fillna(group_mode)

    # Within each customer, fill missing 'customer_job' with that customer's most frequent job.
    frame['customer_job'] = frame.groupby('customer_idx')['customer_job'].transform(fillna_with_mode)
def fillna_with_mode_safe(x):
    """Return the most frequent value of `x`, or 'Unknown' when `x` has none.

    `Series.mode()` skips NaN, so an empty or all-NaN series has no mode.
    """
    modes = x.mode()
    if not modes.empty:
        return modes.iloc[0]
    return 'Unknown'  # a different fallback could be chosen here

# Fill missing 'product_modelname' with the most frequent model name of the
# same 'product_subcategory' (or 'Unknown' when that subcategory has none),
# each frame using its own statistics.
for frame in (df_train, df_test):
    subcategory_mode = frame.groupby('product_subcategory')['product_modelname'].transform(fillna_with_mode_safe)
    frame['product_modelname'] = frame['product_modelname'].fillna(subcategory_mode)

def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical series into integer codes.

    Every element is first cast to `str`; the distinct strings are then sorted
    and numbered from 0 in that order, and each element is replaced by the
    number of its string. The index of the input is preserved.
    """
    as_text = series.astype(str)
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

# Columns to label-encode.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "response_corporate_1",
    "expected_timeline"
]

# Stack the training rows on top of the test rows so that every column gets a
# single code table covering both frames.
df_all = pd.concat([frame[label_columns] for frame in (df_train, df_test)])

for col in label_columns:
    encoded = label_encoding(df_all[col])
    df_all[col] = encoded

# Cat2Vec: train Word2Vec on the encoded rows, every row being one "sentence"
# whose tokens are the integer label codes of its categorical columns.
# NOTE(review): label codes restart at 0 in every column, so one integer token
# stands for unrelated categories of different columns - confirm this is intended.
# NOTE(review): gensim documents that a fixed `seed` is only fully reproducible
# with workers=1 (plus a fixed PYTHONHASHSEED) - confirm whether that is needed.
cat2vec_model = Word2Vec(sentences=df_all[label_columns].values.tolist(), vector_size=100, window=5, min_count=1, workers=4, seed=400)

# Mean over all embedding vectors (one vector of length vector_size).
average_vector = np.mean(cat2vec_model.wv.vectors, axis=0)
# Scalar fallback for a token missing from the vocabulary. The original fallback
# was the whole `average_vector`, which would have put an array into a column
# that otherwise holds one float per row.
fallback_value = average_vector.mean()

for col in label_columns:
    # Replace every code by the mean of the components of its embedding, i.e. one scalar per category.
    df_all[col] = df_all[col].apply(lambda x: cat2vec_model.wv[x].mean() if x in cat2vec_model.wv else fallback_value)

# Write the columns back by position: the first len(df_train) rows of df_all are
# the training rows, the rest are the test rows. Going through `.to_numpy()`
# avoids depending on index alignment (df_all carries the row labels of both
# frames, so its index contains duplicates).
n_train = len(df_train)
for col in label_columns:
    df_train[col] = df_all[col].iloc[:n_train].to_numpy()
    df_test[col] = df_all[col].iloc[n_train:].to_numpy()

# Final clean-up of both frames: infinities become NaN, then every remaining
# NaN is replaced by the mean of its column within the same frame.
for frame in (df_train, df_test):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)
for frame in (df_train, df_test):
    frame.fillna(frame.mean(), inplace=True)

# Columns removed from the training features.
unused_columns = ['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver','ver_cus','ver_pro']
df_train.drop(columns=unused_columns, inplace=True)

In [None]:
from imblearn.combine import SMOTEENN

# Feature matrix and target.
X = df_train.drop('is_converted', axis=1)
y = df_train['is_converted']

# Hold out the validation rows BEFORE resampling. Resampling the whole data set
# first (as the original cells did) lets synthetic samples interpolated from
# training rows - and a class balance that does not exist in real data - end up
# in the validation set, which inflates the validation scores.
# `stratify=y` keeps the original class ratio in both parts.
x_train_raw, x_val, y_train_raw, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=400,
    stratify=y,
)

# Rebalance the training part only (SMOTE over-sampling followed by ENN cleaning).
smote_enn = SMOTEENN(random_state=400)
X_resampled, y_resampled = smote_enn.fit_resample(x_train_raw, y_train_raw)

# Names used by the following cells.
x_train, y_train = X_resampled, y_resampled

In [None]:
# LightGBM hyper-parameters, shared by the stand-alone model and by the base
# learner that AdaBoost wraps.
lgbm_params = dict(
    scale_pos_weight=8.25,
    learning_rate=0.085,
    num_iterations=1000,  # appears to be the same setting as n_estimators
    max_depth=5,
    num_leaves=31,
    n_jobs=-1,
    boost_from_average=False,
    objective='binary',
    random_state=42,
)

# NOTE(review): `model` is not used by any of the visible cells; only `clf` is fitted.
model = LGBMClassifier(**lgbm_params)

# AdaBoost over 15 LightGBM base learners.
clf = AdaBoostClassifier(
    estimator=LGBMClassifier(**lgbm_params),
    n_estimators=15,
    learning_rate=0.001,
    random_state=42,
)

# Any NaN left in the features is replaced by 0 before fitting.
clf.fit(x_train.fillna(0), y_train)

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of a prediction.

    Args:
        y_test: true labels.
        y_pred: predicted labels for the same rows.

    The confusion matrix is laid out in the label order [True, False].
    """
    label_order = [True, False]
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, labels=label_order)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels=label_order)

    print("오차행렬:\n", cm)
    for template, value in (
        ("\n정확도: {:.4f}", acc),
        ("정밀도: {:.4f}", prec),
        ("재현율: {:.4f}", rec),
        ("F1: {:.4f}", f1),
    ):
        print(template.format(value))

# Score the model on the held-out validation split.
pred = clf.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

# Features for the final prediction: drop the target, the row id and the
# columns that were also removed from the training data.
x_test = df_test.drop(["is_converted", "id"], axis=1)

x_test.drop(['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver','ver_cus','ver_pro'], axis=1, inplace=True)

# Use exactly the columns the model was fitted on, in the same order, so that
# a differently ordered test file cannot silently shift features.
x_test = x_test[x_train.columns]

test_pred = clf.predict(x_test.fillna(0))
# Number of rows predicted True. A bare `sum(test_pred)` in the middle of a cell
# is evaluated and discarded, so the count is printed explicitly.
print(test_pred.sum())

# Re-read the submission file (df_test now holds the preprocessed data),
# attach the predictions and save the file under the same name.
df_sub = pd.read_csv("submission.csv").assign(is_converted=test_pred)
df_sub.to_csv("submission.csv", index=False)