In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import random
import os
import warnings
import matplotlib.pyplot as plt # 득점모델 변수 중요도
import seaborn as sns

warnings.filterwarnings("ignore")

def visual_df(train):
    feature_selection = [f for f in train.columns if train[f].isnull().sum()<30000 and f != 'customer_country.1']
    train = train[feature_selection]
    feature_summary = pd.DataFrame(train.dtypes, columns=["Data Type"])
    feature_summary.reset_index(inplace=True)
    feature_summary.rename(columns={'index': 'Feature 이름'}, inplace=True)
    feature_summary['고유값 수'] = train.nunique().values
    feature_summary['결측값 수'] = train.isnull().sum().values
    return feature_summary
    

def seed_everything(seed=3):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    
seed_everything()

def get_clf_eval(y_test, y_pred=None):

    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

def log_transform(data,cols):
    for col in cols :
        data[col]=np.log1p(data[col]) 
    return data 

def label_encoding(series):
    # """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}
    # 모든 요소를 문자열로 변환
    mode = np.flip(series.value_counts().index.values)
    # unique value 찾고 아스키코드 기준으로 sort
    # my_dict을 통해 unique value에 대응하는 label 생성 
    for idx, value in enumerate(mode):
        my_dict[value] = idx
    # my_dict의 index와 매치되는 series값을 my_dict의 value로 변환한다.
    series = series.map(my_dict)

    return series

def same_word_detect(df, categorical_list):
    for feature in categorical_list:
        df[feature] = df[feature].str.lower()
        # 특수문자를 대체 문자열 지정
        replacement = {'/': '', 
                       '-':'', 
                       '_':'', 
                       ' ':'', 
                       '~':'',
                       '&':'',
                       '.':'',
                       ',':''}
        # replace() 함수를 사용하여 특수 문자 대체
        df[feature].replace(replacement, inplace=True)
    return df

def preprocessing(train, test):

    label_columns = [feature for feature in train.columns if train[feature].dtype=='O' or feature == 'lead_owner' ]
    

    df_all = pd.concat([train[label_columns], test[label_columns]])
    df_all = df_all.astype(str)
    df_all = same_word_detect(df_all, label_columns)
    temp = df_all
 
    
    # label encoding
    for col in label_columns:
        df_all[col] = label_encoding(df_all[col])
    # test set 분리 
    for col in label_columns:  
        train[col] = df_all.iloc[: len(train)][col]
        test[col] = df_all.iloc[len(train) :][col]
    
    # 결측값 처리
    train = train.fillna(0)
    test = test.fillna(0)
    
    return train, test

def model_train(train, test, model):

    train, test = preprocessing(train, test)
    x_train, x_val, y_train, y_val = train_test_split(
    train.drop("is_converted", axis=1),
    train["is_converted"],
    test_size=0.2,
    stratify=train["is_converted"],
    shuffle=True,
    random_state=42,)
    
    model = model
    model.fit(x_train, y_train)
    pred = model.predict(x_val)
    metrics = get_clf_eval(y_val, pred)
    print(f'validaion에서 true: {sum(pred)}')
    print(metrics)

    ftr_importances_values = model.feature_importances_
    ftr_importances = pd.Series(ftr_importances_values, index=x_train.columns)
    ftr_top = ftr_importances.sort_values(ascending=False)[:20]
    print(ftr_top[:5])
    plt.figure(figsize=(8, 6))
    sns.barplot(x=ftr_top, y=ftr_top.index)
    plt.show()
    return model, metrics

In [4]:
train = pd.read_csv('/home/workspace/LGamiers/EDA/train.csv')
test = pd.read_csv("/home/workspace/LGamiers/Model/submission.csv")
train, test = preprocessing(train, test)

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
def per_customer_idx_fit_predict(train, test):
    dic = {}

    for id in tqdm(test['customer_idx'].value_counts().index):
        if len(train[train['customer_idx'] == id]) > 10:
            train_x = train[train['customer_idx'] == id]
            train_y = train_x['is_converted']

            per_model = DecisionTreeClassifier()
            per_model.fit(train_x, train_y)

            key = str(id)
            value = per_model
            dic[key] = value

    per_model = DecisionTreeClassifier(random_state=42)
    train_x = train.drop('is_converted', axis= 1)
    train_y = train['is_converted']
    per_model.fit(train_x, train_y)
    dic['nan'] = per_model

    # test predict
    for i in tqdm(range(len(test))):
        key = test['customer_idx'][i]
        if key in dic.keys():
            model = dic[key]
        else:
            model = dic['nan']

        test_x = test.drop(['is_converted', 'id'], axis=1)
        row = pd.DataFrame(test_x.iloc[i])
        test['is_converted'][i] = model.predict(row.T)

    return test

In [67]:
# 예측 전 
sum(test['is_converted'])

880

In [68]:
train = pd.read_csv('/home/workspace/LGamiers/EDA/train.csv')
test = pd.read_csv("/home/workspace/LGamiers/Model/submission.csv")
train, test = preprocessing(train, test)
pred_df = per_customer_idx_fit_predict(train, test)

100%|██████████| 4069/4069 [00:01<00:00, 3069.25it/s]
100%|██████████| 5271/5271 [00:06<00:00, 770.89it/s]


In [71]:
# 예측 후
sum(pred_df['is_converted'])

939

In [72]:
# 제출 파일 저장
pred_df.to_csv("submission.csv", index=False)

In [70]:
pred_df

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.00,15568,4,0.073248,47466,31,1,53.0,0.0,...,51,449,1,0,0.001183,0.049840,10,74,1001,False
1,9738,0.25,6645,2,0.000000,5405,31,0,0.0,0.0,...,50,449,0,0,0.000013,0.000000,3,85,1014,True
2,8491,1.00,15517,4,0.000000,13597,30,0,0.0,0.0,...,47,448,0,0,0.000060,0.131148,5,65,461,True
3,19895,0.50,6644,4,0.118644,17204,33,1,0.0,0.0,...,50,446,0,0,0.001183,0.049840,10,86,1064,False
4,10465,1.00,15311,4,0.074949,2329,31,1,2.0,1.0,...,51,448,1,1,0.003079,0.064566,11,79,1055,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13855,0.50,15623,3,0.000000,40292,33,1,10.0,0.0,...,51,449,0,0,0.000000,0.000000,12,86,1066,False
5267,7979,0.25,14357,2,0.000000,47466,33,1,0.0,0.0,...,50,449,0,0,0.000000,0.000000,12,86,797,True
5268,12887,0.75,8100,3,0.000000,46227,30,1,0.0,0.0,...,51,448,0,0,0.000000,0.000000,12,86,1066,False
5269,17530,0.00,15435,2,0.000000,45667,31,0,0.0,0.0,...,39,449,0,0,0.000000,0.000000,12,86,716,False
