## 1. Import Library and define function 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
def get_clf_eval(y_test, y_pred=None):
    confusion = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['T[0]', 'F[1]'], columns = ['pred_T[0]', 'pred_F[1]'])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [3]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx + 1
    series = series.map(my_dict)

    return series

### 1.1 load data

In [4]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("origin_submission.csv")
df_all = pd.concat([df_train, df_test], axis = 0)
df_all.shape, df_train.shape, df_test.shape

((64570, 30), (59299, 29), (5271, 30))

In [5]:
df_all.isnull().sum()

bant_submit                    0
customer_country             982
business_unit                  0
com_reg_ver_win_rate       48214
customer_idx                   0
customer_type              45418
enterprise                     0
historical_existing_cnt    49539
id_strategic_ver           60533
it_strategic_ver           63396
idit_strategic_ver         59359
customer_job               20172
lead_desc_length               0
inquiry_type                2233
product_category           21232
product_subcategory        54542
product_modelname          54779
customer_country.1           982
customer_position              0
response_corporate             0
expected_timeline          33271
ver_cus                        0
ver_pro                        0
ver_win_rate_x             43780
ver_win_ratio_per_bu       47360
business_area              43780
business_subarea           57228
lead_owner                     0
is_converted                5271
id                         59299
dtype: int

## 2. Data preprocessing

### 2.1 drop columns

In [6]:
# 열 삭제
drop_col = ['customer_country', 'customer_country.1', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
            'product_subcategory', 'product_modelname', 'business_area', 'business_subarea', 'ver_cus', 'ver_pro']

df_all.drop(columns = drop_col, inplace = True)

### 2.2 결측치 처리(수치형 데이터)

In [7]:
# 결측값 0으로 넣을 컬럼
fillna_col = ['com_reg_ver_win_rate', 'historical_existing_cnt',
             'ver_win_rate_x', 'ver_win_ratio_per_bu']

for col in fillna_col:
    df_all[col] = df_all[col].fillna(0)

In [8]:
# df_all['country']
df_all['country'] = pd.read_csv('country_processing.csv')['country1']

In [9]:
df_all['customer_type'] = df_all['customer_type'].str.lower()
df_all['customer_type'] = df_all['customer_type'].replace(['etc.', 'other', 'others'], 'etc')
df_all['customer_type'] = df_all['customer_type'].replace(['end-customer', 'end customer', 'end-user'], 'end_user')
df_all['customer_type'] = df_all['customer_type'].replace(['specifier/ influencer', 'specifier / influencer'], 'influencer')
df_all['customer_type'] = df_all['customer_type'].replace(['homeowner', 'home owner'], 'home_owner')
df_all['customer_type'] = df_all['customer_type'].replace(['software/solution provider', 'software / solution provider'], 'solution_provider')
df_all['customer_type'] = df_all['customer_type'].replace(['engineer', 'hvac engineer'], 'engineer')
df_all['customer_type'] = df_all['customer_type'].replace(['distributor', 'dealer/distributor'], 'distributor')
df_all['customer_type'].fillna('etc', inplace = True)

In [10]:
# value_counts 가 1개인 것들 etc로 분류
value_counts = df_all['customer_type'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_type'] = df_all['customer_type'].apply(lambda x: 'etc' if x in values_to_replace else x)

In [11]:
df_all['inquiry_type'] = df_all['inquiry_type'].str.lower()
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['etc.', 'other', 'other_', 'others'], 'etc')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['sales inquiry', 'sales', 'probeam precio'], 'sales inquiry')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['quotation or purchase consultation', 'request for quotation or purchase', 'quotation_or_purchase_consultation', 'purchase or quotation', 'quotation_', 'purchase'], 'quotation_or_purchase_consultation')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['usage or technical consultation', 'technical consultation', 'request for technical consulting', 'usage_or_technical_consultation', 'technical_consultation', 'technical'], 'usage or technical consultation')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['vui lòng báo giá giúp mình sản phẩm đo thân nhiệt xin cảm ơn', 'tôi cần tham khảo giá và giải pháp từ lg'], 'quotation_or_purchase_consultation')
df_all['inquiry_type'] = df_all['inquiry_type'].replace(['toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung'], 'product information')
df_all['inquiry_type'].fillna('etc', inplace = True)

In [12]:
# value_counts 가 1개인 것들 etc로 분류
value_counts = df_all['inquiry_type'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['inquiry_type'] = df_all['inquiry_type'].apply(lambda x: 'etc' if x in values_to_replace else x)

In [13]:
# custoper_position 전처리
df_all['customer_position'] = df_all['customer_position'].replace(['ceo/founder', 'partner', 'vice president', 'c-level executive', 'director', 'vicepresident', 'c-levelexecutive', 'vp','leadership/executive office/owner', 'president'
                                                                   'principal & director', 'business partner', 'chairman', 'co-founder', 'chief executive officer', 'subsidiary sales (ise)', 'ceo/fundador', 'gerente', 'the big boss',
                                                                  'principal & director', 'president'], 'ceo')
df_all['customer_position'] = df_all['customer_position'].replace(['consultant', 'commercial consultant', 'architecture/consult', 'architect/consultant'], 'consult')
df_all['customer_position'] = df_all['customer_position'].replace(['customer', 'customer_position'], 'customer')
df_all['customer_position'] = df_all['customer_position'].replace(['decision-influencer', 'decision maker', 'decision influencer'], 'decision')
df_all['customer_position'] = df_all['customer_position'].replace(['distributor', 'cargo'], 'distributor')
df_all['customer_position'] = df_all['customer_position'].replace(['physics teacher','assistant professor','maths lecturer','science teacher','guest faculty','physics faculty','teacher/middle school coordinator','prof.',
                                                                   'academic specialist','principal at oxford integrated pu science college','math and physics teacher','professor of mathematics','physics and mathematics teacher',
                                                                   'assistant professor of english','educator','professor','quantitative aptitude faculty','english trainer for ielts,toefl,pte,gre,sat exams.','associate professor',
                                                                   'pgt physics','education professional','chemistry teacher','director cum faculty at gaining apex coaching centre','teacher','senior lecturer',
                                                                   'neet/ olympiad expert faculty','associate professor in electronics engg','education','pgt chemistry', 
                                                                   'academic coordinator/ post graduate teacher (accountancy, business studies)/ tgt (ict)','assistant professor of enlish'], 'education')
df_all['customer_position'] = df_all['customer_position'].replace(['entry level','intern','trainee','entrylevel','employee'], 'employee')

df_all['customer_position'] = df_all['customer_position'].replace(['exhibition','exhibitiontv'], 'exhibition')
df_all['customer_position'] = df_all['customer_position'].replace(['hospital', 'medical device manufacturer', 'medical imaging specialist', 'tierarzt', 'surgery professional', 'pathologist'], 'medical')
df_all['customer_position'] = df_all['customer_position'].replace(['not applicable','none','other','others'], 'etc')
df_all['customer_position'] = df_all['customer_position'].replace(['manager','associate/analyst','consulting','lider de desarrollo','decision-maker','business unit director','business development','operations',
                                                                   'product management','market intelligence/research'], 'pm')
df_all['customer_position'] = df_all['customer_position'].replace(['asst prof.','professional trainer','radiology professional'], 'professional')
df_all['customer_position'] = df_all['customer_position'].replace(['this is a consume display requirement for home purpose.','unpaid','homeowner','no influence'], 'unemployed')

In [14]:
# value_counts가 1인거 etc로 묶기
value_counts = df_all['customer_position'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_position'] = df_all['customer_position'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['customer_position'].fillna('etc', inplace = True)

In [15]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64570 entries, 0 to 5270
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              64570 non-null  float64
 1   business_unit            64570 non-null  object 
 2   com_reg_ver_win_rate     64570 non-null  float64
 3   customer_idx             64570 non-null  int64  
 4   customer_type            64570 non-null  object 
 5   enterprise               64570 non-null  object 
 6   historical_existing_cnt  64570 non-null  float64
 7   customer_job             44398 non-null  object 
 8   lead_desc_length         64570 non-null  int64  
 9   inquiry_type             64570 non-null  object 
 10  product_category         43338 non-null  object 
 11  customer_position        64570 non-null  object 
 12  response_corporate       64570 non-null  object 
 13  expected_timeline        31299 non-null  object 
 14  ver_win_rate_x           645

In [16]:
# 전처리 필요한 4개 컬럼 간단하게
value_counts = df_all['customer_job'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['customer_job'] = df_all['customer_job'].apply(lambda x: 'other' if x in values_to_replace else x)
df_all['customer_job'].fillna('other', inplace = True)

value_counts = df_all['product_category'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['product_category'] = df_all['product_category'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['product_category'].fillna('etc.', inplace = True)

value_counts = df_all['country'].value_counts()
values_to_replace = value_counts[value_counts == 1].index
df_all['country'] = df_all['country'].apply(lambda x: 'etc' if x in values_to_replace else x)
df_all['country'].fillna('etc', inplace = True)

In [17]:
expected_timeline = df_all['expected_timeline']
expected_timeline = expected_timeline.str.replace('-', '')
expected_timeline = expected_timeline.str.replace('_', '')
expected_timeline = expected_timeline.str.replace('~', '')
expected_timeline = expected_timeline.str.replace(' ', '')
expected_timeline = expected_timeline.str.replace('.', '')
expected_timeline = expected_timeline.str.replace('etc', 'others')

In [18]:
sort_df = expected_timeline.fillna('others')

In [19]:
values = ['aggressivepricerequired',
       'alreadycloseinjulypurchsewithrd', 'alreadyconnectwithpartner',
       'alreadyindiscussionwithpartnerfrombangalore(kohinoor)andwithmr.indraneelfromcorporateoffice',
       'alreadyintouchwithcustomer',
       'alreadyintouchwithcustomerfromlast10days.droppingitbecausewearealreadyintouch',
       'alreadyintouchwithhimsincelong.hehasnotfinalizedyet.leadforwardedtord',
       'alreadyintouchwithpartner.', 'alreadyintouchwiththereteam',
       'alreadyourcustomer,havesharedpricesfornewrequirement',
       'alreadypurchasewithrd', 'alreadysharedquotationthroughsi.',
       'alreadytouchwithcustomerforkioskrequirement',
       'alreadytouchwithcustomers',
       'alreadyworkingonidbwillcreateopppostclarfyingonsignage',
       'alreadyworkingwithenduseronthisrequirement.',
       'askedtocallbacklater,ihadsharedthisleadwithrdtotakeitfurther.',
       'askedtocallbacktomorrow,ihadsharedthisenquirywithrd.',
       'askedtocallbacktomorrowmorning,notresponding',
       'askedtocalllater.','(selectidtimeline)', '09022022requestedforboqofrequirement',
       '14:0015:00',
       '14thaugaggressivepricerequiredashehasgotpricefromoutsidedelhi',
       '14thfeb2022shareddetails.awaitingupdatefromcustomer.',
       '25nov2021nomobilenumber,sentamailtocustomer',
       '29thsep2021:nosuchrequirementasofnow', '14thfeb2022shareddetailsawaitingupdatefromcustomer','alreadyintouchwithcustomerfromlast10daysdroppingitbecausewearealreadyintouch',
       'alreadyintouchwithhimsincelonghehasnotfinalizedyetleadforwardedtord',
       'alreadyintouchwithpartner', 'alreadysharedquotationthroughsi',
       'alreadyworkingwithenduseronthisrequirement',
       'askedtocallbacklater,ihadsharedthisleadwithrdtotakeitfurther','alreadyindiscussionwithpartnerfrombangalore(kohinoor)andwithmrindraneelfromcorporateoffice',
       'askedtocalllater', 'askedtocallon4thmay',
       'askedtocontacthimafter8thjan', 'askedtosenddetails',
       'askedtosharedetailsonmailhewillupdateifthereisanyrequirement',
       'assignedtopartnerintialmeetingdonewillconverttoopppostcompleteinfo',
       'atpresentnotrequired,', 'beingfollowedup',
       'bodeli,requirementpendingnewquotesendcloseinthismonthend',
       'budgetissue', 'budgetlowhenceclienthavenotwentahead',
       'budgetproblem', 'busy,calllater', 'busyneedtocallback',
       'callafter3june', 'callanddiscusednorequirement',
       'callanddiscusedtocustomecustomerwantsdemo',
       'callbackagainon30/04', 'callbacklater',
       'callbacklater,infirstweekmarch', 'calledtoknowthepriceofidb',
       'callnotconnecting', 'callnotpicking',
       'clienthaveseenthedemoheneededforbiggerroomandthecameraqualityandspeakertrackingwerethemainrequiredfeatureasofnow,hisbudgetisaround2lacsandneedbiggersizesolutionthereisnoresponsefromclientpostdemohenceclosing',
       'clientiscrosscheckingonpricesitsarccaseofsubwayandthefranchiseeistryingtogetpricingfromelsewhere',
       'clientisexploringhewilldiscussoncedetailssentdetailsmailedtohimasperrecentdiscussion,heisnotexploringasofnow',
       'clientislookingfor86"displaywithvcsolution,theywillgofordemonextweekanddecideonprocuringourtr3dj,detailssharedwithclientonmail',
       'clientisnothavinganyrequirement,hewasonlybrowsingthroughtheproduthenceclosiginsystemalthoughthedetailsofidbaremailedtoclient',
       'clientisnotlookingnowandhewasinterestedwithonlyinbuiltopsoptionhenceclosinginsystemasheisnotrespondingtocalls',
       'clientisnotrespondingcalls,followingupandkeeptheupdatetracked',
       'clientnotanweringcallsandaskedthereisnorequirementhewastryingtocheckpricesforone65touchrequirementofexistingclient',
       'clientnotinterestedinproductreceingcallandnotansweringproperly',
       'clientshallgetbackforexploringdemoofidb,postthattheywilldecideonprocuringthesame',
       'clientwasbusy,askedtocallbacklater,interestedinexploringouridb',
       'clientwasdrivingandaskedtocallbacklater,willfolloupandkeepposted',
       'clientwasexploringonproductsbuthedonthaveanyplanstopurchasecurrentlyhehaveholdedonhisrequirementandwillconnectifitreiniitaties',
       'clientwasinquiringforpricesalreadyquotedbyfewpartnerstheyevenpurchasedasperrecentupdatebutclienthasnotconfiermedpartnername',
       'clientwaskeentoexploreouridbsku,infollowup', 'cmsreqired',
       'communicationnumbernotavailable',
       'concernedpersonnotavailable,willcalllater',
       'conference/meetingroom/collaborationspaces',
       'connectedwithpartnerforstudiosetupwithcamera',
       'connectingforissue', 'connectwithpartner',
       'contactdetailsprovidedarewrong,sentmailforcorrectcontactdetails',
       'convertedthisleadintoopportunity', "couldn'tconect",
       "couldn'tconnect", 'couldntconnect',
       'customehasnotreceivecallason31stmayspokewithcustomer,hewillcomefordemoinnextweek14062022',
       'customepurchaseconsumerproduct', 'customerasktocallbacktomorrow',
       'customerbudgetis125+taxfor75inchidbihavegivenournoregretofferandalsoexplainedwhylgisbetterandpremiumhesaidhewillcomebackinadayortwo',
       'customergettingpricefromjaipurwhichlowthanouroperatingprice',
       'customerhasbeennotansweringcall', 'customerhasnotansweringcall',
       'customerhasnotansweringcall/customerwant86"interactivedisplay,hewillpurchasewithinnext6month,weareforwardingtolocalrdtotakethisfurther',
       'customerhasnotansweringcall/quotationhasbeensendtocustomer',
       'customerisbusyaskustocallagain,following',
       'customerislookingforresidential43inchdisplayforhomeuse',
       'customerneedshortthrowprojector',
       'customernotansweringcall,willcallhimagainandupdate',
       'customerphoneisgetsswitchedoff',
       'customerpurchasebenqforpricedifferance',
       'customervisitatuvbusinessandseethedemowithin2to3days',
       'customerwantdemoofidbhewillcomefordemoinnextweekafterfollowingupwithcustomermultipletimesthereisnoupdateleadhasbeenforwardedtord',
       'customerwantproductt15000',
       'customerwillbecomingfordemoon28thmaycustomerdidnotcamefordemoihaveshareddetailswithrdtotakeitfurtherhencedroppingthislead',
       'customerwillcomefordemoinnextweek,thanhewilldecidefurther',
       'december2022', 'delhienquirey', 'demoalignedforclient',
       'democompleted,customeraskedtocomefeb10thforclosure', 'demodone',
       'demodoneanddetailsshared',
       'demoplanned,willupdatefurtherstatusonceitscompleted',
       'demoscheduledfor24thoct', 'demoscheduledforfirstweekfeb',
       'demotobealigned', 'demotobeplanned',
       'detailsharedwithhimonemailheislookingforoneunitof65inchforhisconferenceroom',
       'detailssend', 'detailssendquoterequire', 'detailssendrequiredemo',
       'detailsshared', 'detailsshared,beingfollowedup',
       'detailsshared,toplandemoindecember',
       'detailssharedfollowupunderprogress',
       'detailssharedfor75tr,followupinfirstweekofoct',
       'detailssharedfuunderprogress',
       "detailssharedonwhatsapp,he'llrevertback",
       'detailssharedwithconcernpersonwillaskbdotofollowup',
       'detailssharedwithcustomer,alsoihadsharedleadwithrdtotakeitfurther',
       'detailssharedwithcustomer,hewillupdateonfurtheractionafterevaluatingotherbrandsaswell',
       'detailssharedwithcustomerason4thmaytriedmultipletimesbutnotreachable',
       'detailssharedwithcustomeronmai',
       'detailssharedwithcustomeronmail',
       'detailssharedwithpartnerhehasquotedinourexistingcustomeronly',
       'detailssharedwithrdforfurtheraction,',
       'dicsussedwithclient,detailssharedonmail,clienthavenobudgetstobuynowhenceclosinginthesystem',
       "didn'tpickupthecall", "didn'trespond", "didn'trespondtocalls",
       "dind'trespond", 'discusedwithankitindehli',
       'discussedandshareddetailswithclientfordigitalsignagetheyarenotplanningtopurchsethesamecurrentlyhenceclosinginthesystem',
       'discussedwithclient,detailsmailedforidbandonequick,theyhavenoplantobuyfornowhenceclosinginthesystem',
       'discussedwithclient,detailsmailedtheyarenotplanningtopurchasefornowhenceclosinginthesystem',
       'discussedwithclient,detailssharedonmailclienthavenopurchaseplansfornowhenceclosingthesameinsystem',
       'discussedwithclient,heshallsendthedetailsoftheirofficerequirement',
       'discussedwithclient,quotesent',
       'discussedwithclient,theywerejustevaluatingtheproduct,theyhavenoplanstobuythesame,henceclosinginthesystem',
       'discussedwithclientdemoalignedoncedone,willupdate',
       'discussedwithclientdetailsmailedclientdonthavebudgetforpurchasenowhenceclosinginthesystem',
       'discussedwithclientdetailsmailedforonequick,theyarenotplanningtobuythesamefornowhenceclosinginthesystem',
       'discussedwithclientdetailsmailedtheyhavebudgeissuefornowsotheywontbuyhenceclosinginyhesystem',
       'discussedwithclientdetailsmailedtoclientnobudgetsfornowwontbuyhenceclosinginthesystem',
       'discussedwithclienttheyarelookingforaiosolutionwithvctheywillcallafter20thforthedemoandthenpurchasethroughgem',
       'discussedwithclienttheyarenotevaluatuatingidbasofnow,theyhavenoplanstobuyhenceclosinginthesystem',
       'discussedwithclienttheyhavenobudgetsforpurchasefornow,theywontbuyhenceclosinginthesystem',
       'discussedwithclienttheyneed24inchdisplayfordigitalsignagementionedforouravailablesizes,heshallrevert',
       'discussedwithclienttheyneedforcsractivitypricesexpectationisverymuchlow',
       'discussedwithclientweneedtoaligndemotheexpectationisconnectingappledevicesbutnotmandatoryoncedemoisdone,weshallupdate',
       'discussedwithclientwillplanfordemo',
       'discussedwithpartnerheworkswithcloudwalkerandwantedtoconnectwithusforsomeproductdetailsasofnow,donthaveanyinquirytoworkandhenceclosinginsystem',
       'discussedwiththeclient,tr3bg&onequickworksdemoscheduledon27thjuly2022postdemotheywilldecideonprocuringtheunit',
       'discussedwiththeclientaskedtocallbackonmondaywillkeepafollowup',
       'discussedwiththeclienthedonthavebudgetsdetailsmailedhewontbuynowhenceclosinginthesystem',
       'discussedwiththeclientpricessharedforonequickandidbonmailcustomertoconfirmaboutdemooftheskuandfinaliseonprocurement',
       'discussedwiththeclienttheyareintererstedinourtr3djseriesdemoplannedfor29thjuneposttheywillplanonprocurementwillkeepupdate',
       'discussedwiththeclienttheypurchased65um3dffromthechannelunabletotrackthesupplierhenceclosinginthesystemmailsenttoclientforourdisplayrange',
       'discussiontobedonewithcustomer',
       'donthaveanyimmediaterequirementhemaypurchaseafterjune',
       'donthavebudgethewantdotled', 'don’thavebudget',
       'drop,alreadydonewithpartner',
       'drop,budgetproblemnowphonenotrespondig',
       'drop,lookongforconsumerproduct', 'drop,notintrested',
       'drop,nownotintrested',
       'drop,requirementofinteractivetouchscreentosupplyingovt,conncgtwithpartner',
       'drop,stillnoplanforpurchase', 'drop,wrongnumber',
       'dropnoplanforpurchase',
       'dropnotwillingtobuynowwilldoinfuturedetailssharedonwhatsap',
       'dropped,requiresdevicelikeonequicksharetobeusedwithsamsungtv',
       'droppedcontactnonotvalid', 'droppedhavepurchasedcloudwalker',
       'duetobudgetissue,customerhasboughtmaxhubpanel',
       'duetofinancialreasons,clienthavedroppedpurchaseplanalthoughthedetailsofidbaremailedtoclient',
       'duplicatelead',
       'duplicateleadil220100042906discussedwithclienttheirvchallisunderdevelopmentonequickdetailsmailedtoclienttheywillcallusfordemoandpurchasefinalizationoncetheirvcplaceisready',
         'emailsendfordetailsclientisexploringonsolutionsbutonlyfortheirundertanding',
       'existingcaseworkedbyourdisti,crewbusiness,henceclosingasclientisonlycheckingforcommercials',
       'existingpartner,discussedandaddressedtheconcernthepartnerhadtakenpricingandwantedtorecheckasthemodelquotedischanged',
       'exitingcustomer',
       'financeandaccountingconsultantinquiredfoe43inchaiobutdonthaveplanstobuydetailsdiscussed',
       'followingup', 'forwardedtoaurangabadrdforfurtherprocess',
       'forwardedtobdo,beingfollowedup', 'forwardedtobdotofollowup',
       'fuunderprogress','heaskedtosharebudgetoryquote,hewillcheckandconfirmifhewillrequiretheproduct',
       'heclientisnothavinganyrequirementhenceclosiginsystemalthoughthedetailsofidbaremailedtoclient',
       'hehasaskedtoconnectnextweekhewanttoseethephysicaldemo',
       'heisaneventorganizationandlookingforrentaldisplayshencedroppinginsystem',
       'heiscomparingwithsenseswhichislocalbrand',
       'heisindiscussioninternallyandwillupdate',
       'heislookingforaninteractivedisplaydiscussedwithclient,willupdateasthecaseprogress',
       'heislookingforconsumerproductforhome','heislookingforinteractivedisplayquotesharedwithhim',
       'heislookingforvideowall&idbforhisofficeourbdojayantisintouchwithhimandsharingtherequireddetailstocustomer',
       'heisrequestingdemoinaligarhup,plzsharetoupperson',
       'hejustneedbudgetorypricingwehavesharedthedetailswithhimandrequestedhimtocomefordemo',
       'heneedsforhomepurposeandisaconsumerclientspokeandguidedonthedetailsandchannelnotab2brequirement',
       'hewant32inchtvforsignagedetailssharedwithhim',
       'hewant65inchat50k', 'hewant98inchinbelow2lacs',
       'hewantdemoinaurangabad,detailssharedwithrd',
       'hewantdemoinlaturandalsodonthavemuchbudgetpartnerfollowedupmultipletimesbutnoresponse',
       'hewantdemoinsangli,ihaveloopedwithlocalpartner',
       'hewantphysicaldemooftr3bfinnashikbuttherebudgetisalsoverylowsofirsthewilldiscusswithmanagementonbudgetthenconfirmbackfordemo',
       'hewillcallbackifneeded', 'hisbudgetisunder50konly',
       'hisbudgetisverylow',
       'hisbudgetisverylowbutwearetryingtoconvincehimfor65inch',
       'idbdetaissend',
       'il220100042906repeatinquiryhenceclosingthisinsystem',
       'invalidlead',
       'isaninteriordesignerworkingonturnkeyprojectsmrbhuvneshisincontactandsharedspecsandprices',
       'itsbiharcustomer',
       'kinnarieletronisvatwahasbidforthesameandtheyaregoingtopurchasefromthem',
       'knownpartnerandalreadyworkingonacaseonlycrosscheckingpriceshencedropping',
       'knownpartnerandwehaveinstalled3videowallspanindiaforcctvwesupplied49vl5batrs62000andthenagain49vl5fatrs71000nowtheavailablemodelis49vl5gat80000+whichpartnerisrecheckingandhencemadeinquiryclosingtheinquirysi',
       'leadbelongstomaharashtra','lookingforactiveledinarangeof5lakhsnotaprospetivebuyer',
       'lookingforairconditioners',
       'lookingforcommercialtv55"detailsmailedtoclient,theyhavenoplansfornowtobuyhenceclosinginthesystem',
       'lookingforhomeuse',
       'lookingfornotmadeinchinaactiveoutdoorledforgovernmentproject',
       'lookingforoutdoordisplaycusotmerhasnoclarityonthesolutionemailsharedforunderstandingandwaitingforreply',
       'lowbudget', 'maileddetails&priceof55inches',
       'mayurelectronicconnectwithclient', 'meetingdatetobefinalised',
       'meetingplannedforfurtherdiscussion','mobileisswitchedofftried3times','mrrajnikantisallignwiththeclient',
       'needforhomesegmenttheypurchasedvu85inchtvfortheirhome',
       'needtoarrangequoteanddemo', 'needtododomestictradebiz',
       'nextweekhewillcomefordemoatlgoffice',
       'nocontactnumber,ihavedroppedaemailrequestingforthesame',
       'nodropthislead,nobudget!',
       'nofurtherrevert&clarityaftersharingdetails', 'noreqirements',
       'norequirement', 'norequirementofthistime', 'noresponse',
       'noresponseonmailsfromtheclientcouldntfindanycontactdetailsonnetaswellhencedroppinginthesystemalthoughidbandonequickdetailssharedonemail',
       'notansweringcall', 'notansweringcall,detailssharedwithrd',
       'notansweringcall,leadsharedwithrd',
       'notansweringcall,salesremarks:triedtoreachhimmultipletimesbutheisnotrespondingrequesttoshailjatoreconnectwithcustomerwearedroppingthisleadfornow',
       'notansweringcall,sharedleadwithrd',
       'notansweringcall,willtrytoreachhimagain',
       'notansweringcall|ason11thjunecustomerphoneisnotreachableiamdroppingthisleadandforwardingtopartner',
       'notansweringtriedseveraltimesdetailssharedwithrd',
       'notinterested', 'notinterestedatthemoment',
       'notinterestedincommercialtvs', 'notlifted', 'notlifted,calllater',
       'notpickingcalls,calledmanytimes', 'notreachable',
       'notreachable,detailssharedwithrd',
       'notreachable,ihadsharedleadwithpartner',
       'notreachable,leadsharedwithpartner', 'notrequire', 'notrequired',
       'notrequirednow', 'notresponding',
       'notresponding,detailssharedwithlocalrd', 'notrespondingtocalls',
       'notrespondingtriedmanytimes', 'november2022', 'november22',
       'october2022','ordertaken',
       'ourpartner,visnetworksisworkingonthisinquiryandclientiscrosscheckingthepriceshenceclosinginthesystem',
       'partnerisalreadyintouchwithourrd,ornothepartnerneedstocrosscheckontheinformationasthesizeischangedfrom49to50nowclosingtheopprtunityasitsexistingwithrd',
       'partnerisintostaticsignagesheneedtoconnectfordigitalsignagebuthehimselfismanufactureralthoughamailinitaitedforanyrequirementswithledteamtotakeitaheadasofnow,thereisnoscopehenceclosinginsystem',
       "partnerisworkingonexistingrequirementheisconnectedwithourdisti'sforbilling",
       'partnerwasexploringoneolmodelforsomerequiremeentnodesignatedsalesorclosureexpectedhenceclosinginthesystem',
       'phonenumbernotavailable', 'phoneswtichedoff', 'priceshared',
       'pricesharedwithcustomerheislookingforsomeidbsneedtotakefollowup',
       'pricesharewithcustomerourrdisfollowingupwithhimbutcustomerisnotresponding',
       'projectcancle',
       'proposalsubmitted,duetocovidmdisnotcomintoofficeforthemeeting',
       'providedmodelandpriceasperrequirement',
       'purchasedfrombinary@123l', 'purchasedsomelocalpanel,budgetissue',
       'purchasedviewsonicpanel','purchasevubrand',
       'quotationsent–75tr3dj,workinprogress,hewillbuyafter2monthshehasnotevenseenthequoteyet',
       'quotationshared',
       'quotationsharedforultrastrothershand49vl5ghewillcheckwithmanagementandupdateus',
       'quotationsharedincommunication', 'quotationsharedwithcustomer',
       'quotationsharedwithcustomer,hewillconfirmbackafterdiscussingwithmanagement09062022asontoday14062022notansweringcallhenceiamdroppingthisleaddetailsshredwithrdtakeitforfurther',
       'quotationsharedwithhimforidbhedonthaveenoughfundasofnowheisjustevaluatingoptions',
       'quotehasbeensenttocustomer', 'quotesend',
       'quotesendmeetingpending', 'quotesendnextmonthrequire',
       'quotesendrequiredemo', 'quotesendrequiredemoinnextweek',
       'quotesent,theclientisrequireddemoinjune', 'quotesenttocustomer',
       'quotesharedbuttherebudgetisverylow',
       'quotesharedfor86inchheisevaluationlocalbrandoftrueview',
       'quotesharedwithcustomer,hewillconfirmafter2monthsleadsharedwithpartner',
       'quotesharedwithcustomer,leadsharedwithpartnertotakeitfurther',
       'quotesharedwithcustomerhewillrevertwithanupdate',
       'quotesharedwithhim',
       'quotesharedwithhimandhewillupdateusafterdemo',
       'rajnikantisworkinginthiscasehoinahmedabadanditsunderprocess',
       'receivedpo', 'recentlynorequirement,dropped',
       'remarks:customerlookingfor65inchesand75incheseach1qty',
       'repeatedinquiryfromclienthewasjustinquiringbuthavenotconfimredonpurchasewehavetriedmulitpletimesbutclientisnotinterestedinbuying',
       'repeatenquiry',
       'repeatinquiry,wealreadyknewontheclientrequirementweareintouchwiththemandifanyrequirementcomes,wewilludpateasofnow,thetransparentledtheyneedisnotavailablewithusandtheavailableproductcostexpectationisverylowwit',
       'require65inchidbunder80k', 'requiredemo', 'requiredemoafter20may',
       'requiredemopricesend',
       'requirementisforpuneloopedpuneteamalthoughthepartnerischeckingpricesforanexistingcaseclosingtheleadinsystemaspuneteamisalreadyworking',
       'requirementof65tr3dj',
       'requirementof65tr3djandcustomerwantdemo,detailssharedwithrd',
       'requirementof65tr3dj–demoreqired,leadsharedwithrd',
       'requirementof65tr3dj–demorequired',
       'requirementof75tr3dj–demo,detailssharedwithrd',
       'requirementof75tr3dj–demo,leadsharedwithmumbaiteam',
       'requirementof75tr3dj–demorequired',
       'requirementofidb,budgetisverylowshareddetailsonwhatsapp1unitrequiredrdisfollowingup',
       'requirementofvideowallclientischekingonpricesalready2partnersworkingitsongoingcasehenceclisinginsystem',
       'requireonlycmssoftware', 'requirequote',
       'requiresdetailfortendernopurchaserequirementrightnow',
       'resellerwanttohavegemauthorization',
       'reuiredforofficeaskedtosharequotation', 'rnr',
       'sameasleadnoil220300046498hencedroppingduplicatelead',
       'schedulingameeting', 'september', 'september2022',
       'sizenotavailable', 'smallsizetabrequired',
       'spokentoclient,hewillcheckiftheyneeddemoandconfirm',
       'spokentoclient,hewillconnectoncehediscussonrequirement',
       'spokentopartner,havenoanyrequierementsfornow,havemailedthedetailstohim,hewontpurchaseanyskusfornowhenceclosinginthesystem',
       'spokewithcustomehewant43"tv',
       'spokewithcustomer,detailsneedstoshareonmailalsotheywantdemo',
       'spokewithcustomer,heislookingfordotledproductin42inch,ihavesuggestedhimour43inchsignagebuthewantfullyoutdoorledihavesharedthedetailsofour43inchdisplay',
       'spokewithcustomer,willbesharingdetailsonemail',
       'spokewithherandsharedthedetailsforsignagedisplays','theclientbudgetiswithin1lacstheyneed55inchaiosolutionasperdiscussion,theyarecomparingwithjabraandweproposedtocheckiftheyneedonlydisplaysbutduetobudgothersontraint,theyaredroppingtheidea',
       'theclientislookingforsmallsizedisplaysof20inchto25inchtomountonrefthebudgetis4kto6knotexactlyourrequirement',
       'theclientisnothavinganyrequirement,hewasonlybrowsingthroughtheproduthenceclosiginsystemalthoughthedetailsofidbaremailedtoclient',
       'theclientisnothavinganyrequirement,shewasonlybrowsingthroughtheproduthenceclosiginsystemalthoughthedetailsofidbaremailedtoclient',
       'theclientisnothavinganyrequirementhenceclosiginsystemalthoughthedetailsofidbaremailedtoclient',
       'theclientisnothavinganyrequirementhewasjustbrowsingforproductshenceclosiginsystemalthoughthedetailsofidbandvideowallsaremailedtoclient',
       'thepartnerischeckingpriceforanongoingcaseofbarcwehavediscussedonnewcasewiththepartneritsusefultotakeourdiscussionfurtherfornewinquriesclosingtheopportunityinsystemasofnowasitsmadeonlyforcrosscheckingprices',
       'thererquirementisforpuneemailedclientloopingpuneteamtomeetandvisittheclient',
       'theyareconsultantandiamalreadyintouchwithcustomer',
       'theyarehavingrequirementof40displayscustomerneedaggressivepricingthansamsung',
       'treidmanytimesbutphonenotreachable',
       'triedcallingtheclient,noresponsefromclient,ihavemailedidbdetails,willkeepfollowup&update',
       "triedcontacthimbutcouldn'tabletoconnectleadforwardedtord",
       'triedcoupleoftimesbutheisnotrespondingwewilltryagain',
       'triedreachingouttoclientnoresponsetocalllssoihavesharedthedatasheet,shalllfolllllowupandupdate',
       'triedtoreachcustomerbutnoresponseleadforwardedtopartnerforfurtherfollowup',
       'triedtoreachhimbutnoresponseiwillcallhimagain18thnovicalledhimagainandhewillsendtherequirementonmail',
       "triedtoreachhimmultipetimesfromlast7daysbutcouldn'tgetthrough",
       "triedtoreachhimmultipletimes,butcouldn'tgetthru",
       'triedtoreachseveraltimesbutnoresponse', 'underdiscussion','uttarpradeshstatelead',
       'veryabruptcustomersaidtheinquirywasmademonthsagoandwasrudeenoughclosinginthesystemastheclientbehaviourhasnoscopetodiscussonrequirementneedmarketingteamtocheckifthecasewasreceivedindecorjanasperclient',
       'wanttodotradeproductbusiness', 'wanttotakeretailmonitor',
       'wearealreadyintouchwithcustomerthrupartner',
       'wearealreadyintouchwiththiscutsomerfromlast2years,hehasneverpurchasedanyproducttilldateicalledhimuptwicebutnoreponse',
       'wearenotgoingtomatchtechnicalspecification',
       'wehavepitched110inchfuunderprgress',
       'wewerealreadyworkinghere,wearefacingchallengeaswedonthave49vl5ginstockorderlostinmarchduetounavailabiltyof49vl5g',
       'willcomebackonceitisrequired', 'willcomeforthedemo',
       'withdrawnthedecissiontopurchase', 'últimarevisión:26/07/21','underdiscussion,requirementnotdinalisedyet',
       'update14thseptmoredetailstobesharedbyclient',
       'updateaspercustomernoenquirywasmade',
       'updatedetailsalreadysharedwithcustomerevaluationisongoing','uptodecember']

sort_df = sort_df.apply(lambda x: 'others' if x in values else x)
sort_df = sort_df.replace('callbacktomorrow28/09', 'lessthan3months')
sort_df = sort_df.replace('then', 'than')
len(sort_df.unique())

41

In [20]:
values = ['lessthan3months,meetingwiththecustomerforthemoredetailsandtentativeboqwillne32and43',
       'lessthan3monthscustomernotansweredtocallback',
       'lessthan3monthsoutdoorledrequiment', '4/8months', '45days', 'askedtocallbacktomorrow',
          'askedtocallnextweek','askedtocallbacktomorrow,ihadsharedthisenquirywithrd',
         'duplicateleadil220100042906lessthan3months', 'eolmodelnewmodelquoterequirmentafter30days',
         'getbackin1wk','heislookingfordisplayforhisrestauranthehasaskedtocontacthimafter1monthasrestaurantworkisunderprogress',
        'hewantdemonextmonthatpune15may', 'hewantdemonextweek','needtodiscusswithclientinnexttwomonthstheyneedtochecktheproductandaccridnglyproceedforpersonaluse'
          ,'meetingplannednextweek','needtocallonfriday27th', 'onemonth', 'spokewithshubhamwhoisstudenthesaidhewillspeakwithhissirandrevertbackafter2weeks', '3months',
         'update14thseptcustomerwilldiscussindetailtomorrow','update13thspetfollowuptobedoneon15thsept', 'update27thaugcustomerhastocomeforproductdemo',
       'update4thaugcustomeraskedtocallonmonday',
       'update4thaugcustomervisitingexperiencecentreon8thsept',
       'update4thaugdetaileddiscussiontobedoneon6thaug',
       'update7thaugdemogivencustomerwillconfirmnextweek',
       'update8thseptdetailssharedwillconfirmdemoscheduleofnextweek',
       'update9thoctdetailssharedwillconfirmforonlinedemonextweek', 'updatecalltobemadeontuesday',]

sort_df = sort_df.apply(lambda x: 'lessthan3months' if x in values else x)

In [21]:
sort_df = sort_df.replace('lessthen6months', 'lessthan6months')
sort_df = sort_df.replace('lessthan5months', 'lessthan6months')
sort_df = sort_df.replace('morethen3months', 'lessthan6months')
sort_df = sort_df.replace('needshoteltvafter4months,willcallus', 'lessthan6months')
sort_df = sort_df.replace('purchaseplanningafter3months', 'lessthan6months')
sort_df = sort_df.replace('lessthan6months', '3months6months')

In [22]:
len(sort_df.unique())

6

In [23]:
df_all['expected_timeline'] = sort_df

In [24]:
df_all['expected_timeline'].unique()

array(['lessthan3months', 'others', '3months6months', '9months1year',
       'morethanayear', '6months9months'], dtype=object)

In [25]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64570 entries, 0 to 5270
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              64570 non-null  float64
 1   business_unit            64570 non-null  object 
 2   com_reg_ver_win_rate     64570 non-null  float64
 3   customer_idx             64570 non-null  int64  
 4   customer_type            64570 non-null  object 
 5   enterprise               64570 non-null  object 
 6   historical_existing_cnt  64570 non-null  float64
 7   customer_job             64570 non-null  object 
 8   lead_desc_length         64570 non-null  int64  
 9   inquiry_type             64570 non-null  object 
 10  product_category         64570 non-null  object 
 11  customer_position        64570 non-null  object 
 12  response_corporate       64570 non-null  object 
 13  expected_timeline        64570 non-null  object 
 14  ver_win_rate_x           645

In [26]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "country",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [27]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [28]:
sample = df_train[df_train['customer_idx'] == 25096].sample(100, random_state=42)
# df_train = df_train[df_train['customer_idx'] != 25096]
df_train = pd.concat([df_train[df_train['customer_idx'] != 25096], sample])

In [29]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler

X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_resample(df_train.drop(["is_converted", 'id'], axis=1), df_train["is_converted"].astype(int))
y_resampled.sum(), len(y_resampled)

(2529, 5058)

In [30]:
x_train, x_val, y_train, y_val = train_test_split(
    X_resampled,
    y_resampled,
    stratify=y_resampled,
    test_size=0.2,
    random_state=42,
)

In [31]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

In [33]:
lgbm = LGBMClassifier(random_state = 42, verbose = 0)
param_grid = {'learning_rate': np.arange(0.03, 0.13, 0.02), 'n_estimators': [50, 100, 150, 200],
              'colsample_bytree': [0.8],'subsample': [0.8]}


grid_lgbm = GridSearchCV(lgbm, param_grid, verbose = True, scoring='f1')
grid_lgbm.fit(x_train, y_train)

print('최적의 파라미터 :', grid_lgbm.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
최적의 파라미터 : {'colsample_bytree': 0.8, 'learning_rate': 0.09000000000000001, 'n_estimators': 150, 'subsample': 0.8}


In [34]:
y_pred = grid_lgbm.predict(x_val)
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_lgbm.predict(x_test)
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
       pred_T[0]  pred_F[1]
T[0]        443         63
F[1]         24        482

정확도: 0.9140
정밀도: 0.8844
재현율: 0.9526
F1: 0.9172


(2310, 5271)

In [36]:
cat = CatBoostClassifier(random_state=42, verbose = False)
param_grid = {'iterations' : [800, 900, 1000, 1100], 'learning_rate' : np.arange(0.01, 0.11, 0.02),
             'subsample' : [0.8], 'colsample_bylevel' : [0.8]}

grid_cat = GridSearchCV(cat, param_grid, verbose = True, scoring='f1')
grid_cat.fit(x_train, y_train.astype(int))

print('최적의 파라미터 :', grid_cat.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
최적의 파라미터 : {'colsample_bylevel': 0.8, 'iterations': 900, 'learning_rate': 0.049999999999999996, 'subsample': 0.8}


In [37]:
y_pred = grid_cat.predict(x_val)
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = grid_cat.predict(x_test)
sum(test_pred), len(test_pred) # True로 예측된 개수

오차행렬:
       pred_T[0]  pred_F[1]
T[0]        442         64
F[1]         28        478

정확도: 0.9091
정밀도: 0.8819
재현율: 0.9447
F1: 0.9122


(2301, 5271)

In [40]:
cat = CatBoostClassifier(random_state=42, verbose = False, colsample_bylevel = 0.8, 
                         iterations = 900, learning_rate = 0.05, subsample = 0.8)
cat.fit(x_train, y_train.astype(int))

lgbm = LGBMClassifier(random_state = 42, verbose = 0, colsample_bytree = 0.8,
                      learning_rate = 0.09, n_estimators = 150, subsample = 0.8)
lgbm.fit(x_train, y_train)

In [41]:
vote = VotingClassifier(estimators= [('cat', cat), ('lgbm', lgbm)], voting= 'soft')
vote.fit(x_train, y_train)

y_pred = vote.predict(x_val)
get_clf_eval(y_val, y_pred)

x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pred = vote.predict(x_test)
sum(test_pred), len(test_pred)

오차행렬:
       pred_T[0]  pred_F[1]
T[0]        444         62
F[1]         21        485

정확도: 0.9180
정밀도: 0.8867
재현율: 0.9585
F1: 0.9212


(2281, 5271)

In [42]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)