In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Select seaborn's 'dark' plot style for this session.
sns.set_style(style='dark')

In [2]:
# Read ./data/test.csv into a DataFrame and preview the first five rows.
test_data = pd.read_csv('./data/test.csv')
test_data.head(5)

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,32.0,Company Invited,3,,Small Business,Male,2,5.0,Deluxe,3.0,Married,1.0,0,2,0,1.0,Manager,19668.0
1,2,46.0,Self Enquiry,2,11.0,Small Business,Male,3,,Deluxe,4.0,Married,1.0,1,5,0,1.0,Manager,20021.0
2,3,37.0,Self Enquiry,3,22.0,Small Business,Male,3,4.0,Deluxe,3.0,Married,5.0,0,5,1,0.0,Manager,21334.0
3,4,43.0,Self Enquiry,1,36.0,Small Business,Male,3,6.0,Deluxe,3.0,Unmarried,6.0,0,3,1,2.0,Manager,22950.0
4,5,25.0,Self Enquiry,3,7.0,Large Business,Female,4,4.0,Basic,4.0,Unmarried,3.0,1,4,1,3.0,Executive,21880.0


In [3]:
# Columns grouped by the missing-value strategy applied to them in the
# imputation cell further down.

# Numeric columns: missing values are replaced with the column median.
median_strategy_attribs = [
    'Age',
    'CityTier',
    'DurationOfPitch',
    'NumberOfFollowups',
    'NumberOfPersonVisiting',
    'PitchSatisfactionScore',
    'MonthlyIncome',
]

# Count columns: missing values are replaced with 0.
zero_replacement_attribs = ['NumberOfTrips', 'NumberOfChildrenVisiting']

# Categorical handling: missing values are replaced with the most frequent value.
mode_strategy_attribs = [
    'TypeofContact',
    'Occupation',
    'Gender',
    'ProductPitched',
    'PreferredPropertyStar',
    'MaritalStatus',
    'Passport',
    'OwnCar',
    'Designation',
]

# Full column order: median group, then zero group, then mode group.
attribs = [
    *median_strategy_attribs,
    *zero_replacement_attribs,
    *mode_strategy_attribs,
]

In [4]:
from textwrap import fill
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Imputation plan, one entry per column group: (step name, imputer, columns).
# NOTE(review): the imputers are fit on the test file itself; confirm this
# matches how the training data was imputed before the models were fit.
imputation_steps = [
    ('median', SimpleImputer(strategy='median'), median_strategy_attribs),
    ('zero', SimpleImputer(strategy='constant', fill_value=0), zero_replacement_attribs),
    ('mode', SimpleImputer(strategy='most_frequent'), mode_strategy_attribs),
]
pipeline = ColumnTransformer(imputation_steps)

transformed = pipeline.fit_transform(test_data)

# The transformer output has no column labels; `attribs` lists the columns
# in the same group order as the steps above, so it is used to relabel them.
data_filled_na = pd.DataFrame(data=transformed, columns=attribs)
data_filled_na.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfFollowups,NumberOfPersonVisiting,PitchSatisfactionScore,MonthlyIncome,NumberOfTrips,NumberOfChildrenVisiting,TypeofContact,Occupation,Gender,ProductPitched,PreferredPropertyStar,MaritalStatus,Passport,OwnCar,Designation
0,32.0,3.0,13.0,5.0,2.0,2.0,19668.0,1.0,1.0,Company Invited,Small Business,Male,Deluxe,3.0,Married,0,0,Manager
1,46.0,2.0,11.0,4.0,3.0,5.0,20021.0,1.0,1.0,Self Enquiry,Small Business,Male,Deluxe,4.0,Married,1,0,Manager
2,37.0,3.0,22.0,4.0,3.0,5.0,21334.0,5.0,0.0,Self Enquiry,Small Business,Male,Deluxe,3.0,Married,0,1,Manager
3,43.0,1.0,36.0,6.0,3.0,3.0,22950.0,6.0,2.0,Self Enquiry,Small Business,Male,Deluxe,3.0,Unmarried,0,1,Manager
4,25.0,3.0,7.0,4.0,4.0,4.0,21880.0,3.0,3.0,Self Enquiry,Large Business,Female,Basic,4.0,Unmarried,1,1,Executive


### Gender label correction and one-hot (dummy) encoding

In [5]:
# List the distinct Gender labels to spot inconsistent spellings.
data_filled_na['Gender'].unique()

array(['Male', 'Female', 'Fe Male'], dtype=object)

In [6]:
# Fold the stray 'Fe Male' label into 'Female'.
data_filled_na['Gender'] = data_filled_na['Gender'].replace('Fe Male', 'Female')

In [7]:
# Append one indicator column per Gender label to the right of the frame.
gender_dummies = pd.get_dummies(data_filled_na['Gender'])
data_filled_na = pd.concat([data_filled_na, gender_dummies], axis=1)

In [8]:
# The indicator columns replace the original label column.
data_filled_na = data_filled_na.drop(columns='Gender')

In [9]:
# Renumber rows 0..n-1, discarding the previous index.
data_filled_na = data_filled_na.reset_index(drop=True, inplace=False)

In [10]:
# One-hot encode TypeofContact: append its indicator columns, then drop the
# original label column.
contact_dummies = pd.get_dummies(data_filled_na['TypeofContact'])
data_filled_na = pd.concat([data_filled_na, contact_dummies], axis=1)
data_filled_na = data_filled_na.drop(columns='TypeofContact')
data_filled_na.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfFollowups,NumberOfPersonVisiting,PitchSatisfactionScore,MonthlyIncome,NumberOfTrips,NumberOfChildrenVisiting,Occupation,ProductPitched,PreferredPropertyStar,MaritalStatus,Passport,OwnCar,Designation,Female,Male,Company Invited,Self Enquiry
0,32.0,3.0,13.0,5.0,2.0,2.0,19668.0,1.0,1.0,Small Business,Deluxe,3.0,Married,0,0,Manager,0,1,1,0
1,46.0,2.0,11.0,4.0,3.0,5.0,20021.0,1.0,1.0,Small Business,Deluxe,4.0,Married,1,0,Manager,0,1,0,1
2,37.0,3.0,22.0,4.0,3.0,5.0,21334.0,5.0,0.0,Small Business,Deluxe,3.0,Married,0,1,Manager,0,1,0,1
3,43.0,1.0,36.0,6.0,3.0,3.0,22950.0,6.0,2.0,Small Business,Deluxe,3.0,Unmarried,0,1,Manager,0,1,0,1
4,25.0,3.0,7.0,4.0,4.0,4.0,21880.0,3.0,3.0,Large Business,Basic,4.0,Unmarried,1,1,Executive,1,0,0,1


In [11]:
# One-hot encode Occupation: append its indicator columns, then drop the
# original label column.
occupation_dummies = pd.get_dummies(data_filled_na['Occupation'])
data_filled_na = pd.concat([data_filled_na, occupation_dummies], axis=1)
data_filled_na = data_filled_na.drop(columns='Occupation')
data_filled_na.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfFollowups,NumberOfPersonVisiting,PitchSatisfactionScore,MonthlyIncome,NumberOfTrips,NumberOfChildrenVisiting,ProductPitched,...,OwnCar,Designation,Female,Male,Company Invited,Self Enquiry,Free Lancer,Large Business,Salaried,Small Business
0,32.0,3.0,13.0,5.0,2.0,2.0,19668.0,1.0,1.0,Deluxe,...,0,Manager,0,1,1,0,0,0,0,1
1,46.0,2.0,11.0,4.0,3.0,5.0,20021.0,1.0,1.0,Deluxe,...,0,Manager,0,1,0,1,0,0,0,1
2,37.0,3.0,22.0,4.0,3.0,5.0,21334.0,5.0,0.0,Deluxe,...,1,Manager,0,1,0,1,0,0,0,1
3,43.0,1.0,36.0,6.0,3.0,3.0,22950.0,6.0,2.0,Deluxe,...,1,Manager,0,1,0,1,0,0,0,1
4,25.0,3.0,7.0,4.0,4.0,4.0,21880.0,3.0,3.0,Basic,...,1,Executive,1,0,0,1,0,1,0,0


In [12]:
# One-hot encode ProductPitched: append its indicator columns, then drop the
# original label column.
product_dummies = pd.get_dummies(data_filled_na['ProductPitched'])
data_filled_na = pd.concat([data_filled_na, product_dummies], axis=1)
data_filled_na = data_filled_na.drop(columns='ProductPitched')
data_filled_na.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfFollowups,NumberOfPersonVisiting,PitchSatisfactionScore,MonthlyIncome,NumberOfTrips,NumberOfChildrenVisiting,PreferredPropertyStar,...,Self Enquiry,Free Lancer,Large Business,Salaried,Small Business,Basic,Deluxe,King,Standard,Super Deluxe
0,32.0,3.0,13.0,5.0,2.0,2.0,19668.0,1.0,1.0,3.0,...,0,0,0,0,1,0,1,0,0,0
1,46.0,2.0,11.0,4.0,3.0,5.0,20021.0,1.0,1.0,4.0,...,1,0,0,0,1,0,1,0,0,0
2,37.0,3.0,22.0,4.0,3.0,5.0,21334.0,5.0,0.0,3.0,...,1,0,0,0,1,0,1,0,0,0
3,43.0,1.0,36.0,6.0,3.0,3.0,22950.0,6.0,2.0,3.0,...,1,0,0,0,1,0,1,0,0,0
4,25.0,3.0,7.0,4.0,4.0,4.0,21880.0,3.0,3.0,4.0,...,1,0,1,0,0,1,0,0,0,0


In [13]:
# One-hot encode MaritalStatus: append its indicator columns, then drop the
# original label column.
marital_dummies = pd.get_dummies(data_filled_na['MaritalStatus'])
data_filled_na = pd.concat([data_filled_na, marital_dummies], axis=1)
data_filled_na = data_filled_na.drop(columns='MaritalStatus')
data_filled_na.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfFollowups,NumberOfPersonVisiting,PitchSatisfactionScore,MonthlyIncome,NumberOfTrips,NumberOfChildrenVisiting,PreferredPropertyStar,...,Small Business,Basic,Deluxe,King,Standard,Super Deluxe,Divorced,Married,Single,Unmarried
0,32.0,3.0,13.0,5.0,2.0,2.0,19668.0,1.0,1.0,3.0,...,1,0,1,0,0,0,0,1,0,0
1,46.0,2.0,11.0,4.0,3.0,5.0,20021.0,1.0,1.0,4.0,...,1,0,1,0,0,0,0,1,0,0
2,37.0,3.0,22.0,4.0,3.0,5.0,21334.0,5.0,0.0,3.0,...,1,0,1,0,0,0,0,1,0,0
3,43.0,1.0,36.0,6.0,3.0,3.0,22950.0,6.0,2.0,3.0,...,1,0,1,0,0,0,0,0,0,1
4,25.0,3.0,7.0,4.0,4.0,4.0,21880.0,3.0,3.0,4.0,...,0,1,0,0,0,0,0,0,0,1


In [14]:
# One-hot encode Designation: append its indicator columns, then drop the
# original label column.
designation_dummies = pd.get_dummies(data_filled_na['Designation'])
data_filled_na = pd.concat([data_filled_na, designation_dummies], axis=1)
data_filled_na = data_filled_na.drop(columns='Designation')
data_filled_na.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfFollowups,NumberOfPersonVisiting,PitchSatisfactionScore,MonthlyIncome,NumberOfTrips,NumberOfChildrenVisiting,PreferredPropertyStar,...,Super Deluxe,Divorced,Married,Single,Unmarried,AVP,Executive,Manager,Senior Manager,VP
0,32.0,3.0,13.0,5.0,2.0,2.0,19668.0,1.0,1.0,3.0,...,0,0,1,0,0,0,0,1,0,0
1,46.0,2.0,11.0,4.0,3.0,5.0,20021.0,1.0,1.0,4.0,...,0,0,1,0,0,0,0,1,0,0
2,37.0,3.0,22.0,4.0,3.0,5.0,21334.0,5.0,0.0,3.0,...,0,0,1,0,0,0,0,1,0,0
3,43.0,1.0,36.0,6.0,3.0,3.0,22950.0,6.0,2.0,3.0,...,0,0,0,0,1,0,0,1,0,0
4,25.0,3.0,7.0,4.0,4.0,4.0,21880.0,3.0,3.0,4.0,...,0,0,0,0,1,0,1,0,0,0


In [15]:
# Columns to cast to float.
# NOTE(review): presumably these are object-dtype after the imputation step
# (the list name suggests so) -- confirm with data_filled_na.dtypes.
obj_to_flt_list = [
    'Age', 'CityTier', 'DurationOfPitch',
    'NumberOfFollowups', 'NumberOfPersonVisiting', 'PitchSatisfactionScore',
    'NumberOfTrips', 'NumberOfChildrenVisiting', 'PreferredPropertyStar',
    'Passport', 'OwnCar', 'MonthlyIncome',
]

numeric_columns = data_filled_na[obj_to_flt_list]
data_filled_na[obj_to_flt_list] = numeric_columns.astype(float)

In [16]:
# Replace MonthlyIncome with its natural log: pop() removes the raw column
# in place, and the log-scaled column is appended at the end of the frame.
monthly_income = data_filled_na.pop('MonthlyIncome')
data_filled_na['MonthlyIncomeLogScale'] = np.log(monthly_income)

In [17]:
# Snapshot the fully preprocessed frame as an independent (deep) copy.
test_data_prepared = data_filled_na.copy(deep=True)

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# NOTE(review): this cell has no execution count, and `rf_clf` is rebound
# from 'rf_clf.pkl' in the stacking section -- this untrained instance looks
# like leftover scratch code; confirm before relying on it.
rf_clf = RandomForestClassifier()

In [27]:
# Load the sample submission file; its ProdTaken column is overwritten with
# the model predictions later in the notebook.
submission = pd.read_csv('./submission/sample_submission.csv')
submission

Unnamed: 0,id,ProdTaken
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
2928,2929,0
2929,2930,0
2930,2931,0
2931,2932,0


## Stacking

In [19]:
import pickle

In [20]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (the previous `pickle.load(open(...))` form never
    closed it).

    SECURITY: unpickling can execute arbitrary code. Only load files that
    were produced by a trusted run of this project.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Base classifiers of the stacking ensemble, restored from pickle files in
# the current working directory.
xtree_clf = _load_pickle('xtree_clf.pkl')
rf_clf = _load_pickle('rf_clf.pkl')
lgbm_clf = _load_pickle('lgbm_clf.pkl')
ada_clf = _load_pickle('ada_clf.pkl')

In [21]:
# Restore the final (meta) model of the stack. The `with` block closes the
# file handle once loading finishes; the previous `pickle.load(open(...))`
# form left it open.
# SECURITY: unpickling can execute arbitrary code -- only load trusted files.
with open('stack_model.pkl', 'rb') as model_file:
    lr_final = pickle.load(model_file)

In [22]:
# Run every base classifier on the prepared test features, in the fixed
# order xtree, rf, lgbm, ada.
xtree_pred, rf_pred, lgbm_pred, ada_pred = (
    base_clf.predict(test_data_prepared)
    for base_clf in (xtree_clf, rf_clf, lgbm_clf, ada_clf)
)

In [23]:
# Build the meta-model input: one row per sample, one column per base
# classifier, in the order xtree, rf, lgbm, ada.
pred = np.column_stack([xtree_pred, rf_pred, lgbm_pred, ada_pred])

In [24]:
# Sanity check: expect (n_samples, number of base classifiers).
np.shape(pred)

(2933, 4)

In [25]:
# The final model predicts from the stacked base-classifier outputs.
prediction = lr_final.predict(pred)

In [28]:
# Overwrite ProdTaken with the stacked predictions. The assignment is
# positional, so row order must match the rows of the test file.
submission['ProdTaken'] = prediction
submission

Unnamed: 0,id,ProdTaken
0,1,1
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
2928,2929,0
2929,2930,0
2930,2931,0
2931,2932,0


In [29]:
# Write the submission file without the DataFrame index column.
submission.to_csv('./submission/stacking_model_0831.csv', index=False)