In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Relative location of the Travel.csv dataset that is loaded in the next cell.
path = '../../All_data_sets/model_data_set/random_forest/Travel.csv'

In [3]:
# Load the CSV found at `path` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [4]:
# Normalise every column label to lower case.
df.columns = df.columns.str.lower()

In [5]:
# Keep a reference to the current column labels; the missing-value scan
# further down iterates over this snapshot.
columns = df.columns

In [6]:
# Frequency of each distinct label in the 'gender' column.
df['gender'].value_counts()

gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [7]:
# Fold the misspelled 'Fe Male' label into 'Female'.
df['gender'] = df['gender'].replace({'Fe Male': 'Female'})

In [8]:
# Frequency of each distinct label in the 'maritalstatus' column.
df['maritalstatus'].value_counts()

maritalstatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [9]:
# Treat 'Unmarried' as the same category as 'Single'.
df['maritalstatus'] = df['maritalstatus'].replace({'Unmarried': 'Single'})

In [10]:
# Columns that contain at least one missing value

features_with_na = [name for name in columns if df[name].isna().any()]

features_with_na

['age',
 'typeofcontact',
 'durationofpitch',
 'numberoffollowups',
 'preferredpropertystar',
 'numberoftrips',
 'numberofchildrenvisiting',
 'monthlyincome']

In [11]:
# Report, per affected column, the share of missing entries as a percentage
# rounded to three decimals.
for feature in features_with_na:
    missing_pct = np.round(df[feature].isnull().mean() * 100, 3)
    print(f'{feature} : {missing_pct} % missing values!')

age : 4.624 % missing values!
typeofcontact : 0.511 % missing values!
durationofpitch : 5.135 % missing values!
numberoffollowups : 0.921 % missing values!
preferredpropertystar : 0.532 % missing values!
numberoftrips : 2.864 % missing values!
numberofchildrenvisiting : 1.35 % missing values!
monthlyincome : 4.767 % missing values!


In [12]:
# Summary statistics for the columns with missing values; object-typed
# columns are excluded before describing.
df[features_with_na].select_dtypes( exclude='object' ).describe()

Unnamed: 0,age,durationofpitch,numberoffollowups,preferredpropertystar,numberoftrips,numberofchildrenvisiting,monthlyincome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


In [13]:
# Impute 'age' with its median and 'typeofcontact' with its most frequent value.
# NOTE(review): both statistics are computed on the full frame, before the
# train/test split performed later in this notebook — confirm that is intended.
df = df.fillna({
    'age': df['age'].median(),
    'typeofcontact': df['typeofcontact'].mode()[0],
})

In [14]:
# Impute 'durationofpitch' with its median and 'numberoffollowups' with its
# most frequent value.
df = df.fillna({
    'durationofpitch': df['durationofpitch'].median(),
    'numberoffollowups': df['numberoffollowups'].mode()[0],
})

In [15]:
# Impute 'preferredpropertystar' with its most frequent value and
# 'numberoftrips' with the constant 0.
# NOTE(review): 0 lies below the minimum of 1 reported for 'numberoftrips' in
# the describe() output above — confirm that 0 is the intended fill value.
df = df.fillna({
    'preferredpropertystar': df['preferredpropertystar'].mode()[0],
    'numberoftrips': 0,
})

In [16]:
# Impute 'numberofchildrenvisiting' with its most frequent value and
# 'monthlyincome' with its median.
df = df.fillna({
    'numberofchildrenvisiting': df['numberofchildrenvisiting'].mode()[0],
    'monthlyincome': df['monthlyincome'].median(),
})

In [17]:
# Print each column's dtype and non-null count to check the imputation result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customerid                4888 non-null   int64  
 1   prodtaken                 4888 non-null   int64  
 2   age                       4888 non-null   float64
 3   typeofcontact             4888 non-null   object 
 4   citytier                  4888 non-null   int64  
 5   durationofpitch           4888 non-null   float64
 6   occupation                4888 non-null   object 
 7   gender                    4888 non-null   object 
 8   numberofpersonvisiting    4888 non-null   int64  
 9   numberoffollowups         4888 non-null   float64
 10  productpitched            4888 non-null   object 
 11  preferredpropertystar     4888 non-null   float64
 12  maritalstatus             4888 non-null   object 
 13  numberoftrips             4888 non-null   float64
 14  passport

In [18]:
# Create a new column holding the total number of people visiting

df['total_visiting'] = df['numberofpersonvisiting'].add(df['numberofchildrenvisiting'])


In [19]:

# Columns to remove: 'customerid' plus the two visitor-count columns that
# were summed into 'total_visiting'.
delete_col = ['customerid', 'numberofpersonvisiting','numberofchildrenvisiting' ]

In [20]:
# Remove the columns listed in delete_col, rebinding df to the reduced frame.
df = df.drop(columns=delete_col)

In [21]:
# Numerical and categorical features, split by whether the dtype is object

num_cols = df.select_dtypes(exclude='object').columns.tolist()

cat_cols = df.select_dtypes(include='object').columns.tolist()

print(f'Numerical columns:{num_cols}')

print('-------------------------------------------------------------------------------------------------------------------------------------------')
print(f'Categorical columns:{cat_cols}')

Numerical columns:['prodtaken', 'age', 'citytier', 'durationofpitch', 'numberoffollowups', 'preferredpropertystar', 'numberoftrips', 'passport', 'pitchsatisfactionscore', 'owncar', 'monthlyincome', 'total_visiting']
-------------------------------------------------------------------------------------------------------------------------------------------
Categorical columns:['typeofcontact', 'occupation', 'gender', 'productpitched', 'maritalstatus', 'designation']


In [22]:
# Discrete features: columns with at most 25 distinct values

descrite_features = [name for name in df.columns if df[name].nunique(dropna=False) <= 25]

descrite_features

['prodtaken',
 'typeofcontact',
 'citytier',
 'occupation',
 'gender',
 'numberoffollowups',
 'productpitched',
 'preferredpropertystar',
 'maritalstatus',
 'numberoftrips',
 'passport',
 'pitchsatisfactionscore',
 'owncar',
 'designation',
 'total_visiting']

In [23]:
# Continuous features: columns with more than 25 distinct values


continuous_features = [name for name in df.columns if df[name].nunique(dropna=False) > 25]

continuous_features

['age', 'durationofpitch', 'monthlyincome']

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Target: the 'prodtaken' column.
y = df['prodtaken']

# Features: every column except the target.
X = df.drop(columns=['prodtaken'])

In [26]:
# Hold out 25% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance of 'prodtaken' should be preserved in both splits.
X_train , X_test , y_train , y_test = train_test_split( X , y , test_size=0.25 , random_state=42 )

In [27]:
# (rows, columns) of the training and test feature frames.
X_train.shape , X_test.shape

((3666, 17), (1222, 17))

In [28]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler

from sklearn.compose import ColumnTransformer

In [29]:
# Scaler applied to the numeric columns (default StandardScaler settings).
scaler = StandardScaler()

# One-hot encoder for the categorical columns; drop='first' omits the first
# level of every feature.
# handle_unknown='ignore' lets transform() accept a category that was absent
# from the data the encoder was fitted on (it is encoded as all zeros) instead
# of raising. This matters for rare levels after a random train/test split.
# Combining drop with handle_unknown='ignore' needs scikit-learn >= 1.0.
encoder = OneHotEncoder( drop='first' , handle_unknown='ignore' )

In [30]:
# Column labels of X whose dtype is object (routed to the encoder).
cat_cols = X.columns[X.dtypes == 'object']

# Column labels of X with any other dtype (routed to the scaler).
num_cols = X.columns[X.dtypes != 'object']

In [31]:
# Route the object-dtype columns through the encoder and the remaining
# columns through the scaler.
preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', encoder, cat_cols),
        ('StandardScaler', scaler, num_cols),
    ],
)

In [32]:
# Fit the preprocessing steps on the training features and transform them.
X_train_processed = preprocessor.fit_transform( X_train )

In [33]:
# Apply the already-fitted preprocessing to the test features (transform only, no refit).
X_test_processed = preprocessor.transform( X_test )

In [34]:
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [35]:
# AdaBoost classifier configured with n_estimators=100 and a fixed random_state.
clf = AdaBoostClassifier(n_estimators=100, random_state=0)

In [36]:
# Train the classifier on the preprocessed training features and labels.
clf.fit( X_train_processed , y_train ) 

In [37]:
# Predict class labels for the preprocessed test features.
y_pred = clf.predict( X_test_processed )

In [38]:

# Print the confusion matrix, overall accuracy and per-class report for the
# test-set predictions; each heading is followed by its value on a new line.
print('Confusion matrix', confusion_matrix(y_test, y_pred), sep='\n')

print(f'Accuracy score is:{accuracy_score(y_test , y_pred)}')

print('Classification Report', classification_report(y_test, y_pred), sep='\n')

Confusion matrix
[[952  47]
 [146  77]]
Accuracy score is:0.8420621931260229
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       999
           1       0.62      0.35      0.44       223

    accuracy                           0.84      1222
   macro avg       0.74      0.65      0.68      1222
weighted avg       0.82      0.84      0.82      1222

