In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [28]:
df=pd.read_csv('Travel.csv')
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [29]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [30]:
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [31]:
df['Gender']=df['Gender'].replace('Fe Male','Female')
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [32]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [33]:
df['MaritalStatus']=df['MaritalStatus'].replace('Single','Unmarried')
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64

In [34]:
features_with_null=[features for features in df.columns if df[features].isnull().sum()>0]
for feature in features_with_null:
    print(feature,np.round(df[feature].isnull().mean()*100,5), "missing values")

Age 4.62357 missing values
TypeofContact 0.51146 missing values
DurationOfPitch 5.13502 missing values
NumberOfFollowups 0.92062 missing values
PreferredPropertyStar 0.53191 missing values
NumberOfTrips 2.86416 missing values
NumberOfChildrenVisiting 1.35025 missing values
MonthlyIncome 4.76678 missing values


In [35]:
df['Age'].fillna(df['Age'].median(),inplace=True)
df['TypeofContact'].fillna(df['TypeofContact'].mode()[0],inplace=True)
df['DurationOfPitch'].fillna(df['DurationOfPitch'].median(),inplace=True)
df['NumberOfFollowups'].fillna(df['NumberOfFollowups'].mode()[0],inplace=True)
df['PreferredPropertyStar'].fillna(df['PreferredPropertyStar'].mode()[0],inplace=True)
df['NumberOfTrips'].fillna(df['NumberOfTrips'].median(),inplace=True)
df['NumberOfChildrenVisiting'].fillna(df['NumberOfChildrenVisiting'].mode()[0],inplace=True)
df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(),inplace=True)



In [36]:
df.drop('CustomerID',axis=1,inplace=True)

#### Feature Engineering

In [37]:
df['TotalVisits']=df['NumberOfTrips']+df['NumberOfChildrenVisiting']
df.drop(['NumberOfTrips','NumberOfChildrenVisiting'],axis=1,inplace=True)

In [38]:
numerical_features=[features for features in df.columns if df[features].dtype!='O']
categorical_features=[features for features in df.columns if df[features].dtype=='O']
print("Numerical features:",numerical_features)
print("Categorical features:",categorical_features)    

Numerical features: ['ProdTaken', 'Age', 'CityTier', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'TotalVisits']
Categorical features: ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation']


In [39]:
discrete_features=[features for features in numerical_features if len(df[features].unique())<25]
continuous_features=[features for features in numerical_features if features not in discrete_features]
print(len(discrete_features))

9


#### Train-Test split

In [40]:
from sklearn.model_selection import train_test_split
X=df.drop('ProdTaken',axis=1)
y=df['ProdTaken']

In [41]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape

((3910, 17), (978, 17))

In [42]:
cat_features=X_train.select_dtypes(include=['O']).columns
num_features=X_train.select_dtypes(exclude=['O']).columns
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

onehot=OneHotEncoder(drop='first')
scaler=StandardScaler()
preprocessor=ColumnTransformer(
    transformers=[
        ('OneHotEncoder',scaler,num_features),
        ('StandardScaler',onehot,cat_features)
    ])

In [43]:
X_train_processed=preprocessor.fit_transform(X_train)
X_test_processed=preprocessor.transform(X_test)

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,r2_score


In [45]:
models={
    'RandomForest':RandomForestClassifier(random_state=42),
}
for model_name,model in models.items():
    model.fit(X_train_processed,y_train)
    y_pred=model.predict(X_test_processed)
    print(f"Model: {model_name}")
    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("r2_score:",r2_score(y_test,y_pred))
    print("Classification Report:\n",classification_report(y_test,y_pred))
    print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))


Model: RandomForest
Accuracy: 0.9222903885480572
r2_score: 0.5055249905200342
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95       787
           1       0.94      0.64      0.76       191

    accuracy                           0.92       978
   macro avg       0.93      0.82      0.86       978
weighted avg       0.92      0.92      0.92       978

Confusion Matrix:
 [[779   8]
 [ 68 123]]


In [46]:
hyperparameters={   
    'n_estimators':[100,200,300],
    'max_depth':[None,10,20,30],
    'min_samples_split':[2,5,10],
    'max_features':[5,7,'auto',8]
}

In [47]:
random_cv_models={
    'RandomForest':RandomForestClassifier(random_state=42),
}
from sklearn.model_selection import RandomizedSearchCV
for model_name,model in random_cv_models.items():
    random_cv=RandomizedSearchCV(estimator=model,
                                 param_distributions=hyperparameters,
                                 n_iter=10,
                                 cv=5,
                                 verbose=2,
                                 n_jobs=-1,
                                 scoring='accuracy',
                                 random_state=42)
    random_cv.fit(X_train_processed,y_train)
    best_model=random_cv.best_estimator_
    y_pred=best_model.predict(X_test_processed)
    print(f"Model: {model_name}")
    print("Best Hyperparameters:",random_cv.best_params_)
    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("r2_score:",r2_score(y_test,y_pred))
    print("Classification Report:\n",classification_report(y_test,y_pred))
    print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, ma

0    1
1    0
2    1
3    0
4    0
Name: ProdTaken, dtype: int64