# Holiday Package Prediction --> Random Forest Classification

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [17]:
df = pd.read_csv("Travel.csv")
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


## Data Cleaning
#### Handling Missing Values

1. Handling Missing Values
2. Handling Duplicates
3. Check Data types
4. Understanding the dataset

In [18]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [19]:
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [20]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [21]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [22]:
df['Gender']= df['Gender'].replace('Fe Male', 'Female') # replacing gender typo for fe male to female 
df['MaritalStatus'] = df['MaritalStatus'].replace('Single', 'Unmarried') # replacing marital status typo for single to unmarried

In [23]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [24]:
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [25]:
##Checking missing values
features_with_na = [features for features in df.columns if df[features].isnull().sum() >= 1]
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean()*100, 5), "% of missing value") # finding percentage of missing values in each feature

Age 4.62357 % of missing value
TypeofContact 0.51146 % of missing value
DurationOfPitch 5.13502 % of missing value
NumberOfFollowups 0.92062 % of missing value
PreferredPropertyStar 0.53191 % of missing value
NumberOfTrips 2.86416 % of missing value
NumberOfChildrenVisiting 1.35025 % of missing value
MonthlyIncome 4.76678 % of missing value


In [26]:
# statistic on numerical columns (Null Columns)
df[features_with_na].select_dtypes(exclude='object').describe() # describing numerical columns with null values 

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


##### From this we can get what values to fill through numerical columns against null

In [27]:
# age
df.Age.fillna(df.Age.median(), inplace = True)
df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace = True)
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace= True)
df.NumberOfTrips.fillna(df.NumberOfTrips.median(), inplace=True)
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace= True)

In [28]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [29]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [30]:
df.drop('CustomerID', inplace= True, axis= 1)

### Feature Engineering 

In [31]:
# Dropping columns and making them one 
df['TotalVisiting'] = df['NumberOfChildrenVisiting'] + df['NumberOfPersonVisiting']
df.drop(columns=['NumberOfChildrenVisiting', 'NumberOfPersonVisiting'] ,inplace = True, axis=1)

In [32]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [33]:
# get all neumeric features
numeric_Features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical features: ', len(numeric_Features))

Num of Numerical features:  12


In [34]:
# get all categorical features
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print("Num of Categorical Features: ", len(cat_features))

Num of Categorical Features:  6


In [35]:
# Discreate features
discrete_features = [feature for feature in numeric_Features if len(df[feature].unique()) <= 25]
print("Num of discrete features: ", len(discrete_features))

Num of discrete features:  9


In [36]:
# continious features
continuous_features = [feature for feature in numeric_Features if feature not in discrete_features]
print("Num of continious feature: ", len(continuous_features))

Num of continious feature:  3


### Train test split and model training

In [37]:
from sklearn.model_selection import train_test_split
X = df.drop(['ProdTaken'], axis=1)
y = df['ProdTaken']

In [38]:
X.head()

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [39]:
y.value_counts()#Product taken

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25, random_state=43)
X_train.shape, X_test.shape

((3666, 17), (1222, 17))

In [41]:
X.info()#Whereever we have categorical feature it marked as object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     4888 non-null   float64
 1   TypeofContact           4888 non-null   object 
 2   CityTier                4888 non-null   int64  
 3   DurationOfPitch         4888 non-null   float64
 4   Occupation              4888 non-null   object 
 5   Gender                  4888 non-null   object 
 6   NumberOfFollowups       4888 non-null   float64
 7   ProductPitched          4888 non-null   object 
 8   PreferredPropertyStar   4888 non-null   float64
 9   MaritalStatus           4888 non-null   object 
 10  NumberOfTrips           4888 non-null   float64
 11  Passport                4888 non-null   int64  
 12  PitchSatisfactionScore  4888 non-null   int64  
 13  OwnCar                  4888 non-null   int64  
 14  Designation             4888 non-null   

In [42]:
cat_features = X.select_dtypes(include="object").columns#store all object datatype feature
num_features = X.select_dtypes(exclude="object").columns#store all neumerical datatype feature

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop = 'first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StanderedScaler", numeric_transformer, num_features) 
    ]
)


In [43]:
X_train = preprocessor.fit_transform(X_train)

In [44]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.702141,-0.775300,0.286971,-0.716411,2.554841,-0.650065,-0.053856,-1.267832,-0.095592,-0.06322
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.702141,-1.019238,1.293705,-0.716411,-0.140424,-0.650065,-0.787818,0.788748,-0.541153,0.64554
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.393199,1.420146,0.286971,0.535430,-0.679478,1.538308,-0.053856,-1.267832,-0.514701,1.35430
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.488538,-0.165454,-0.719763,1.787271,-0.140424,1.538308,-0.053856,-1.267832,0.133848,-0.06322
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-0.702141,-0.531361,0.286971,-0.716411,-0.140424,-0.650065,0.680106,0.788748,-0.445475,1.35430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3661,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.702141,1.908023,0.286971,0.535430,0.398629,-0.650065,-0.053856,0.788748,0.655014,-1.48074
3662,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-0.702141,1.664085,-2.733231,-0.716411,-0.679478,-0.650065,-1.521781,0.788748,-1.101153,-0.77198
3663,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,-0.702141,-1.019238,0.286971,-0.716411,-0.679478,-0.650065,-1.521781,-1.267832,-0.393133,-1.48074
3664,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-0.702141,-0.409392,0.286971,-0.716411,-0.679478,-0.650065,-1.521781,0.788748,-1.131357,-0.77198


In [45]:
X_test = preprocessor.transform(X_test)

In [46]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.488538,-0.287423,-0.719763,-0.716411,0.937682,-0.650065,1.414069,0.788748,0.119027,-0.06322
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.488538,2.273931,0.286971,-0.716411,-0.140424,-0.650065,-0.053856,-1.267832,0.515436,-0.06322
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-0.702141,0.688331,-0.719763,-0.716411,-0.679478,-0.650065,0.680106,0.788748,-1.118037,-0.77198
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.488538,-0.653331,0.286971,0.535430,-0.140424,-0.650065,1.414069,0.788748,1.917217,-1.48074
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,-0.702141,-0.531361,0.286971,0.535430,-0.140424,-0.650065,0.680106,0.788748,-0.083398,-0.06322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1217,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-0.702141,1.420146,0.286971,-0.716411,-0.679478,-0.650065,0.680106,0.788748,-0.491438,2.06306
1218,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.488538,-0.043484,0.286971,-0.716411,-0.140424,-0.650065,-0.787818,0.788748,0.719362,-0.06322
1219,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.488538,1.664085,0.286971,-0.716411,-1.218531,1.538308,-0.787818,0.788748,-0.229354,-1.48074
1220,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.702141,1.908023,0.286971,-0.716411,0.398629,-0.650065,1.414069,-1.267832,0.986136,-0.77198


### Machine learning Trainging random forest

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay,precision_score,recall_score, f1_score, roc_auc_score

In [48]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Training set performance
    model_train_acc= accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_rocauc = roc_auc_score(y_train, y_train_pred)

    #Test set performance
    model_test_acc= accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_rocauc = roc_auc_score(y_test, y_test_pred)

    print(list(models.keys())[i])

    print("Model performance for Training set")
    print("- Accuracy: {:.4f}".format(model_train_acc))
    print("- F1 score: {:.4f}".format(model_train_f1))
    print("- Precision score: {:.4f}".format(model_train_precision))
    print("- Recall score: {:.4f}".format(model_train_recall))
    print("- ROCAUC score: {:.4f}".format(model_train_rocauc))

    print("-----------------------------------------------------")

    print("Model performance for Test set")
    print("- Accuracy: {:.4f}".format(model_test_acc))
    print("- F1 score: {:.4f}".format(model_test_f1))
    print("- Precision score: {:.4f}".format(model_test_precision))
    print("- Recall score: {:.4f}".format(model_test_recall))
    print("- ROCAUC score: {:.4f}".format(model_test_rocauc))
    

Logistic Regression
Model performance for Training set
- Accuracy: 0.8511
- F1 score: 0.8277
- Precision score: 0.7270
- Recall score: 0.3324
- ROCAUC score: 0.6517
-----------------------------------------------------
Model performance for Test set
- Accuracy: 0.8314
- F1 score: 0.8003
- Precision score: 0.6374
- Recall score: 0.2511
- ROCAUC score: 0.6089
Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision score: 1.0000
- Recall score: 1.0000
- ROCAUC score: 1.0000
-----------------------------------------------------
Model performance for Test set
- Accuracy: 0.9051
- F1 score: 0.9046
- Precision score: 0.7556
- Recall score: 0.7359
- ROCAUC score: 0.8402
Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision score: 1.0000
- Recall score: 1.0000
- ROCAUC score: 1.0000
-----------------------------------------------------
Model performance for Test set
- Accuracy: 0.9231
- F1 score: 0.9163
- Pr

In [49]:
#Hyperparameter tuning with random forest
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split":[2,8,15,20],
             "n_estimators" : [100,200,500,1000]
             }

In [50]:
randomcv_models = [
    ("RF", RandomForestClassifier(), rf_params)
]

In [52]:
from sklearn.model_selection import RandomizedSearchCV

model_params = {}
for name, model, param in randomcv_models:
    random = RandomizedSearchCV(estimator=model, param_distributions=param, n_iter=100, cv = 3, verbose=2,n_jobs=-1)
    random.fit(X_train, y_train)
    model_params[name] = random.best_params_

for model_name in model_params:
    print(f"------------------------Best params for{model_name}------------------------------")
    print(model_params[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------------------Best params forRF------------------------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}


In [None]:
models = {
    
    "Random Forest":RandomForestClassifier(n_estimators=1000, min_samples_split=2, max_features=8, max_depth=15)
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Training set performance
    model_train_acc= accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_rocauc = roc_auc_score(y_train, y_train_pred)

    #Test set performance
    model_test_acc= accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_rocauc = roc_auc_score(y_test, y_test_pred)

    print(list(models.keys())[i])

    print("Model performance for Training set")
    print("- Accuracy: {:.4f}".format(model_train_acc))
    print("- F1 score: {:.4f}".format(model_train_f1))
    print("- Precision score: {:.4f}".format(model_train_precision))
    print("- Recall score: {:.4f}".format(model_train_recall))
    print("- ROCAUC score: {:.4f}".format(model_train_rocauc))

    print("-----------------------------------------------------")

    print("Model performance for Test set")
    print("- Accuracy: {:.4f}".format(model_test_acc))
    print("- F1 score: {:.4f}".format(model_test_f1))
    print("- Precision score: {:.4f}".format(model_test_precision))
    print("- Recall score: {:.4f}".format(model_test_recall))
    print("- ROCAUC score: {:.4f}".format(model_test_rocauc))
    

Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision score: 1.0000
- Recall score: 1.0000
- ROCAUC score: 1.0000
-----------------------------------------------------
Model performance for Test set
- Accuracy: 0.9272
- F1 score: 0.9214
- Precision score: 0.9551
- Recall score: 0.6450
- ROCAUC score: 0.8190


In [54]:
# Plot kr skte ho per mera mannn nhi hai

from sklearn.metrics import roc_auc_score, roc_curve
plt.figure()

auc_models = [
    {
        'label': 'Random Forest Classifier',
        'model' : RandomForestClassifier(n_estimators=1000, min_samples_split=2, max_features=8, max_depth=15),
        'auc' : 0.8228
    },
]
for algo in auc_models:
    model = algo['model']
    model.fit(X_train, y_train)

    fpr, tpr, threshhold = roc_curve(y_test,model.predict_proba(X_test)[:,1])


<Figure size 640x480 with 0 Axes>