# Model Development without Hyperparameter Tuning

In [None]:
# Import Package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the preprocessed dataset from disk.
data = pd.read_csv("preprocessd_data.csv")
data

# Step 2: separate the feature matrix (X) from the target vector (y).
data.shape
data.columns
# "churn" is the target column; every remaining column is used as a feature.
y = data["churn"]
X = data.drop(columns="churn")
X.shape, y.shape

# Train/test split: 20% of the rows are held out, with a fixed seed for the shuffle.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Build and fit the baseline model (default hyperparameters, no tuning).
from sklearn.tree import DecisionTreeClassifier, plot_tree

# NOTE: no random_state is passed here, so the fitted tree may differ between runs.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Visualize the fitted tree.
plt.figure(figsize=(10, 6))
# Pass the column names as a plain list: some scikit-learn releases validate
# feature_names strictly and reject a pandas Index.
plot_tree(dt, feature_names=list(X.columns), filled=True, rounded=True)
plt.show()

# Predict a class label for every row of the held-out test set.
y_pred = dt.predict(X_test)
y_pred

# Compare the first ten actual labels with the first ten predicted labels.
for preview in (y_test.values, y_pred):
    print(preview[:10])

y_test

# Build a per-row comparison table of actual vs. predicted labels.
# The frame keeps y_test's index, so each row can be traced back to the source data.
pred_df = pd.DataFrame({"Ground_Truth_data": y_test})
pred_df["Model_predictions"] = y_pred

# 1 where the prediction equals the ground truth, 0 otherwise.
# The comparison is done positionally on the underlying arrays (vectorized),
# because y_pred is a plain array that carries no index.
is_match = pred_df["Ground_Truth_data"].to_numpy() == y_pred
pred_df["Match_NotMatch"] = is_match.astype("int64")

# Accuracy = fraction of rows where prediction and ground truth agree.
accuracy = pred_df["Match_NotMatch"].sum() / len(pred_df)
accuracy

pred_df

# Metrics

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,\
                            confusion_matrix,ConfusionMatrixDisplay,roc_auc_score,auc

# Confusion matrix display

# Compute the matrix once and reuse it for the raw counts and for the plot.
cmt = confusion_matrix(y_test, y_pred)
cmt

# Flatten the 2x2 matrix into its four cells (binary target).
tn, fp, fn, tp = cmt.ravel()
print("true positives are:", tp)
print("true negatives are:", tn)
print("false positive are:", fp)
print("false negative are:", fn)

disp = ConfusionMatrixDisplay(cmt, display_labels=["No", "Yes"])
disp.plot()
# Show the matrix as its own figure (as the tuned-model section does) before
# any further pyplot calls, so later plots do not land on this figure.
plt.show()

# Precision, recall, accuracy and F1 on the test set, each rounded to 2 decimals.
pr, rc, acc, f1 = (
    round(scorer(y_test, y_pred), 2)
    for scorer in (precision_score, recall_score, accuracy_score, f1_score)
)

for caption, value in (
    ("The precision score is :", pr),
    ("The recall score is:", rc),
    ("The accuracy is:", acc),
    ("The f1 score is:", f1),
):
    print(caption, value)

# Collect the four scores in a one-column summary table for the decision tree.
dt_metrics = [pr, rc, acc, f1]
metrics_df = pd.DataFrame(
    data=dt_metrics,
    index=["Precision", "Recall", "Accuracy", "F1 Score"],
    columns=["Decision Tree"],
)
metrics_df

# Column 1 of predict_proba: the predicted probability of the second class in
# dt.classes_. Computed once and reused for the ROC curve below.
y_true = y_test
prob_yes = dt.predict_proba(X_test)[:, 1]
prob_yes

# ROC Curve
from sklearn.metrics import roc_curve

fpr, tpr, threshold = roc_curve(y_true, prob_yes)
print("fpr:", fpr)
print("tpr:", tpr)

# Start a dedicated figure so the curve is never drawn onto axes left over
# from an earlier plot in the same session.
plt.figure()
# The diagonal is the reference line of a random classifier.
plt.plot([0, 1], [0, 1], color="navy", lw=2, label="Random-Model")
plt.plot(fpr, tpr, color="darkorange", lw=2, label="Decision Tree Model")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristics: ROC-AUC")
plt.legend()
plt.show()

# AUC: area under the ROC curve computed from the (fpr, tpr) points.
auc_score = auc(fpr, tpr)
print("The Auc Score is:", auc_score)





# Model Development Using Hyperparameter Tuning - Decision Tree

In [None]:
# Import data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,ConfusionMatrixDisplay,classification_report,roc_auc_score,roc_curve,auc

# Load the preprocessed dataset from disk.
data = pd.read_csv("preprocessd_data.csv")
data

# Step 2: split the columns into features (X) and target (y).
data.shape
data.columns
# "churn" is the target column; all other columns are treated as features.
y = data["churn"]
X = data.drop(columns="churn")
X.shape, y.shape

# Train/test split: keep 20% of the rows for testing, shuffled with a fixed seed.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Base estimator whose hyperparameters the grid search will tune.
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# The name must be grid_tree: get_params() below and the GridSearchCV setup
# both refer to the estimator by this name.
grid_tree = DecisionTreeClassifier()
grid_tree

# List every hyperparameter the estimator exposes, with its current value.
grid_tree.get_params()

# Hyperparameter candidates, keyed by DecisionTreeClassifier parameter name.
# The grid search evaluates every combination of the listed values.
param_grid = {
    # Splitting criterion used by the tree
    "criterion": ["gini", "entropy"],
    # Maximum depth of the tree
    "max_depth": [3, 4, 5, 6, 7, 8],
    # Minimum number of samples required to split a node
    "min_samples_split": [2, 3, 4],
    # Minimum number of samples required in a leaf
    "min_samples_leaf": [1, 2, 3, 4],
    # Seeds tried for the estimator's random_state
    "random_state": [0, 42],
}

# Step-7: Configure the grid search (accuracy scoring, 5-fold cross-validation).
grid_search = GridSearchCV(
    estimator=grid_tree,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=True,
)
grid_search

dir(grid_search)

# Step-8: Run the search on the training data only.
grid_search.fit(X_train, y_train)

# Step-9: Report the winning parameter combination and its score.
best_params = grid_search.best_params_
# best_score_ is the mean cross-validated score of the best candidate
# (accuracy here, because the search uses scoring="accuracy"); it is not the
# accuracy measured on the full training set, so label it accordingly.
best_score = grid_search.best_score_
print("best params:", best_params)
print("best mean CV accuracy:", best_score)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed hyperparameters for the tuned tree.
# NOTE(review): presumably the best_params reported by the grid search — confirm.
tuned_params = {
    "criterion": "entropy",
    "max_depth": 6,
    "min_samples_leaf": 3,
    "min_samples_split": 2,
    "random_state": 0,
}
dtree = DecisionTreeClassifier(**tuned_params)
dtree.fit(X_train, y_train)

# ---------------------------- Step-5: Predictions ----------------------------

y_pred_dt = dtree.predict(X_test)

# ---------------------------- Step-6: Metrics --------------------------------

# Accuracy is reported as a percentage; the other scores stay on the 0-1 scale.
raw_accuracy = accuracy_score(y_test, y_pred_dt)
acc_dt = round(raw_accuracy * 100, 2)
f1_dt, precision_dt, recall_dt = (
    round(scorer(y_test, y_pred_dt), 2)
    for scorer in (f1_score, precision_score, recall_score)
)

for caption, value in (
    ("accuray is:", acc_dt),
    ("F1 is:", f1_dt),
    ("Precision is:", precision_dt),
    ("Recall is:", recall_dt),
):
    print(caption, value)

# Per-class breakdown of the same metrics.
report = classification_report(y_test, y_pred_dt)
print(report)

# ---------------------------- Step-7: Confusion matrix -----------------------

# Compute the matrix once; it feeds both the plot and the raw counts below.
cmt = confusion_matrix(y_test, y_pred_dt)

disp = ConfusionMatrixDisplay(confusion_matrix=cmt,
                              display_labels=[False, True])
disp.plot()
plt.grid(False)  # switch off grid lines on the matrix image
plt.show()

# Flatten the 2x2 matrix into its four cells (binary target).
tn, fp, fn, tp = cmt.ravel()
print("True negative:", tn)
print("False postive:", fp)
print("False negative:", fn)
print("True postive:", tp)

# ---------------------------- Step-8: ROC-AUC curve --------------------------

# Column 1 of predict_proba (class-1 probabilities) drives the ROC curve.
y_dt_pred_prob = dtree.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_dt_pred_prob)

# First the diagonal reference line, then the model's own curve.
for xs, ys, colour, legend_label in (
    ([0, 1], [0, 1], "navy", "Random-Model"),
    (fpr, tpr, "darkorange", "Decision-Tree Model"),
):
    plt.plot(xs, ys, color=colour, lw=2, label=legend_label)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic :ROC-AUC")
plt.legend()
plt.show()

print("Computed Area Under the Curve (AUC)", auc(fpr, tpr))

In [None]:
# A fitted decision tree also exposes one importance score per feature.
# NOTE(review): the original note called these "information gain values",
# presumably because dtree is fitted with criterion='entropy' — confirm.
dtree.feature_importances_

In [None]:
# Pair every training column with its importance score from the fitted tree.
importance_columns = {
    "Feature Name": X_train.columns,
    "Importance": dtree.feature_importances_,
}
imp_df = pd.DataFrame(data=importance_columns)

# Rank the features from most to least important.
fi = imp_df.sort_values("Importance", ascending=False)
fi

# These importance scores are specific to the decision tree fitted above.
# You can retrain the decision tree using only the top 10 features.
# Do not reuse this feature ranking to select features for a different model.