In [1]:
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Read the sample CSV into a DataFrame, then display its (rows, columns)
df = pd.read_csv(filepath_or_buffer="/content/data_sample.csv")
df.shape

(3079, 294)

In [3]:
# Bucket ResponseRate into classes; the first matching condition wins:
#   exactly 0 -> 'B1', strictly between 0 and 0.15 -> 'B2', 0.15 or more -> 'B3'.
# Rows matching none of the conditions (e.g. negative or NaN rates) get 'Not Specified'.
df['target'] = np.select(
    condlist=[
        df['ResponseRate'] == 0,
        (df['ResponseRate'] > 0) & (df['ResponseRate'] < 0.15),
        df['ResponseRate'] >= 0.15,
    ],
    choicelist=['B1', 'B2', 'B3'],
    default='Not Specified',
)

In [4]:
# Columns excluded from the modelling frame; 'ResponseRate' is the column
# that 'target' was derived from, so it must not remain among the features.
columns_to_remove = ['OfferHistoryID', 'ResponseRate']

# Rebind df to the frame without those columns
df = df.drop(columns=columns_to_remove)

In [5]:
# Check dimensions after adding 'target' and dropping the two columns above
df.shape

(3079, 293)

In [6]:
# Number of missing values in each column
df.isna().sum()

s_239_7                         1
s_239_8                         1
s_239_9                         1
s_239_10                        1
s_241_11                        1
                               ..
ot_108~Subject: Undetermined    1
ot_108~Tags                     1
ot_108~VOUCHER                  1
ot_108~race                     1
target                          0
Length: 293, dtype: int64

In [7]:
# Re-display dimensions; the null count above only reads the frame, so they are unchanged
df.shape

(3079, 293)

In [8]:
# Total number of missing cells across the whole frame
df.isna().sum().sum()

3804

In [9]:
# Number of populated (non-missing) values in each column
df.notna().sum()

s_239_7                         3078
s_239_8                         3078
s_239_9                         3078
s_239_10                        3078
s_241_11                        3078
                                ... 
ot_108~Subject: Undetermined    3078
ot_108~Tags                     3078
ot_108~VOUCHER                  3078
ot_108~race                     3078
target                          3079
Length: 293, dtype: int64

In [12]:
# Replace every missing cell with 0 before modelling
df = df.fillna(value=0)

In [13]:
# Filling missing values does not add or remove rows/columns; confirm the dimensions
df.shape

(3079, 293)

In [14]:
# Separate the class label from the predictors
y = df['target']                    # label vector
X = df.drop(columns=['target'])     # feature matrix: every column other than 'target'

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [16]:
# Decision tree limited to depth 4, seeded so repeated fits give the same tree
decision_tree_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=4,
)

# Train on the training split (the fitted estimator is the cell's displayed value)
decision_tree_model.fit(X_train, y_train)

In [17]:
# Fraction of held-out rows whose predicted class matches the true class
accuracy = accuracy_score(y_test, decision_tree_model.predict(X_test))
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.9204545454545454


In [18]:
# Class predictions for both splits, consumed by the metric tables further down
y_pred_train, y_pred_test = (
    decision_tree_model.predict(split) for split in (X_train, X_test)
)

In [19]:
def evaluate_metrics(df):
    """Build a table of per-label and overall classification metrics.

    Each distinct value in ``df['label']`` is scored as a one-vs-rest binary
    problem: a row is "positive" when it equals that value and "negative"
    otherwise, for both the true and the predicted column. The per-label
    ``Accuracy`` is therefore the accuracy of that binary problem, not of the
    multi-class prediction. A final row labelled ``'Overall'`` holds the
    multi-class accuracy and the report's ``'weighted avg'`` precision,
    recall and F1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'label'`` column (true classes) and a ``'pred'``
        column (predicted classes).

    Returns
    -------
    pandas.DataFrame
        Columns ``Label``, ``Accuracy``, ``Precision``, ``Recall`` and
        ``F1 Score``. One row per value of ``df['label']`` in order of first
        appearance, followed by the ``'Overall'`` row. Values that occur only
        in ``df['pred']`` do not get a row of their own.
    """
    results = {'Label': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

    for label in df['label'].unique():
        # Reduce to a boolean one-vs-rest problem for this label.
        true_values = df['label'] == label
        predicted_values = df['pred'] == label

        accuracy = accuracy_score(true_values, predicted_values)
        # The inputs are boolean, so the entry for the positive class (rows
        # equal to `label`) is stored under the string key 'True'.
        report = classification_report(true_values, predicted_values, output_dict=True)['True']

        results['Label'].append(label)
        results['Accuracy'].append(accuracy)
        results['Precision'].append(report['precision'])
        results['Recall'].append(report['recall'])
        results['F1 Score'].append(report['f1-score'])

    # Overall metrics on the original multi-class labels
    overall_accuracy = accuracy_score(df['label'], df['pred'])
    overall_report = classification_report(df['label'], df['pred'], output_dict=True)['weighted avg']

    # Write the 'Overall' tag directly: routing it through a numeric sentinel
    # and a later replace() would also rename a genuine class label that
    # happened to equal the sentinel.
    results['Label'].append('Overall')
    results['Accuracy'].append(overall_accuracy)
    results['Precision'].append(overall_report['precision'])
    results['Recall'].append(overall_report['recall'])
    results['F1 Score'].append(overall_report['f1-score'])

    return pd.DataFrame(results)

Test set metrics


In [20]:
# Pair each true test label with the model's prediction for the same row.
# NOTE: this rebinds `df`, so the feature DataFrame loaded at the top of the
# notebook is no longer reachable under that name once this cell has run.
df = pd.DataFrame({
    'label': y_test,
    'pred': y_pred_test.flatten()  # This ensures y_pred_test is 1-dimensional
})

In [21]:
# Per-label and overall metrics for the test-split label/prediction frame built above
evaluate_metrics(df)

Unnamed: 0,Label,Accuracy,Precision,Recall,F1 Score
0,B3,0.920455,0.874286,0.85,0.861972
1,B1,0.957792,0.925287,1.0,0.961194
2,B2,0.962662,0.989247,0.807018,0.888889
3,Overall,0.920455,0.922221,0.920455,0.918819


Training set metrics


In [22]:
# Pair each true training label with the model's prediction for the same row.
# NOTE: this rebinds `df` again, replacing whatever frame it previously held.
df = pd.DataFrame({
    'label': y_train,
    'pred': y_pred_train.flatten()  # This ensures y_pred_train is 1-dimensional
})

In [23]:
# Per-label and overall metrics for the training-split label/prediction frame built above
evaluate_metrics(df)

Unnamed: 0,Label,Accuracy,Precision,Recall,F1 Score
0,B1,0.966301,0.940544,1.0,0.969361
1,B3,0.935851,0.893455,0.86963,0.881381
2,B2,0.969549,0.987805,0.852632,0.915254
3,Overall,0.935851,0.936754,0.935851,0.934815
