In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("cc_data.csv", encoding = "UTF-8")

In [2]:
from sklearn.model_selection import train_test_split

df_X1 = pd.get_dummies(df[df.columns[df.columns != 'REALTYPE']].copy()) # get columns that are not 'good cx'
df_X = pd.get_dummies(df_X1[df_X1.columns[df_X1.columns != 'ID']].copy())
df_y = df['REALTYPE'].copy() # get the column named 'REALTYPE'; this is our label

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=1)

print ("Number of training instances: ", len(X_train), "\nNumber of test instances: ", len(X_test))

Number of training instances:  28884 
Number of test instances:  7221


**Normal DT**

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, recall_score, precision_score

# Fix the random seed for decision tree classifier
np.random.seed(seed=0)   
dt = DecisionTreeClassifier(max_depth=None)
dt_model = dt.fit(X_train,y_train)

# Calculate accuracy of DT
print('Decision Tree accuracy for training set: %f' % dt_model.score(X_train, y_train))
print('Decision Tree accuracy for test set: %f' % dt_model.score(X_test, y_test))

y_pred = dt.predict(X_test)

# Calculate F1 score, recall score and precision score
f = f1_score(y_test , y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred)
precision = precision = precision_score(y_test, y_pred)
print('F1 score: %f' %f)
print('recall score: %f' %recall)
print('precision score: %f' %precision)
print('AUC-ROC score: %f' %roc_auc_score(y_test, y_pred))

Decision Tree accuracy for training set: 1.000000
Decision Tree accuracy for test set: 0.988090
f1 score: 0.987985
recall score: 0.228070
precision score: 0.236364
AUC-ROC score: 0.611104


In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, roc_curve
y_pred = dt.predict(X_test)
f = f1_score(y_true = y_test , y_pred = y_pred,average = 'weighted')
print(f)
# interpretation:
# Because the data is imbalanced.

0.9879847866296657


#### For normal decision tree, we get accuracy 1 for train set and accuracy 0.988 for test set which are very good performance. However, becasue the data is imbalanced, the accuracy is not accurate enough. Therefore, we also calculate the recall score, precision score and F1 score, get 0.228, 0.236 and 0.988 respectively, and figure out that the number of false nagative and false positive instances are great. Besides, we get 0.611 for AUC-ROC score, which can aggregate measure the predictions is 61.1% correct.  


**Near Miss Undersampling**

In [9]:
import imblearn
from imblearn.under_sampling import NearMiss 
undersample = NearMiss()
# transform the dataset

X_nm, y_nm = undersample.fit_resample(df_X, df_y)

In [8]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

X_train, X_test, y_train, y_test = train_test_split(X_nm, y_nm, test_size=0.2, random_state=1)

print ("Number of training instances: ", len(X_train), "\nNumber of test instances: ", len(X_test))


np.random.seed(seed=0)   
dt = DecisionTreeClassifier(max_depth=None)
dt_model = dt.fit(X_train,y_train)

# Calculate accuracy
print('Decision Tree accuracy for training set: %f' % dt_model.score(X_train, y_train))
print('Decision Tree accuracy for test set: %f' % dt_model.score(X_test, y_test))
y_pred = dt.predict(X_test)

# Calculate F1 score, recall score and precision score
f = f1_score(y_true = y_test , y_pred = y_pred,average = 'weighted')
recall = recall_score(y_test, y_pred)
precision = precision = precision_score(y_test, y_pred)
print('F1 score: %f' %f)
print('recall score: %f' %recall)
print('precision score: %f' %precision)
print('AUC-ROC score: %f' %roc_auc_score(y_test, y_pred))

Number of training instances:  422 
Number of test instances:  106
Decision Tree accuracy for training set: 1.000000
Decision Tree accuracy for test set: 0.688679
f1 score: 0.689095
recall score: 0.606557
precision score: 0.804348
AUC-ROC score: 0.703279


#### To reduce the effects of imbalance data, we use near miss undersampling, we get accuracy 1 for train set and accuracy 0.688 for test set which is much lower than decision tree result. We also calculate the recall score, precision score and F1 score, get 0.606, 0.804 and 0.689 respectively, and figure out that the number of false nagative and false positive instances are much smaller than normal decision tree. Besides, we get 0.703 for AUC-ROC score, which can aggregate measure the predictions is 70.3% correct. It is also much greater than normal decision tree result.


In [10]:
pd.value_counts(df_y)

0.0    35841
1.0      264
Name: REALTYPE, dtype: int64

In [11]:
pd.value_counts(y_nm)

0.0    264
1.0    264
Name: REALTYPE, dtype: int64

**SMOTE oversampling**

In [16]:
from imblearn.over_sampling import SMOTE
from numpy import mean
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from collections import Counter
from imblearn.over_sampling import SVMSMOTE
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where

counter = Counter(df_y)
print('before', counter)
sm = SMOTE()
X_res, y_res = sm.fit_resample(df_X, df_y)
counter = Counter(y_res)
print('after', counter)


before Counter({0.0: 35841, 1.0: 264})
after Counter({0.0: 35841, 1.0: 35841})


In [17]:
k_values = [1, 2, 3, 4, 5, 6, 7, 44, 60, 99]
for k in k_values:
	# define pipeline
	model = DecisionTreeClassifier()
	over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
	under = RandomUnderSampler(sampling_strategy=0.5)
	steps = [('over', over), ('under', under), ('model', model)]
	pipeline = Pipeline(steps=steps)
	# evaluate pipeline
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_val_score(pipeline, df_X, df_y, scoring='roc_auc', cv=cv, n_jobs=-1)

	score = mean(scores)
	print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

print('Decision Tree accuracy for training set: %f' % dt_model.score(X_train, y_train))
print('Decision Tree accuracy for test set: %f' % dt_model.score(X_test, y_test))
y_pred = dt.predict(X_test)
f = f1_score(y_test , y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred)
precision = precision = precision_score(y_test, y_pred)
print('f1 score: %f' %f)
print('recall score: %f' %recall)
print('precision score: %f' %precision)



> k=1, Mean ROC AUC: 0.673
> k=2, Mean ROC AUC: 0.664
> k=3, Mean ROC AUC: 0.666
> k=4, Mean ROC AUC: 0.671
> k=5, Mean ROC AUC: 0.670
> k=6, Mean ROC AUC: 0.660
> k=7, Mean ROC AUC: 0.669
> k=44, Mean ROC AUC: 0.671
> k=60, Mean ROC AUC: 0.670
> k=99, Mean ROC AUC: 0.665
Decision Tree accuracy for training set: 1.000000
Decision Tree accuracy for test set: 0.688679
f1 score: 0.689095
recall score: 0.606557
precision score: 0.804348


####We use SMOTE oversampling to reduce the effects of imbalance data. we get accuracy 1 for train set and accuracy 0.689 for test set which is much lower than decision tree result. We also calculate the recall score, precision score and F1 score, get 0.607, 0.804 and 0.689 respectively, and figure out that the number of false nagative and false positive instances are much smaller than normal decision tree. Besides, by tring different K value for AUC-ROC score, AUC-ROC score are less than 0.68 which measure the predictions is less than 68% correct. But it is still much greater than normal decision tree result.

##Using Validation


In [18]:
X_train, X_rem, y_train, y_rem = train_test_split(df_X, df_y, train_size=0.8)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(28884, 53)
(28884,)
(3610, 53)
(3610,)
(3611, 53)
(3611,)


(None, None)

###DT

In [19]:
from pandas._libs.lib import fast_unique_multiple_list_gen

np.random.seed(seed=3244)   
dt = DecisionTreeClassifier()
dt_model = dt.fit(X_train,y_train)

print('Decision Tree accuracy for training set: %f' % dt_model.score(X_train, y_train))
print('Decision Tree accuracy for validation set: %f' % dt_model.score(X_valid, y_valid))
print('Decision Tree accuracy for test set: %f' % dt_model.score(X_test, y_test))

y_pred_valid = dt.predict(X_valid)
y_pred_test = dt.predict(X_test)

f_valid = f1_score(y_valid , y_pred_valid, average = 'weighted')
f_test = f1_score(y_test , y_pred_test, average = 'weighted')
print('f1 score of valid set: %f' %f_valid)
print('f1 score of test set: %f' %f_test)

recall_valid = recall_score(y_valid, y_pred_valid)
recall_test = recall_score(y_test, y_pred_test)
print('recall score of valid set: %f' %recall_valid)
print('recall score of test set: %f' %recall_test)

precision_valid = precision_score(y_valid, y_pred_valid)
precision_test = precision_score(y_test, y_pred_test)
print('precision score of valid set: %f' %precision_valid)
print('precision score of test set: %f' %precision_test)

rocauc_valid = roc_auc_score(y_valid, y_pred_valid)
rocauc_test = roc_auc_score(y_test, y_pred_test)
print('AUC-ROC score of valid set: %f' %rocauc_valid)
print('AUC-ROC score of test set: %f' %rocauc_test)

Decision Tree accuracy for training set: 1.000000
Decision Tree accuracy for validation set: 0.988366
Decision Tree accuracy for test set: 0.987538
f1 score of valid set: 0.987904
f1 score of test set: 0.988028
recall score of valid set: 0.148148
recall score of test set: 0.310345
precision score of valid set: 0.173913
precision score of test set: 0.264706
AUC-ROC score of valid set: 0.571423
AUC-ROC score of test set: 0.651683


####For normal decision tree, compared to not use validation, the accuracy for test set is slightly lower, but F1 score, recall score and precison score are much greater. Besides, AUC-ROC score of test set increases to 0.65 which is around 4% improving.

###Near Miss undersampling

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_nm, y_nm, test_size=0.2, random_state=1)

X_train, X_rem, y_train, y_rem = train_test_split(X_nm, y_nm, train_size=0.8)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)


(422, 53)
(422,)
(53, 53)
(53,)
(53, 53)
(53,)


(None, None)

In [21]:
np.random.seed(seed=3244)   
dt = DecisionTreeClassifier()
dt_model = dt.fit(X_train,y_train)

print('Decision Tree accuracy for training set: %f' % dt_model.score(X_train, y_train))
print('Decision Tree accuracy for validation set: %f' % dt_model.score(X_valid, y_valid))
print('Decision Tree accuracy for test set: %f' % dt_model.score(X_test, y_test))

y_pred_valid = dt.predict(X_valid)
y_pred_test = dt.predict(X_test)

f_valid = f1_score(y_valid , y_pred_valid, average = 'weighted')
f_test = f1_score(y_test , y_pred_test, average = 'weighted')
print('f1 score of valid set: %f' %f_valid)
print('f1 score of test set: %f' %f_test)

recall_valid = recall_score(y_valid, y_pred_valid)
recall_test = recall_score(y_test, y_pred_test)
print('recall score of valid set: %f' %recall_valid)
print('recall score of test set: %f' %recall_test)

precision_valid = precision_score(y_valid, y_pred_valid)
precision_test = precision_score(y_test, y_pred_test)
print('precision score of valid set: %f' %precision_valid)
print('precision score of test set: %f' %precision_test)

rocauc_valid = roc_auc_score(y_valid, y_pred_valid)
rocauc_test = roc_auc_score(y_test, y_pred_test)
print('AUC-ROC score of valid set: %f' %rocauc_valid)
print('AUC-ROC score of test set: %f' %rocauc_test)

Decision Tree accuracy for training set: 1.000000
Decision Tree accuracy for validation set: 0.773585
Decision Tree accuracy for test set: 0.641509
f1 score of valid set: 0.776025
f1 score of test set: 0.645479
recall score of valid set: 0.718750
recall score of test set: 0.600000
precision score of valid set: 0.884615
precision score of test set: 0.521739
AUC-ROC score of valid set: 0.787946
AUC-ROC score of test set: 0.633333


####For DT with near miss undersampling, compared to not use validation, the accuracy, F1 score, recall score, precision score and AUC-ROC score of test set are all slightly lower. 

###SMOTE oversampling

In [22]:
counter = Counter(df_y)
print('before', counter)
sm = SMOTE(random_state=3244)
X_res, y_res = sm.fit_resample(df_X, df_y)
counter = Counter(y_res)
print('after', counter)

X_train, X_rem, y_train, y_rem = train_test_split(X_res, y_res, train_size=0.8)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

np.random.seed(seed=3244)   
dt = DecisionTreeClassifier()
dt_model = dt.fit(X_train, y_train)



before Counter({0.0: 35841, 1.0: 264})
after Counter({0.0: 35841, 1.0: 35841})
(57345, 53)
(57345,)
(7168, 53)
(7168,)
(7169, 53)
(7169,)
