In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
## Import for partitioning the data
from sklearn.model_selection import train_test_split
## Classifier Imports
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Imports for showing the data
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn import tree
# Import for saving the model
from joblib import dump, load


### Load the bank data #####################################################################################
BANK_CSV_URL = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv"
dat = pd.read_csv(BANK_CSV_URL)
############################################################################################################

### Encode the data ########################################################################################
# 'contact' and 'day_of_week' are removed from both halves, so they never reach the model.
unused_cols = ["contact", "day_of_week"]
numeric_cols = [
    "age", "campaign", "pdays", "previous", "emp.var.rate",
    "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
]
categorical_cols = [
    "job", "marital", "education", "default", "housing",
    "loan", "month", "poutcome", "y",
]

# Each half is built by dropping the other half's columns, so the numeric
# columns are never passed through the label encoder.
dat_cat = dat.drop(columns=numeric_cols + unused_cols)
dat_num = dat.drop(columns=categorical_cols + unused_cols)

# Replace every value in the categorical half (target 'y' included) with an
# integer label, then put the numeric columns back in front of the encoded ones.
encoded_cat = dat_cat.apply(preprocessing.LabelEncoder().fit_transform)
dat_encoded = pd.concat([dat_num, encoded_cat], axis=1)
############################################################################################################

### Split the rows into a "high" and a "low" economy by consumer confidence ###############################
# Rows with cons.conf.idx of at least -40 go to the high set; rows strictly below -40 go to the low set.
conf_idx = dat_encoded["cons.conf.idx"]
dat_high = dat_encoded.loc[conf_idx.ge(-40)]
dat_low = dat_encoded.loc[conf_idx.lt(-40)]
############################################################################################################

### Creating the features and the target ###################################################################
# Features are every column except the encoded target 'y'; one (X, y) pair is
# built for the full data set and one for each economy subset.
X = dat_encoded.drop('y', axis=1)
y = dat_encoded['y']

X_high = dat_high.drop('y', axis=1)
y_high = dat_high['y']

X_low = dat_low.drop('y', axis=1)
y_low = dat_low['y']

## Partitioning the data into training (80%) and test (20%) data.
# Each split must be fed its own subset: the high/low splits previously
# re-split the full (X, y), so the "high" and "low" models never saw their
# economy-specific rows.
# random_state is fixed so a Restart & Run All reproduces the same partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(
    X_high, y_high, test_size=0.20, random_state=42)
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(
    X_low, y_low, test_size=0.20, random_state=42)
############################################################################################################

#### OVERSAMPLING ##########################################################################################
from imblearn.over_sampling import RandomOverSampler
# https://imbalanced-learn.readthedocs.io/en/stable/user_guide.html


# Random over-sampling draws rows of the minority class of `y` with replacement
# until both classes of `y` have the same number of rows.
# random_state is fixed so the resampled training sets are reproducible.
ro = RandomOverSampler(random_state=42)

# Only the TRAINING partitions are oversampled. X and y are resampled in the
# same call so features and targets stay in sync.
X_train_new, y_train_new = ro.fit_resample(X_train, y_train)
X_train_new_high, y_train_new_high = ro.fit_resample(X_train_high, y_train_high)
X_train_new_low, y_train_new_low = ro.fit_resample(X_train_low, y_train_low)

# The test partitions are deliberately left untouched: duplicating minority
# rows in the held-out data would make the confusion matrix and the
# precision/recall figures describe an artificially balanced population rather
# than the real class distribution. The *_test_new names are kept so the
# prediction/reporting code further down keeps working unchanged.
X_test_new, y_test_new = X_test, y_test
X_test_new_high, y_test_new_high = X_test_high, y_test_high
X_test_new_low, y_test_new_low = X_test_low, y_test_low
############################################################################################################

#### Three different types of classifiers that were tried ##################################################

### OG Classifier
### Was nice, but only can do so much
#classifier = DecisionTreeClassifier()
#classifier.fit(X_train_new, y_train_new)


### Forest Classifier
### A forest of decision trees.
### Best precision of the three at ~0.86, but recall was low at ~0.38
#clf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
#clf = clf.fit(X_train_new, y_train_new)

## Bagging Classifier (the one actually used)
# An ensemble of k-nearest-neighbour estimators, each fitted on half of the
# training rows and a quarter of the feature columns.
# Author's recorded result for this model: recall ~0.64, precision ~0.80
# (measured before the fixes in this notebook; re-check after re-running).


def build_bagged_knn():
    """Return a fresh, unfitted bagging ensemble of KNN classifiers.

    A new estimator is created on every call so that the general, high-economy
    and low-economy models never share fitted state.
    """
    return BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.5,
        max_features=0.25,
        random_state=42,  # fixed so the sampled rows/features are reproducible
    )


# `fit` returns the fitted estimator itself, so build-and-fit can be chained.
clf = build_bagged_knn().fit(X_train_new, y_train_new)

# Previously this line called `clf.fit(...)`, which re-trained the general
# model on the high-economy data and left `clf_high` as an alias of `clf`.
clf_high = build_bagged_knn().fit(X_train_new_high, y_train_new_high)

clf_low = build_bagged_knn().fit(X_train_new_low, y_train_new_low)
#############################################################################################################

#### Persist the fitted models ##############################################################################
# Each fitted classifier is written to its own joblib file in the working directory.
models_to_save = (
    (clf, 'teddyjustrightModel.joblib'),
    (clf_high, 'teddy2highModel.joblib'),
    (clf_low, 'teddy2lowModel.joblib'),
)
for fitted_model, model_path in models_to_save:
    dump(fitted_model, model_path)
#############################################################################################################

## Prediction time ###########################################################################################
# Score each model on its own test partition.
y_pred = clf.predict(X_test_new)
y_pred_high = clf_high.predict(X_test_new_high)
y_pred_low = clf_low.predict(X_test_new_low)

## Displaying the results
# For the general, high-economy and low-economy models (in that order), print
# the confusion matrix followed by the per-class precision/recall report.
evaluations = (
    (y_test_new, y_pred),
    (y_test_new_high, y_pred_high),
    (y_test_new_low, y_pred_low),
)
for actual, predicted in evaluations:
    print(confusion_matrix(actual, predicted))
    print(classification_report(actual, predicted))



### Showing the tree, but it takes a hot minute cause the tree is so big
#fig, ax = plt.subplots(figsize=(20, 20))
#tree.plot_tree(classifier, fontsize=10, feature_names=X.columns)
#plt.show()
##############################################################################################################

ModuleNotFoundError: No module named 'sklearn'

In [None]:
import seaborn as sns
import pandas as pd

# Load the bank CSV (same URL as the modelling cell above).
banco_raw = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv')

# Keep only the rows that have no "unknown" entry in any column. One row-wise
# mask replaces the per-column filtering loop, and .copy() gives an independent
# frame so the column assignments below do not write into a view of banco_raw.
banco = banco_raw[banco_raw.ne("unknown").all(axis=1)].copy()

# Turn the text columns into integer codes. Numeric columns keep their real
# values: factorize numbers codes by order of first appearance, which would
# destroy the ordering a correlation relies on.
for col in banco.select_dtypes(exclude="number").columns:
    banco[col] = pd.factorize(banco[col])[0]

# NOTE(review): the original passed the whole frame (one annotated heatmap cell
# per row x column) to sns.heatmap, which is impractical to render or read for
# a data set of this size. Plotting the column-by-column correlation matrix is
# assumed to be the intent -- confirm with the author.
ax = sns.heatmap(banco.corr(), annot=True, fmt=".2f")