In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix
from sklearn.metrics import confusion_matrix
from IPython.display import Image
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import export_text
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn import metrics

In [None]:
# Load the dataset from a CSV file located relative to the working directory
# (the file name suggests it is already cleaned and label-encoded).
df = pd.read_csv(filepath_or_buffer="Cleaned_data_Label_Encoding_version_final.csv")

In [None]:
## Normalize
# Separate the target column ("Price") from the feature columns.
y = df["Price"]
X = df.drop(columns=["Price"])

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the min/max statistics (train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=240)

# Fit the scaler on the training features only, then apply the very same
# transformation to the test features.  The original row index is preserved
# so both frames stay aligned with y_train / y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)
def create_cm(t1, t2):
    """Draw the confusion matrix of two label sequences as a colour image.

    Parameters
    ----------
    t1 : array-like
        Labels forwarded as the first argument of ``confusion_matrix``;
        the y-axis of the plot is labelled 'True label'.
    t2 : array-like
        Labels forwarded as the second argument of ``confusion_matrix``;
        the x-axis of the plot is labelled 'Predicted label'.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    matrix = confusion_matrix(t1, t2)

    # matshow opens its own figure for the matrix image.
    plt.matshow(matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

In [None]:
## Oversampling Minority
# Class distribution of the target over the full dataset, before any
# resampling is applied.
df.loc[:, "Price"].value_counts()

In [None]:
# Re-attach the target to the training features so rows can be filtered by
# class.  NOTE: X is bound to the very same object as X_train (no copy), so
# X_train itself gains the 'Price' column here; the decision-tree cell further
# down drops it again before fitting.
X = X_train
X['Price'] = y_train

# One frame per Price class.  The variable names are kept because later cells
# refer to them; the original comments label class 1 as the majority and
# classes 0 and 2 as minorities (compare with the value counts printed above).
not_churn = X[X['Price'].eq(1)]
churn = X[X['Price'].eq(0)]
churn_2 = X[X['Price'].eq(2)]

# Shape of each class frame, one per line.
print(not_churn.shape, churn.shape, churn_2.shape, sep="\n")

In [None]:
# upsample minority
# Each of the two smaller class frames is sampled WITH replacement until it
# has as many rows as `not_churn`; the fixed random_state keeps the draw
# reproducible.
churn_upsampled = resample(churn, replace=True, # sample with replacement
                           n_samples=len(not_churn), # match number in majority class
                           random_state=27) # reproducible results

churn_upsampled_2 = resample(churn_2, replace=True, # sample with replacement
                           n_samples=len(not_churn), # match number in majority class
                           random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_churn, churn_upsampled, churn_upsampled_2])

# check new class counts -- printed explicitly, because a bare expression in
# the middle of a cell is evaluated but never displayed
print(upsampled['Price'].value_counts())

# split our X and y back out
y_train_over = upsampled['Price']
X_train_over = upsampled.drop('Price', axis=1)

In [None]:
## Undersampling Majority
# Same preparation as for oversampling: X is bound to the X_train object
# itself (no copy) and the target is written into it as a 'Price' column.
X = X_train
X['Price'] = y_train

# One frame per Price class (1, 0 and 2).
not_churn = X[X['Price'].eq(1)]
churn = X[X['Price'].eq(0)]
churn_2 = X[X['Price'].eq(2)]

# Shapes of the class-1 and class-0 frames, followed by the class-1 row count.
print(not_churn.shape, churn.shape, len(not_churn), sep="\n")

In [None]:
# Draw, without replacement, as many class-1 rows as there are class-0 rows
# (n_samples = len(churn)).  The fixed random_state keeps the draw
# reproducible.
not_churn_downsampled = resample(
    not_churn,
    n_samples=len(churn),
    replace=False,
    random_state=27,
)

# Stack the reduced class-1 frame with the class-2 and class-0 frames; only
# class 1 is resampled here, the other two frames are used as they are.
downsampled = pd.concat([not_churn_downsampled, churn_2, churn])

# Resulting class counts (shown as the cell output).
downsampled['Price'].value_counts()

In [None]:
# Split the undersampled training frame back into features and target.
X_train_under = downsampled.drop(columns='Price')
y_train_under = downsampled['Price']

In [None]:
## Decision Tree
# DT - no imbalance process
# The resampling cells wrote a 'Price' column into X_train (X was an alias of
# it), so the target has to be removed before fitting.  errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError once the
# column is already gone.
X_train = X_train.drop(columns=['Price'], errors='ignore')

# Entropy-based tree on the original (unbalanced) training set.
dt = DecisionTreeClassifier(criterion = "entropy",random_state = 242)
dt.fit(X_train, y_train)
print(dt.tree_.max_depth)
print(dt.tree_.n_leaves)
dt_pred = dt.predict(X_test)

# Predicted-vs-actual table and per-class metrics on the test split.
print(pd.crosstab(dt_pred,y_test,rownames = ["Predicted"], colnames = ["Actual"]))
print(metrics.classification_report(y_test,dt_pred))

In [None]:
# Gini-based tree on the original (unbalanced) training set.
dt = DecisionTreeClassifier(criterion = "gini",random_state = 242)
dt.fit(X_train, y_train)
print(dt.tree_.max_depth)
print(dt.tree_.n_leaves)
dt_pred = dt.predict(X_test)

# Accuracy on the test split.
print(dt.score(X_test, y_test))

# Predicted-vs-actual table and per-class metrics.  (The table is built once
# and printed; an unprinted duplicate in the middle of the cell would be
# evaluated and silently discarded.)
print(pd.crosstab(dt_pred,y_test,rownames = ["Predicted"], colnames = ["Actual"]))
print(metrics.classification_report(y_test,dt_pred))

In [None]:
# Cost-complexity pruning path computed on the original training set.
path = dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Step plot of leaf impurity against effective alpha; the final entry of both
# arrays is left out of the plot.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)
plt.show()

In [None]:
# Refit the existing tree on the original training set.
# NOTE(review): no estimator parameter (e.g. ccp_alpha) is changed between the
# previous fit and this one, so the same settings are used again -- if pruning
# was intended, set ccp_alpha on `dt` before fitting.
dt.fit(X_train, y_train)
print(dt.tree_.max_depth)
print(dt.tree_.n_leaves)
dt_pred = dt.predict(X_test)

# Accuracy on the test split.
print(dt.score(X_test, y_test))

# Predicted-vs-actual table and per-class metrics.  (Built once and printed;
# an unprinted duplicate mid-cell would be evaluated and thrown away.)
print(pd.crosstab(dt_pred,y_test,rownames = ["Predicted"], colnames = ["Actual"]))
print(metrics.classification_report(y_test,dt_pred))

In [None]:
# X still refers to the frame that had 'Price' written into it, so the target
# is removed to leave feature columns only.  errors='ignore' makes the cell
# idempotent: running it again (after the column is gone) no longer raises
# KeyError.
X = X.drop(columns=['Price'], errors='ignore')

# Plain-text rendering of the fitted tree, using the feature column names.
text_tree = export_text(dt, feature_names = list(X.columns))
print(text_tree)

In [None]:
# Very large canvas so the individual nodes of the tree stay legible.
fig = plt.figure(figsize=(250,200))

# feature_names is passed as a plain list (as done for export_text): some
# scikit-learn releases reject a pandas Index for this parameter.
tree.plot_tree(dt, feature_names = list(X_train.columns),  filled=True)
plt.show()

In [None]:
## DT - Oversampling
# Refit the existing tree object on the oversampled training data, then
# evaluate it on the unchanged test split.
dt.fit(X_train_over, y_train_over)
print(dt.tree_.max_depth, dt.tree_.n_leaves, sep="\n")
dt_pred = dt.predict(X_test)

# Accuracy of the predictions above (what dt.score reports for a classifier).
print(accuracy_score(y_test, dt_pred))

# Predicted-vs-actual table followed by per-class metrics.
print(pd.crosstab(index=dt_pred, columns=y_test, rownames=["Predicted"], colnames=["Actual"]))
print(classification_report(y_test, dt_pred))

In [None]:
# Cost-complexity pruning path for the OVERSAMPLED training data.
# cost_complexity_pruning_path works on the data it is given, so passing the
# original X_train / y_train here would merely reproduce the plot of the
# non-resampled section.
path = dt.cost_complexity_pruning_path(X_train_over, y_train_over)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Step plot of leaf impurity against effective alpha; the final entry of both
# arrays is left out of the plot.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for oversampled training set")
plt.show()

In [None]:
## DT - Undersampling
# Refit the existing tree object on the undersampled training data, then
# evaluate it on the unchanged test split.
dt.fit(X_train_under, y_train_under)
print(dt.tree_.max_depth, dt.tree_.n_leaves, sep="\n")
dt_pred = dt.predict(X_test)

# Accuracy of the predictions above (what dt.score reports for a classifier).
print(accuracy_score(y_test, dt_pred))

# Predicted-vs-actual table followed by per-class metrics.
print(pd.crosstab(index=dt_pred, columns=y_test, rownames=["Predicted"], colnames=["Actual"]))
print(classification_report(y_test, dt_pred))

In [None]:
# Cost-complexity pruning path for the UNDERSAMPLED training data.
# cost_complexity_pruning_path works on the data it is given, so passing the
# original X_train / y_train here would merely reproduce the plot of the
# non-resampled section.
path = dt.cost_complexity_pruning_path(X_train_under, y_train_under)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Step plot of leaf impurity against effective alpha; the final entry of both
# arrays is left out of the plot.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for undersampled training set")
plt.show()

In [None]:
## Random Forest
### RF - No Imbalance Process
# A fixed random_state makes the forest -- and every later refit of this same
# `rf` object -- reproducible across kernel restarts, in line with the seeded
# split, resampling and decision trees above.
rf = RandomForestClassifier(random_state = 242)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Accuracy, predicted-vs-actual table and per-class metrics on the test split.
print(rf.score(X_test, y_test))
print(pd.crosstab(rf_pred,y_test,rownames = ["Predicted"], colnames = ["Actual"]))
print(metrics.classification_report(y_test,rf_pred))

In [None]:
# Feature importances of the current fit of `rf`, labelled with the columns
# of X and ordered from largest to smallest; the first 19 are printed.
influence = pd.Series(rf.feature_importances_, index = X.columns).sort_values(ascending = False)
print(influence.iloc[:19])

In [None]:
## RF - oversampling
# Refit the existing forest on the oversampled training data and evaluate it
# on the unchanged test split.
rf.fit(X_train_over, y_train_over)
rf_pred = rf.predict(X_test)

# Accuracy of the predictions above (what rf.score reports for a classifier).
print(accuracy_score(y_test, rf_pred))

# Predicted-vs-actual table followed by per-class metrics.
print(pd.crosstab(index=rf_pred, columns=y_test, rownames=["Predicted"], colnames=["Actual"]))
print(classification_report(y_test, rf_pred))

In [None]:
# X has already lost its 'Price' column in an earlier cell, so an
# unconditional drop raises KeyError when the notebook is run top to bottom.
# errors='ignore' removes the column only if it is still present.
X = X.drop(columns=['Price'], errors='ignore')

# Feature importances of the current fit of `rf`, largest first; the first 19
# are printed.
influence = pd.Series(rf.feature_importances_, index = X.columns)
influence = influence.sort_values(ascending = False)
print(influence[0:19])

In [None]:
## RF - Undersampling
# Refit the existing forest on the undersampled training data and evaluate it
# on the unchanged test split.
rf.fit(X_train_under, y_train_under)
rf_pred = rf.predict(X_test)

# Accuracy of the predictions above (what rf.score reports for a classifier).
print(accuracy_score(y_test, rf_pred))

# Predicted-vs-actual table followed by per-class metrics.
print(pd.crosstab(index=rf_pred, columns=y_test, rownames=["Predicted"], colnames=["Actual"]))
print(classification_report(y_test, rf_pred))

In [None]:
# Feature importances of the current fit of `rf`, labelled with the columns
# of X and ordered from largest to smallest; the first 19 are printed.
influence = pd.Series(rf.feature_importances_, index = X.columns).sort_values(ascending = False)
print(influence.iloc[:19])