In [None]:
# importing libraries
import pandas            as pd                          # data science essentials
import matplotlib.pyplot as plt                         # data visualization
import seaborn           as sns                         # enhanced data viz
from sklearn.model_selection import train_test_split    # train-test split
from sklearn.linear_model import LogisticRegression     # logistic regression
import statsmodels.formula.api as smf                   # logistic regression
from sklearn.metrics import confusion_matrix            # confusion matrix
from sklearn.metrics import roc_auc_score               # auc score
from sklearn.neighbors import KNeighborsClassifier      # KNN for classification
from sklearn.neighbors import KNeighborsRegressor       # KNN for regression
from sklearn.preprocessing import StandardScaler        # standard scaler
from sklearn.tree import DecisionTreeClassifier         # classification trees
from sklearn.tree import plot_tree                      # tree plots
from sklearn.model_selection import RandomizedSearchCV  # hyperparameter tuning
from sklearn.metrics import make_scorer                 # customizable scorer
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm


########################################
# loading data and setting display options
########################################
# reading the character dataset from the Excel workbook
GOT = pd.read_excel('./GOT_character_predictions.xlsx')

# raising pandas' print limits (rows, columns, line width, cell width)
for _option, _value in [('display.max_rows',     500),
                        ('display.max_columns',  500),
                        ('display.width',        1000),
                        ('display.max_colwidth', 100)]:
    pd.set_option(_option, _value)

# previewing the first five rows of the dataset
GOT.head(5)

In [None]:
# checking for missing values: info() lists each column's dtype and non-null count
GOT.info()

In [None]:
# checking how many characters belong to each house / title / culture (uncomment one line to inspect it)
#GOT['house'].value_counts(normalize=False, sort=True, ascending=False)
#GOT['title'].value_counts(normalize=False, sort=True, ascending=False)
#GOT['culture'].value_counts(normalize=False, sort=True, ascending=False)

In [None]:
# finding the Pearson correlation between each pair of numeric variables
# text columns (e.g. 'house', which is still un-encoded at this point) are
# excluded explicitly, because DataFrame.corr() does not drop non-numeric
# columns on its own in every pandas version and can raise on string data
GOT_corr = GOT.select_dtypes(include = ['number', 'bool']).corr(method = 'pearson')

# ranking every numeric variable by its correlation with the response (isAlive)
GOT_corr.loc[ : , 'isAlive'].round(decimals = 2).sort_values(ascending = False)

In [None]:
# setting up dummy variables for 'house' (new columns are prefixed with "dm_")
# the guard keeps this cell re-runnable: after the first run the 'house'
# column has been replaced by its dummies, and encoding it again would raise
# a KeyError
if 'house' in GOT.columns:
    GOT = pd.get_dummies(data = GOT,
                         columns = ['house'],
                         prefix = "dm",
                         drop_first = True)

# previewing the encoded dataset (first rows only instead of the whole frame)
GOT.head(n = 5)

In [None]:
# response variable: the isAlive column
GOT_target = GOT['isAlive']

# explanatory variables: every column except the response
GOT_data = GOT.drop(columns = 'isAlive')

# stratified 90/10 train-test split (stratifying on the response preserves
# its class balance in both partitions)
x_train, x_test, y_train, y_test = train_test_split(GOT_data,
                                                    GOT_target,
                                                    stratify     = GOT_target,
                                                    random_state = 219,
                                                    test_size    = 0.1)

# statsmodels' formula API needs predictors and response in a single frame
GOT_train = pd.concat([x_train, y_train], axis = 'columns')

In [None]:
# instantiating a statsmodels logistic regression of isAlive on four
# explanatory variables, using the merged training frame (GOT_train)
logit_full = smf.logit(formula   = """isAlive ~ book4_A_Feast_For_Crows + 
                                                numDeadRelations + 
                                                popularity +
                                                book1_A_Game_Of_Thrones""", 
                                                data = GOT_train)


# FITTING the model object (estimates the coefficients on the training data)
results_full = logit_full.fit()


# checking the results SUMMARY (last expression, so the notebook renders it)
results_full.summary2()

In [None]:
# explanatory variable sets carried over from the last session

# dictionary of candidate models: model name -> list of explanatory columns
candidate_dict = dict(

    # full model
    logit_full = [
        "book4_A_Feast_For_Crows",
        "numDeadRelations",
        "popularity",
        "book1_A_Game_Of_Thrones",
        "S.No",
        "dm_Night's Watch",
        "dm_House Targaryen",
    ],
)

In [None]:
####Logistic Regression####

# restricting the explanatory variables to the full candidate model
GOT_data   =  GOT.loc[ : , candidate_dict['logit_full']]
GOT_target =  GOT.loc[ : , 'isAlive']

# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
                                   GOT_data,
                                   GOT_target,
                                   test_size    = 0.1,
                                   random_state = 219,
                                   stratify     = GOT_target) # preserving balance


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            warm_start = False,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test)


# SCORING the results once and saving them for the comparison table
# built-in round() is used instead of the .round() method so this works
# whether the scorer returns a NumPy scalar or a plain Python float
# NOTE(review): the AUC below is computed from hard class predictions
# (predict), not from predicted probabilities -- confirm this is intended
logreg_train_score = round(logreg_fit.score(x_train, y_train), 4) # accuracy
logreg_test_score  = round(logreg_fit.score(x_test, y_test), 4)   # accuracy
logreg_auc_score   = round(roc_auc_score(y_true  = y_test,
                                         y_score = logreg_pred), 4)

print('Training ACCURACY:', logreg_train_score)
print('Testing  ACCURACY:', logreg_test_score)
print('Logistic AUC Score:', logreg_auc_score)

# unpacking the confusion matrix (ravel order: tn, fp, fn, tp)
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

In [None]:
####KNN Classification Model####

# train-test split (same seed, proportions and stratification as the
# logistic regression cell); x_train / x_test are left unscaled because the
# cells below reuse them
x_train, x_test, y_train, y_test = train_test_split(
            GOT_data,
            GOT_target,
            random_state = 219,
            test_size    = 0.1,
            stratify     = GOT_target)

# SCALING the explanatory variables for KNN only: KNN is distance-based, so
# large-valued columns would otherwise dominate the neighbor search
# the scaler is fitted on the training split alone so that no information
# from the testing split leaks into the model
knn_scaler     = StandardScaler()
x_train_scaled = knn_scaler.fit_transform(x_train)
x_test_scaled  = knn_scaler.transform(x_test)

# INSTANTIATING a KNN classification model (default number of neighbors)
knn_opt = KNeighborsClassifier()


# FITTING the scaled training data
knn_fit = knn_opt.fit(x_train_scaled, y_train)


# PREDICTING based on the scaled testing set
knn_pred = knn_fit.predict(x_test_scaled)


# SCORING the results once and saving them for the comparison table
# built-in round() is used instead of the .round() method so this works
# whether the scorer returns a NumPy scalar or a plain Python float
# NOTE(review): the AUC below is computed from hard class predictions
# (predict), not from predicted probabilities -- confirm this is intended
knn_train_score = round(knn_fit.score(x_train_scaled, y_train), 4)
knn_test_score  = round(knn_fit.score(x_test_scaled, y_test), 4)
knn_auc_score   = round(roc_auc_score(y_true  = y_test,
                                      y_score = knn_pred), 4)

print('Training ACCURACY:', knn_train_score)
print('Testing  ACCURACY:', knn_test_score)
print('KNN AUC Score    :', knn_auc_score)

# unpacking the confusion matrix (ravel order: tn, fp, fn, tp)
knn_tn, \
knn_fp, \
knn_fn, \
knn_tp = confusion_matrix(y_true = y_test, y_pred = knn_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {knn_tn}
False Positives: {knn_fp}
False Negatives: {knn_fn}
True Positives : {knn_tp}
""")

In [None]:
####Classification Trees####

# INSTANTIATING a classification tree object, pruned through a depth limit
# and a minimum leaf size
tree_pruned = DecisionTreeClassifier(max_depth=8,
                                     min_samples_leaf=25,
                                     random_state = 219)


# FITTING the training data
tree_pruned_fit = tree_pruned.fit(x_train, y_train)


# PREDICTING on new data
tree_pruned_pred = tree_pruned_fit.predict(x_test)


# SCORING the model once and saving the results for the comparison table
# built-in round() is used instead of the .round() method so this works
# whether the scorer returns a NumPy scalar or a plain Python float
# NOTE(review): the AUC below is computed from hard class predictions
# (predict), not from predicted probabilities -- confirm this is intended
pruned_tree_train_score = round(tree_pruned_fit.score(x_train, y_train), 4) # accuracy
pruned_tree_test_score  = round(tree_pruned_fit.score(x_test, y_test), 4)   # accuracy
pruned_tree_auc_score   = round(roc_auc_score(y_true  = y_test,
                                              y_score = tree_pruned_pred), 4) # auc

print('Training ACCURACY:', pruned_tree_train_score)
print('Testing  ACCURACY:', pruned_tree_test_score)
print('AUC Score        :', pruned_tree_auc_score)

# unpacking the confusion matrix (ravel order: tn, fp, fn, tp)
pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = tree_pruned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")

In [None]:
####Random Forest###

# INSTANTIATING a random forest model with explicitly chosen hyperparameters
# (the variable keeps its "default" name because later cells reference it)
rf_default = RandomForestClassifier(n_estimators     = 500,
                                    criterion        = 'entropy',
                                    max_depth        = 8,
                                    min_samples_leaf = 10,
                                    bootstrap        = True,
                                    warm_start       = False,
                                    random_state     = 219)

# FITTING the training data
rf_default_fit = rf_default.fit(x_train, y_train)


# PREDICTING based on the testing set
rf_default_fit_pred = rf_default_fit.predict(x_test)


# SCORING the results once and saving accuracy and AUC for the comparison table
# built-in round() is used instead of the .round() method so this works
# whether the scorer returns a NumPy scalar or a plain Python float
# NOTE(review): the AUC below is computed from hard class predictions
# (predict), not from predicted probabilities -- confirm this is intended
rf_train_acc = round(rf_default_fit.score(x_train, y_train), 4)
rf_test_acc  = round(rf_default_fit.score(x_test, y_test), 4)
rf_auc       = round(roc_auc_score(y_true  = y_test,
                                   y_score = rf_default_fit_pred), 4)

print('Training ACCURACY:', rf_train_acc)
print('Testing  ACCURACY:', rf_test_acc)
print('AUC Score        :', rf_auc)

# unpacking the confusion matrix (ravel order: tn, fp, fn, tp)
rf_tn, \
rf_fp, \
rf_fn, \
rf_tp = confusion_matrix(y_true = y_test, y_pred = rf_default_fit_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")

In [None]:
####Gradient Boosted Model####


# INSTANTIATING with hard-coded hyperparameters
# NOTE(review): presumably these come from an earlier tuning run that is not
# part of this notebook -- confirm
gbm_tuned = GradientBoostingClassifier(learning_rate = 0.1,
                                       max_depth     = 2,
                                       n_estimators  = 350,
                                       warm_start    = True,
                                       random_state  = 219)


# FITTING the training data ONLY
# fitting on the full dataset would put the testing rows into the training
# data, so the testing accuracy / AUC reported below would be inflated
# NOTE(review): the summary cells further down hard-code GBM as the final
# model -- re-check that conclusion against the re-run scores
gbm_tuned_fit = gbm_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test)


# SCORING the results once and saving accuracy and AUC for the comparison table
# built-in round() is used instead of the .round() method so this works
# whether the scorer returns a NumPy scalar or a plain Python float
# NOTE(review): the AUC below is computed from hard class predictions
# (predict), not from predicted probabilities -- confirm this is intended
gbm_train_acc = round(gbm_tuned_fit.score(x_train, y_train), 4)
gbm_test_acc  = round(gbm_tuned_fit.score(x_test, y_test), 4)
gbm_auc       = round(roc_auc_score(y_true  = y_test,
                                    y_score = gbm_tuned_pred), 4)

print('Training ACCURACY:', gbm_train_acc)
print('Testing  ACCURACY:', gbm_test_acc)
print('AUC Score        :', gbm_auc)

# unpacking the confusion matrix (ravel order: tn, fp, fn, tp)
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, y_pred = gbm_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")

In [None]:
# collecting each model's results; every list follows the same model order:
# Logistic, KNN, Tree, Random Forest, GBM
_train_scores = [logreg_train_score, knn_train_score, pruned_tree_train_score,
                 rf_train_acc, gbm_train_acc]

_test_scores  = [logreg_test_score, knn_test_score, pruned_tree_test_score,
                 rf_test_acc, gbm_test_acc]

_auc_scores   = [logreg_auc_score, knn_auc_score, pruned_tree_auc_score,
                 rf_auc, gbm_auc]

# confusion matrices stored as (tn, fp, fn, tp) tuples
_matrices     = [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                 (knn_tn, knn_fp, knn_fn, knn_tp),
                 (pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp),
                 (rf_tn, rf_fp, rf_fn, rf_tp),
                 (gbm_tuned_tn, gbm_tuned_fp, gbm_tuned_fn, gbm_tuned_tp)]

# building the comparison table directly as a DataFrame (one row per model)
model_performance = pd.DataFrame({
    'Model Name'        : ['Logistic', 'KNN', 'Tree', 'Random Forest', 'GBM'],
    'Training Accuracy' : _train_scores,
    'Testing Accuracy'  : _test_scores,
    'AUC Score'         : _auc_scores,
    'Confusion Matrix'  : _matrices,
    'Final Model'       : ['','','','','✓'],
})

# checking the results
model_performance

In [None]:
# figures quoted in the written summary, derived from the GBM results
# built-in round() works for both NumPy scalars and plain Python floats
gbm_gap         = round(gbm_test_acc - gbm_train_acc, 2)   # train-test gap

# sensitivity = tp / (tp + fn); specificity = tn / (tn + fp)
# both are rounded to two decimals so they print consistently
gbm_sensitivity = round(float(gbm_tuned_tp / (gbm_tuned_tp + gbm_tuned_fn)), 2)
gbm_specificity = round(float(gbm_tuned_tn / (gbm_tuned_tn + gbm_tuned_fp)), 2)

# plain ints so the tuple prints as numbers rather than NumPy scalar reprs
gbm_matrix      = tuple(int(count) for count in (gbm_tuned_tn, gbm_tuned_fp,
                                                 gbm_tuned_fn, gbm_tuned_tp))

print(f"""\n\n\n {'*' * 95}""")

#final model output
print(f"""
\tIn this exercise, I used five models which are Logistic, KNN, Tree, Random Forest, and GBM to seek the fittest model\n 
toward the given GOT data, although the train-test gap doesn't have a huge difference that is all around \033[1m"{gbm_gap}"\033[0m.\n
However, the AUC Score has a significant disparity, since the \033[1mGradient Boosted Model\033[0m suit better for the data and has the\n
highest \033[1mAUC Score of {gbm_auc}\033[0m I chose this as my final model. Additionally, sensitivity and specificity remain on the great level\n 
which is \033[1m{gbm_sensitivity} & {gbm_specificity}\033[0m respectively, which means that there is only a small likelihood to go to the wrong prediction. Even though\n
it is possible to keep increasing the Accuracy and AUC Score by tuning the hyperparameter option, an extremely positive score\n
appear could be a problem that ought to be aware of such as bias or inappropriate balance. Overall, the performance of\n
\033[1mConfusion Matrix\033[0m in GBM model only shows a few errors \033[1m{gbm_matrix}\033[0m which is quite good in our forecasting. """)
print(f"""\n\n\n {'*' * 95}""")