In [0]:
# Imports ---------------------------------------------------------------------
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import tree, metrics
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.ensemble import AdaBoostClassifier

# Model Class -----------------------------------------------------------------
class Model_SVM():
    """Support-vector classifier wrapper tuned with a cross-validated grid search.

    The constructor flattens the per-entity feature rows of a collection of
    documents into a training matrix; ``train_model`` fits an ``SVC`` through
    ``GridSearchCV``; ``predict`` and ``test`` evaluate the fitted model on
    other documents.
    """

    # Class variables.
    # These are only fallbacks for attribute lookup; __init__ gives every
    # instance its own values.
    training_data = None
    training_labels = None
    model_name = None
    model = None
    for_csv = []

    def __init__(self, data):
        """Collect training rows and gold labels from ``data``.

        Args:
            data: Iterable of document-like objects exposing
                ``get_features()`` (rows whose first element is dropped
                before training) and ``get_gold_labels()``.
        """
        self.training_data, self.training_labels = self._collect(data)
        self.grid_results = []
        # Shadow the class-level list so instances never share one buffer.
        self.for_csv = []

    @staticmethod
    def _collect(docs):
        """Flatten documents into parallel lists of feature rows and labels."""
        rows = []
        labels = []
        for doc in docs:
            # row[0] is dropped; only the remaining columns are used as features.
            rows.extend(row[1:] for row in doc.get_features())
            labels.extend(doc.get_gold_labels())
        return rows, labels

    def train_model(self, parameters):
        """Grid-search ``gamma`` and ``C`` for an ``SVC`` and keep the best one.

        Args:
            parameters: Keyword arguments forwarded to ``SVC``. The searched
                ``gamma`` and ``C`` values come from the grid below.

        Side effects:
            Sets ``self.sv_grid`` (the fitted search), ``self.grid_results``
            (a summary dict) and ``self.model`` (the best estimator).
        """
        clf = SVC(**parameters)
        # Powers of ten from 1e-2 to 1e2 for both searched hyper-parameters.
        svm_log_range = [10**x for x in range(-2, 2+1)]
        grid_params = {'gamma': svm_log_range, 'C': svm_log_range}
        self.sv_grid = GridSearchCV(clf, grid_params, cv=3, verbose=3, n_jobs=4)
        print("Training SVM...2")
        self.sv_grid.fit(self.training_data, self.training_labels)

        self.grid_results = {"best_score": self.sv_grid.best_score_,
                             "best_params": self.sv_grid.best_params_,
                             "best_estimator": self.sv_grid.best_estimator_,
                             "final_test_accuracy": 0}
        self.model = self.sv_grid.best_estimator_

    def predict(self, doc):
        """Print a confusion matrix and classification report for ``doc``.

        Args:
            doc: Iterable of document-like objects (same protocol as the
                constructor's ``data``).

        Returns:
            ``[[None, 0], [None, 0]]`` -- a placeholder for the best
            protagonist / antagonist candidates; per-entity probability
            ranking is not implemented.
        """
        test_data, test_labels = self._collect(doc)
        y_pred = self.model.predict(test_data)
        print("SVM Confusion Matrix")
        print(metrics.confusion_matrix(y_true=test_labels, y_pred=y_pred))
        print()
        print("SVM Classification Report")
        print(metrics.classification_report(test_labels, y_pred))
        return [[None, 0], [None, 0]]

    def test(self, test_data):
        """Return ``self.model.score`` over all rows of ``test_data``."""
        data, labels = self._collect(test_data)
        return self.model.score(data, labels)

class Model_NB():
    """Gaussian naive Bayes wrapper over per-entity document features.

    The constructor flattens the feature rows of a collection of documents
    into a training matrix; ``train_model`` fits a ``GaussianNB``;
    ``predict`` and ``test`` evaluate the fitted model on other documents.
    """

    # Class variables.
    # These are only fallbacks for attribute lookup; __init__ gives every
    # instance its own values.
    training_data = None
    training_labels = None
    model_name = None
    model = None
    for_csv = []

    def __init__(self, data):
        """Collect training rows and gold labels from ``data``.

        Args:
            data: Iterable of document-like objects exposing
                ``get_features()`` (rows whose first element is dropped
                before training) and ``get_gold_labels()``.
        """
        self.training_data, self.training_labels = self._collect(data)
        # Shadow the class-level list so instances never share one buffer.
        self.for_csv = []

    @staticmethod
    def _collect(docs):
        """Flatten documents into parallel lists of feature rows and labels."""
        rows = []
        labels = []
        for doc in docs:
            # row[0] is dropped; only the remaining columns are used as features.
            rows.extend(row[1:] for row in doc.get_features())
            labels.extend(doc.get_gold_labels())
        return rows, labels

    def train_model(self, parameters):
        """Fit a ``GaussianNB`` built from ``parameters`` and store it in ``self.model``."""
        clf = GaussianNB(**parameters)
        clf.fit(self.training_data, self.training_labels)
        self.model = clf

    def predict(self, doc):
        """Print a confusion matrix and classification report for ``doc``.

        Args:
            doc: Iterable of document-like objects (same protocol as the
                constructor's ``data``).

        Returns:
            ``[[None, 0], [None, 0]]`` -- a placeholder for the best
            protagonist / antagonist candidates; per-entity probability
            ranking is not implemented.
        """
        test_data, test_labels = self._collect(doc)
        y_pred = self.model.predict(test_data)
        print("NB Confusion Matrix")
        print(metrics.confusion_matrix(y_true=test_labels, y_pred=y_pred))
        print()
        print("NB Classification Report")
        print(metrics.classification_report(test_labels, y_pred))
        return [[None, 0], [None, 0]]

    def test(self, test_data):
        """Return ``self.model.score`` over all rows of ``test_data``."""
        data, labels = self._collect(test_data)
        return self.model.score(data, labels)

class Model_MLP():
    """Multi-layer perceptron wrapper tuned with a cross-validated grid search.

    The constructor flattens the per-entity feature rows of a collection of
    documents into a training matrix; ``train_model`` fits an
    ``MLPClassifier`` through ``GridSearchCV``; ``predict`` and ``test``
    evaluate the fitted model on other documents.
    """

    # Class variables.
    # These are only fallbacks for attribute lookup; __init__ gives every
    # instance its own values.
    training_data = None
    training_labels = None
    model_name = None
    model = None
    for_csv = []

    def __init__(self, data):
        """Collect training rows and gold labels from ``data``.

        Args:
            data: Iterable of document-like objects exposing
                ``get_features()`` (rows whose first element is dropped
                before training) and ``get_gold_labels()``.
        """
        self.training_data, self.training_labels = self._collect(data)
        self.grid_results = []
        # Shadow the class-level list so instances never share one buffer.
        self.for_csv = []

    @staticmethod
    def _collect(docs):
        """Flatten documents into parallel lists of feature rows and labels."""
        rows = []
        labels = []
        for doc in docs:
            # row[0] is dropped; only the remaining columns are used as features.
            rows.extend(row[1:] for row in doc.get_features())
            labels.extend(doc.get_gold_labels())
        return rows, labels

    def train_model(self, parameters):
        """Grid-search ``alpha`` and ``hidden_layer_sizes`` and keep the best MLP.

        Args:
            parameters: Keyword arguments forwarded to ``MLPClassifier``.

        Side effects:
            Sets ``self.grid_results`` (a summary dict) and ``self.model``
            (the best estimator found by the search).
        """
        clf = MLPClassifier(**parameters)
        # alpha: 1e-1 .. 1e-9; hidden_layer_sizes: a single layer of 10..14 units.
        grid_params = {'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes': np.arange(10, 15)}
        mlp_grid = GridSearchCV(clf, grid_params, cv=3, verbose=3, n_jobs=4)
        print("Training MLP...")
        mlp_grid.fit(self.training_data, self.training_labels)

        self.grid_results = {"best_score": mlp_grid.best_score_,
                             "best_params": mlp_grid.best_params_,
                             "best_estimator": mlp_grid.best_estimator_,
                             "final_test_accuracy": 0}
        self.model = mlp_grid.best_estimator_

    def predict(self, doc):
        """Print a confusion matrix and classification report for ``doc``.

        Args:
            doc: Iterable of document-like objects (same protocol as the
                constructor's ``data``).

        Returns:
            ``[[None, 0], [None, 0]]`` -- the same placeholder pair that
            ``Model_SVM.predict`` and ``Model_NB.predict`` return; per-entity
            probability ranking is not implemented.
        """
        test_data, test_labels = self._collect(doc)
        y_pred = self.model.predict(test_data)
        print("MLP Confusion Matrix")
        print(metrics.confusion_matrix(y_true=test_labels, y_pred=y_pred))
        print()
        print("MLP Classification Report")
        print(metrics.classification_report(test_labels, y_pred))
        return [[None, 0], [None, 0]]

    def test(self, test_data):
        """Return ``self.model.score`` over all rows of ``test_data``."""
        data, labels = self._collect(test_data)
        return self.model.score(data, labels)

class Model_Tree():
    """Decision-tree classifier wrapper over per-entity document features.

    The constructor flattens the feature rows of a collection of documents
    into a training matrix; ``train_model`` fits a decision tree;
    ``predict`` and ``test`` evaluate the fitted model on other documents.
    """

    # Class variables.
    # These are only fallbacks for attribute lookup; __init__ gives every
    # instance its own values.
    training_data = None
    training_labels = None
    model_name = None
    model = None
    for_csv = []

    def __init__(self, data):
        """Collect training rows and gold labels from ``data``.

        Args:
            data: Iterable of document-like objects exposing
                ``get_features()`` (rows whose first element is dropped
                before training) and ``get_gold_labels()``.
        """
        self.training_data, self.training_labels = self._collect(data)
        # Shadow the class-level list so instances never share one buffer.
        self.for_csv = []

    @staticmethod
    def _collect(docs):
        """Flatten documents into parallel lists of feature rows and labels."""
        rows = []
        labels = []
        for doc in docs:
            # row[0] is dropped; only the remaining columns are used as features.
            rows.extend(row[1:] for row in doc.get_features())
            labels.extend(doc.get_gold_labels())
        return rows, labels

    def train_model(self, parameters):
        """Fit a decision tree built from ``parameters`` and store it in ``self.model``.

        ``sklearn.tree`` has no ``AdaBoostClassifier`` (that estimator lives in
        ``sklearn.ensemble``), so the tree estimator is used here.
        """
        clf = tree.DecisionTreeClassifier(**parameters)
        clf.fit(self.training_data, self.training_labels)
        self.model = clf

    def predict(self, doc):
        """Print a confusion matrix and classification report for ``doc``.

        Args:
            doc: Iterable of document-like objects (same protocol as the
                constructor's ``data``).

        Returns:
            ``[[None, 0], [None, 0]]`` -- the same placeholder pair that
            ``Model_SVM.predict`` and ``Model_NB.predict`` return; per-entity
            probability ranking is not implemented.
        """
        test_data, test_labels = self._collect(doc)
        y_pred = self.model.predict(test_data)
        print("Tree Confusion Matrix")
        print(metrics.confusion_matrix(y_true=test_labels, y_pred=y_pred))
        print()
        print("Tree Classification Report")
        print(metrics.classification_report(test_labels, y_pred))
        return [[None, 0], [None, 0]]

    def test(self, test_data):
        """Return ``self.model.score`` over all rows of ``test_data``."""
        data, labels = self._collect(test_data)
        return self.model.score(data, labels)
    
    
class Model_AdaBoost():
    """AdaBoost classifier wrapper over per-entity document features.

    The constructor flattens the feature rows of a collection of documents
    into a training matrix; ``train_model`` fits an ``AdaBoostClassifier``;
    ``predict`` and ``test`` evaluate the fitted model on other documents.
    """

    # Class variables.
    # These are only fallbacks for attribute lookup; __init__ gives every
    # instance its own values.
    training_data = None
    training_labels = None
    model_name = None
    model = None
    for_csv = []

    def __init__(self, data):
        """Collect training rows and gold labels from ``data``.

        Args:
            data: Iterable of document-like objects exposing
                ``get_features()`` (rows whose first element is dropped
                before training) and ``get_gold_labels()``.
        """
        self.training_data, self.training_labels = self._collect(data)
        self.grid_results = []
        # Shadow the class-level list so instances never share one buffer.
        self.for_csv = []

    @staticmethod
    def _collect(docs):
        """Flatten documents into parallel lists of feature rows and labels."""
        rows = []
        labels = []
        for doc in docs:
            # row[0] is dropped; only the remaining columns are used as features.
            rows.extend(row[1:] for row in doc.get_features())
            labels.extend(doc.get_gold_labels())
        return rows, labels

    def train_model(self, parameters):
        """Fit an ``AdaBoostClassifier`` built from ``parameters`` and store it in ``self.model``."""
        clf = AdaBoostClassifier(**parameters)
        print("Training Adaboost...")
        clf.fit(self.training_data, self.training_labels)
        self.model = clf

    def predict(self, doc):
        """Print a confusion matrix and classification report for ``doc``.

        Args:
            doc: Iterable of document-like objects (same protocol as the
                constructor's ``data``).

        Returns:
            ``[[None, 0], [None, 0]]`` -- the same placeholder pair that
            ``Model_SVM.predict`` and ``Model_NB.predict`` return; per-entity
            probability ranking is not implemented.
        """
        test_data, test_labels = self._collect(doc)
        y_pred = self.model.predict(test_data)
        print("AdaBoost Confusion Matrix")
        print(metrics.confusion_matrix(y_true=test_labels, y_pred=y_pred))
        print()
        print("AdaBoost Classification Report")
        print(metrics.classification_report(test_labels, y_pred))
        return [[None, 0], [None, 0]]

    def test(self, test_data):
        """Return ``self.model.score`` over all rows of ``test_data``."""
        data, labels = self._collect(test_data)
        return self.model.score(data, labels)

In [0]:
# Installs and Downloads ------------------------------------------------------
# Note: spaCy installation is version 2.1.0.
import nltk
# import spacy
# import neuralcoref
import re
from collections import defaultdict
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tabulate import tabulate
import numpy as np

#nltk.download("popular")
#nltk.download("vader_lexicon")

# Document Class --------------------------------------------------------------
class Document():
    """A Project Gutenberg book reduced to per-entity features.

    For every named entity found in the text the document stores a
    normalised coreference/mention score (``corefs``) and an average VADER
    compound sentiment (``sentiments``). Gold protagonist / antagonist names
    are kept (lower-cased, whitespace-normalised) to derive training labels.
    """

    # Class variables.
    title = None
    text = None
    protagonist_g = None
    antagonist_g = None
    ner_tags = None
    sentiments = None
    corefs = None

    def __init__(self, title, file, protagonist, antagonist):
        """Read ``file`` and compute all features.

        Args:
            title: Title of the book; printed by ``preds``.
            file: Path of the UTF-8 text file to read.
            protagonist: Gold protagonist name.
            antagonist: Gold antagonist name.
        """
        self.title = title
        self.protagonist_g = self.entitiy_cleaner(protagonist)
        self.antagonist_g = self.entitiy_cleaner(antagonist)
        self.open_file(file)
        self.generate_features()

    def open_file(self, file):
        """Load the book body into ``self.text`` as paragraphs of sentences.

        Lines before the Gutenberg START marker are skipped and reading stops
        at the END marker. A line without any word character ends the current
        paragraph. ``self.text`` becomes a list with one list of sentences
        per paragraph.

        Raises:
            Exception: Any error raised while reading or tokenising is
                reported on stdout and then re-raised, so the failure
                surfaces here instead of later when ``self.text`` is used.
        """
        try:
            all_lines = []
            line = ""
            with open(file, encoding="utf8") as handle:
                header_end = False
                for l in handle:
                    if "*** END OF THIS PROJECT GUTENBERG EBOOK" in l:
                        break
                    if "*** START OF THIS PROJECT GUTENBERG EBOOK" in l:
                        header_end = True
                    if not header_end:
                        continue
                    if re.search(r"\w", l) is None:
                        # Blank separator line: close the paragraph being built.
                        all_lines.append(re.sub(r"\s+", " ", line.strip()))
                        line = ""
                    else:
                        line += l.strip() + " "
            # Keep a final paragraph that is not followed by a blank line
            # before the END marker / end of file.
            if line.strip():
                all_lines.append(re.sub(r"\s+", " ", line.strip()))
            self.text = [list(nltk.sent_tokenize(par)) for par in all_lines]
        except Exception as e:
            print("Error: Error opening file.")
            print(e)
            raise

    def generate_features(self):
        """Populate ``ner_tags``, ``sentiments`` and ``corefs`` from ``self.text``.

        * ``ner_tags``: cleaned PERSON / ORGANIZATION chunks from NLTK.
        * ``sentiments``: per entity, the mean VADER compound score of the
          sentences in which it was tagged.
        * ``corefs``: per entity, a mention score built from spaCy entities
          and neuralcoref clusters, normalised so that all scores sum to 1.
        """
        # Imported here rather than at module level so that the class can be
        # defined (and existing pickles loaded) where spaCy / neuralcoref are
        # not installed; they are only needed to build features.
        import spacy
        import neuralcoref

        self.ner_tags = set()
        self.sentiments = {}
        self.corefs = defaultdict(int)
        nlp = spacy.load("en_core_web_sm")
        neuralcoref.add_to_pipe(nlp)

        # Iterate through paragraphs and their sentences.
        for par in self.text:
            doc = ""
            for sent in par:
                if len(doc) > 0:
                    doc += " "
                doc += sent

                # Get pos and ner tags.
                tokens = nltk.word_tokenize(sent)
                tokens = nltk.pos_tag(tokens)
                tokens = nltk.chunk.ne_chunk(tokens)
                for chunk in tokens:
                    if not hasattr(chunk, "label"):
                        continue
                    if chunk.label() in ("PERSON", "ORGANIZATION"):
                        # NOTE(review): tokens are joined without a separator,
                        # so multi-word names collapse ("sherlockholmes") and
                        # will not equal spaCy's spaced entity text -- confirm
                        # whether " ".join was intended.
                        ent = self.entitiy_cleaner("".join(c[0] for c in chunk))
                        self.ner_tags.add(ent)
                        self.sentiments.setdefault(ent, []).append(sent)

            # Do coreference resolution on the whole paragraph.
            crefs = defaultdict(int)
            doc = nlp(doc)
            for ent in doc.ents:
                key = self.entitiy_cleaner(ent)
                if ent._.is_coref:
                    crefs[key] += len(list(ent._.coref_cluster))
                else:
                    crefs[key] += 1
            # Longest names first.
            for k in sorted(crefs, key=len, reverse=True):
                # Only keep spaCy entities that overlap an NLTK entity.
                if not any(k in n or n in k for n in self.ner_tags):
                    continue
                merged = False
                for k_m in crefs:
                    if k in k_m:
                        self.corefs[k_m] += crefs[k]/2
                        merged = True
                        break
                    if k_m in k:
                        self.corefs[k] += crefs[k_m]/2
                        merged = True
                        break
                if not merged:
                    self.corefs[k] += 1

        # Normalise the scores so they sum to 1 (no-op when there are none).
        total = 0.0
        for ent in self.corefs:
            total += self.corefs[ent]
        for ent in self.corefs:
            self.corefs[ent] = self.corefs[ent]/total

        # Do sentiment analysis: mean compound score per entity.
        sid = SentimentIntensityAnalyzer()
        for ent in self.sentiments:
            s = 0
            for sent in self.sentiments[ent]:
                s += sid.polarity_scores(sent)["compound"]
            self.sentiments[ent] = s/len(self.sentiments[ent])

    def entitiy_cleaner(self, ent):
        """Return ``str(ent)`` stripped, whitespace-collapsed and lower-cased."""
        return re.sub(r"\s+", " ", str(ent).strip()).lower()

    def _scored_entities(self):
        """Yield the entities that have both a sentiment and a coref score."""
        for ent in self.ner_tags:
            if ent in self.sentiments and ent in self.corefs:
                yield ent

    def get_features(self):
        """Return one ``[name, coref_score, sentiment]`` row per scored entity."""
        return [[str(ent), self.corefs[ent], self.sentiments[ent]]
                for ent in self._scored_entities()]

    def get_gold_labels(self):
        """Return one label per row of ``get_features`` (same order).

        ``"p"`` when the entity and the gold protagonist contain one another,
        otherwise ``"a"`` for the same test against the gold antagonist,
        otherwise ``"n"``.
        """
        labels = []
        for ent in self._scored_entities():
            if ent in self.protagonist_g or self.protagonist_g in ent:
                label = "p"
            elif ent in self.antagonist_g or self.antagonist_g in ent:
                label = "a"
            else:
                label = "n"
            labels.append(label)
        return labels

    def preds(self):
        """Baseline guess: the two highest-scoring entities are the leads.

        Entities whose name contains "gutenberg" are ignored. The entity with
        the highest coref score is taken as protagonist and the runner-up as
        antagonist; each is compared with the gold name by substring match in
        either direction. Prints the title.

        Returns:
            ``[protagonist_correct, antagonist_correct]`` as booleans. A role
            with no candidate counts as incorrect.
        """
        # Rank by score (ascending); the scores are numbers, not sequences.
        cref_keys = [k for k in sorted(self.corefs, key=lambda k: self.corefs[k])
                     if "gutenberg" not in k]

        p = cref_keys[-1] if len(cref_keys) >= 1 else None
        a = cref_keys[-2] if len(cref_keys) >= 2 else None

        def matches(gold, guess):
            return guess is not None and (gold in guess or guess in gold)

        pg = matches(self.protagonist_g, p)
        ag = matches(self.antagonist_g, a)
        print(self.title)

        return [pg, ag]

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive (interactive
# authorization is requested on first use).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [8]:
import pickle

# Directory that holds the pickled, already-trained model wrappers.
MODEL_DIR = "drive/My Drive/CSCE_5215_ML/Final_Project_Files/Models"


def load_model(name):
    """Unpickle and return the model wrapper stored as ``<MODEL_DIR>/<name>.pickle``.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files from a
    trusted source. Unpickling also requires the module that defined the
    model classes to be importable.
    """
    with open(MODEL_DIR + "/" + name + ".pickle", "rb") as handle:
        return pickle.load(handle)


# Pre-bind every name so they all exist even if a load below fails part-way.
model_ab = None
model_mlp = None
model_nb = None
model_svm = None
model_tree = None

model_ab = load_model("ab")
model_mlp = load_model("mlp")
model_nb = load_model("nb")
model_svm = load_model("svm")
model_tree = load_model("tree")

ModuleNotFoundError: ignored

In [5]:
# Installs and Downloads ------------------------------------------------------
import pandas as pd
from class_document import Document
# from class_model import Model_SVM, Model_NB, Model_MLP, Model_Tree, Model_AdaBoost
import pickle
import os
from sklearn.model_selection import train_test_split

# Get Data --------------------------------------------------------------------
all_docs = []

# One-off dataset build, kept for reference: creates a Document per row of the
# label CSV and pickles it into Dataset/.
# with open("data_labels.csv", "rb") as file:
# #with open("data_labels_2.csv", "rb") as file:
#     labels = pd.read_csv(file)
#     for i, row in labels.iterrows():
#         print(i)
#         try:
#             all_docs.append(Document(row["Book"], "Files/"+row["file"], row["Protagonist"], row["Antagonist"]))
# #            with open("Dataset/%s.pickle"%(row["Book"]), "wb") as file:
#             with open("Dataset/%s.pickle"%(row["Book"]), "wb") as file:
#                 pickle.dump(all_docs[-1], file)
#         except Exception as e:
#             print("Error: No file found "+row["file"]+".")
#             print(e)
#             print()
#         print("File %d Complete"%(i))

# Load every pickled Document from Dataset/.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# which would make the train/test split below differ between runs/machines.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
for file in sorted(os.listdir("Dataset")):
    filename = os.fsdecode(file)
    if not filename.endswith(".pickle"):
        continue
    print(filename)
    try:
        with open("Dataset/" + filename, "rb") as picklefile:
            all_docs.append(pickle.load(picklefile))
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the rest.
        print(e)

# The first int(0.8 * n) + 1 documents train the models; the rest are held out.
split_at = int(len(all_docs)*0.8)+1
train_data = all_docs[:split_at]
test_data = all_docs[split_at:]

# Train Models ---------------------------------------------------------------
# The training sections below are commented out: each one builds a model
# wrapper from train_data, trains it and pickles it under Models/. The live
# code only evaluates, so model_svm, model_nb, model_mlp, model_tree and
# model_ab must already be defined (e.g. loaded from pickles) before this runs.
# params_svm = {
#             "probability": True,
#             "kernel": "rbf",
#             "class_weight": "balanced",
#             "gamma": "auto",
#             # "verbose": True
#             }
# model_svm = Model_SVM(train_data)
# model_svm.train_model(params_svm)
# with open("Models/svm.pickle", "wb") as file:
#     pickle.dump(model_svm, file)
    
# params_nb = {
#             }
# model_nb = Model_NB(train_data)
# model_nb.train_model(params_nb)
# with open("Models/nb.pickle", "wb") as file:
#     pickle.dump(model_nb, file)
    
# params_mlp = {
#         "verbose": True
#             }
# model_mlp = Model_MLP(train_data)
# model_mlp.train_model(params_mlp)
# with open("Models/mlp.pickle", "wb") as file:
#     pickle.dump(model_mlp, file)
    
# params_tree = {
#             }
# model_tree = Model_Tree(train_data)
# model_tree.train_model(params_tree)
# with open("Models/tree.pickle", "wb") as file:
#     pickle.dump(model_tree, file)
    
# Evaluate each model on the held-out documents; predict() prints a confusion
# matrix and a classification report for the model.
print("SVM:")
# results_svm = model_svm.test(test_data)

predict_svm = model_svm.predict(test_data)
print()
print("NB:")
# results_nb = model_nb.test(test_data)
predict_nb = model_nb.predict(test_data)
print()
print("MLP:")
# results_mlp = model_mlp.test(test_data)
predict_mlp = model_mlp.predict(test_data)

print("Tree:")
# print(model_tree.test(test_data))
predict_tree = model_tree.predict(test_data)

# Commented-out AdaBoost training (10 estimators), same pattern as above.
# params_ab = { "n_estimators":10
#             }
# model_ab = Model_AdaBoost(train_data)
# model_ab.train_model(params_ab)
# with open("Models/ab.pickle", "wb") as file:
#     pickle.dump(model_ab, file)
    
print("Adaboost:")
# results_ab = model_ab.test(test_data)
predict_ab = model_ab.predict(test_data)
print()

ModuleNotFoundError: ignored