# NLP PROJECT - ONTOLOGY OF THE THREE KINGDOMS

In [None]:
#Importing libraries for NLP techniques
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from string import punctuation
import collections

#Utils
from pathlib import Path
import numpy as np
import os

# Defining functions

In [None]:
# Easy-to-use character-name list generator working from a specific noisy file.
# KNOWN ISSUE - some names may carry a trailing " "; they are written out as-is.
def generateNamesList():
    """Build the character-name lists from "Lists/Main Characters.txt".

    The first line of the file is skipped. Every other line is split on
    commas; the last field is dropped when it contains a space, and lines
    whose first field is a bare newline are ignored. The first field is then
    classified:

    * it contains "lady", "empress" or "diaochan": added to the lady list;
    * it contains "(": the text before "(" is taken as the name and the text
      between the parentheses as the courtesy name. Both go to the alias
      list and the all-names list, the name goes to the men list, and
      ``aliasDict[name] = courtesy``;
    * otherwise: added to the men list.

    After parsing, the last entry of every non-empty list is removed (the
    original code calls this "removing final blank line").
    NOTE(review): with a source file that has no junk last line this drops a
    real entry from each list - confirm against the data file.

    The four lists are written with ``np.savetxt`` to "Lists/allNames.txt",
    "Lists/menList.txt", "Lists/ladyList.txt" and "Lists/aliasesList.txt".

    Returns:
        tuple: ``(allList, menList, ladyList, aliasList, aliasDict)``.
    """
    # The context manager closes the file even if reading fails.
    with open("Lists/Main Characters.txt", "r") as file:
        charText = file.readlines()

    allList = []
    ladyList = []
    menList = []
    aliasList = []
    aliasDict = {}

    print("Generating lists...")

    for line in charText[1:]:
        words = line.split(",")
        # Drop the last field when it contains a space.
        if " " in words[-1]:
            words.pop()

        # A line whose only field was just dropped has no usable name;
        # indexing words[0] on it used to raise IndexError.
        if not words or words[0] == "\n":
            continue

        entry = words[0]
        if "lady" in entry or "empress" in entry or "diaochan" in entry:
            ladyList.append(entry)
            allList.append(entry)
        elif "(" in entry:
            names = entry.split("(")
            courtesy = names[1].split(")")[0]
            aliasDict[names[0]] = courtesy
            aliasList.append(names[0])
            aliasList.append(courtesy)
            allList.append(courtesy)
            menList.append(names[0])
            allList.append(names[0])
        else:
            menList.append(entry)
            allList.append(entry)

    # Removing final blank line (behaviour kept from the original); the
    # emptiness guard avoids an IndexError when a category has no entries.
    for nameList in (allList, menList, ladyList, aliasList):
        if nameList:
            nameList.pop()

    np.savetxt("Lists/allNames.txt", allList, fmt="%s")
    print("allNames.txt created!")

    np.savetxt("Lists/menList.txt", menList, fmt="%s")
    print("menList.txt created!")

    np.savetxt("Lists/ladyList.txt", ladyList, fmt="%s")
    print("ladyList.txt created!")

    np.savetxt("Lists/aliasesList.txt", aliasList, fmt="%s")
    print("aliasesList.txt created!")

    return allList, menList, ladyList, aliasList, aliasDict

In [None]:
# Load the lists from files; you will not have aliasDict, but you probably don't need it.
def loadLists():
    """Read the four name lists back from the "Lists" directory.

    Each file is read in full and split on newlines. The empty string that
    ``split("\\n")`` produces after a final newline is dropped, so the
    returned lists hold one entry per line of the file and no spurious
    trailing "" entry.

    Returns:
        tuple: ``(allList, menList, ladyList, aliasList)`` read from
        "Lists/allNames.txt", "Lists/menList.txt", "Lists/ladyList.txt" and
        "Lists/aliasesList.txt" respectively.
    """
    def readNames(path):
        # The context manager closes every file, not just the last one opened.
        with open(path, "r") as file:
            names = file.read().split("\n")
        # Drop the empty entry left behind by the file's final newline.
        if names and names[-1] == "":
            names.pop()
        return names

    allList = readNames("Lists/allNames.txt")
    menList = readNames("Lists/menList.txt")
    ladyList = readNames("Lists/ladyList.txt")
    aliasList = readNames("Lists/aliasesList.txt")
    return allList, menList, ladyList, aliasList

In [None]:
# Retrieve all phrases related to a character: the sentence in which the
# name appears is merged with the one before and the one after it.
def infoXchar(charSurn, charName, sourceText):
    """Collect three-sentence windows around each mention of a character.

    Every entry of ``sourceText`` is split on whitespace. A mention is the
    word ``charSurn`` immediately followed by the word ``charName``; the
    "surname just seen" state carries over from the end of one entry to the
    start of the next, and the mention is attributed to the entry holding
    ``charName``. When ``charName`` is empty (single-word names), a mention
    is any word equal to ``charSurn``.

    For each mention the entry itself plus its two neighbours are
    concatenated. At the start or end of the text the window is shifted so
    it still covers three entries; when the text has fewer than three
    entries, all of them are used instead of raising IndexError.

    Args:
        charSurn: first word of the name, matched exactly.
        charName: second word of the name, or "" for a single-word name.
        sourceText: sequence of strings (one per sentence).

    Returns:
        list[str]: one concatenated window per mention, in text order.
        The number of mentions found is also printed.
    """
    wantedInfo = []
    # Highest index at which a full three-entry window can start.
    lastStart = max(len(sourceText) - 3, 0)
    rightSurn = False

    for bookmark, line in enumerate(sourceText):
        for word in line.split():
            if not charName:
                # Single-word name: the surname alone is the whole match.
                found = word == charSurn
            elif word == charName and rightSurn:
                found = True
                rightSurn = False
            else:
                found = False
                rightSurn = word == charSurn

            if found:
                start = min(max(bookmark - 1, 0), lastStart)
                wantedInfo.append("".join(sourceText[start:start + 3]))

    print("Found " + str(len(wantedInfo)) + " phrases.")
    return wantedInfo
# Version 2 will also check for aliases, if needed.

In [None]:
# Retrieve all phrases where a certain item appears. The output is a list
# of strings where each element is formed by the sentence in which the item
# was found, the one before and the one after.
def infoXitem(item, text):
    """Collect three-sentence windows around each occurrence of ``item``.

    Every entry of ``text`` is split on whitespace and each word equal to
    ``item`` counts as one occurrence, so an entry mentioning the item
    twice contributes its window twice. The window is the entry plus its
    two neighbours, shifted at the start or end of the text so it still
    covers three entries; when the text has fewer than three entries, all
    of them are used instead of raising IndexError.

    Args:
        item: single word to look for, matched exactly.
        text: sequence of strings (one per sentence).

    Returns:
        list[str]: one concatenated window per occurrence, in text order.
        The number of occurrences found is also printed.
    """
    wantedInfo = []
    # Highest index at which a full three-entry window can start.
    lastStart = max(len(text) - 3, 0)

    for bookmark, line in enumerate(text):
        occurrences = line.split().count(item)
        if occurrences:
            start = min(max(bookmark - 1, 0), lastStart)
            window = "".join(text[start:start + 3])
            wantedInfo.extend([window] * occurrences)

    print("Found " + str(len(wantedInfo)) + " phrases.")
    return wantedInfo

In [None]:
# Return the word-frequency ranking of a list of text blocks.
def frequentWord(sourceText, uniqueWords=None, wordCount=None):
    """Count non-stop-word tokens in ``sourceText``.

    Each block is tokenized with ``word_tokenize``. A token found in the
    module-level ``stop_words`` is not counted. For a counted token: when
    both it and the previous token start with an uppercase letter, the pair
    "previous current" is counted as one entry instead of the single word;
    otherwise the word itself is counted. The previous token is updated for
    every token, stop words included, and carries over between blocks.

    Args:
        sourceText: iterable of strings.
        uniqueWords: optional set to add the counted entries to. It is
            updated in place; a fresh set is created when omitted.
        wordCount: optional dict mapping entry -> count to accumulate into.
            It is updated in place; a fresh dict is created when omitted.

    Returns:
        tuple: ``(wordFreq, uniqueWords, wordCount)`` where ``wordFreq`` is
        a list of ``(count, entry)`` tuples sorted in descending order.
    """
    # Fresh containers per call: default values like set() / {} would be
    # created once and silently shared by every call that omits them.
    if uniqueWords is None:
        uniqueWords = set()
    if wordCount is None:
        wordCount = {}

    previousToken = "x"
    for block in sourceText:
        for word in word_tokenize(block):
            if word not in stop_words:
                if previousToken[0].isupper() and word[0].isupper():
                    entry = previousToken + " " + word
                else:
                    entry = word
                wordCount[entry] = wordCount.get(entry, 0) + 1
                uniqueWords.add(entry)
            previousToken = word

    wordFreq = sorted(((value, key) for key, value in wordCount.items()), reverse=True)

    return wordFreq, uniqueWords, wordCount

In [None]:
# Look at the semantic field of the common words to understand
# what kind of role our character played.
def computeRole(wordFreq, wantData = False):
    """Score how "warrior-like" and "politician-like" a word ranking is.

    For every ``(count, word)`` entry of ``wordFreq`` the count is added to
    the war score when the lowercased word is in the module-level
    ``warWords``, and to the politics score when it is in ``poliWords``.
    Both totals are divided by the number of entries whose word has no
    space in it, plus one.

    Returns ``(warriorness, politicness)`` when ``wantData`` is true;
    otherwise prints both scores and returns 0.
    """
    warTotal = 0
    poliTotal = 0
    entityCount = 0

    for entry in wordFreq:
        occurrences, token = entry[0], entry[1]
        if " " in token:
            # Multi-word entries are (mostly) entities: keep them out of the base.
            entityCount += 1
        lowered = token.lower()
        if lowered in warWords:
            warTotal += occurrences
        if lowered in poliWords:
            poliTotal += occurrences

    denominator = len(wordFreq) - entityCount + 1
    warriorness = warTotal / denominator
    politicness = poliTotal / denominator

    if not wantData:
        print("Warriorness: " + str(warriorness) + " | Politicness: " + str(politicness))
        return 0
    return warriorness, politicness

# Gathering Helping Lists & Stop Words

In [None]:
# Character lists: reuse the saved files when they exist, otherwise build
# them from the raw character file. Note that aliasDict is only defined
# when the lists are generated.
if Path('Lists/allNames.txt').is_file():
    print("Character's name list already computed.")
    allList, menList, ladyList, aliasList = loadLists()
else:
    allList, menList, ladyList, aliasList, aliasDict = generateNamesList()

# TODO: event list, not implemented yet.
#if Path('Lists/eventNames.txt').is_file():
#    print("Event's name list ready.")
#else:
    #generateEventsList();
    #file = open("Lists/Main Events.txt", "r");
    #eventText = file.readlines();

# Word lists used to score characters; one word per line. Context managers
# make sure both files are closed (the first one used to stay open).
with open("Lists/warWords.txt", "r") as file:
    warWords = file.read().split("\n")
with open("Lists/poliWords.txt", "r") as file:
    poliWords = file.read().split("\n")

In [None]:
# Build the stop-word set: the NLTK English list, the same words with an
# uppercase first letter, every punctuation sign and the tokenizer's quote
# marks. The full stop is taken back out at the end.
lib_stopWords = set(stopwords.words('english'))
stop_words = lib_stopWords | {"but"}
stop_words |= {lowerWord[0].upper() + lowerWord[1:] for lowerWord in lib_stopWords}
stop_words.update(punctuation)
stop_words.update(("''", "``"))
stop_words.remove(".")

# Choose the Chapters you Want to Explore!

In [None]:
# Loading files: pick the chapter file to explore by switching the path.
chapterPath = "Dataset/chap001-004.txt"
#chapterPath = "Dataset/chap005-012.txt"
# The context manager closes the file even if reading fails.
with open(chapterPath, "r") as file:
    chaps = file.read()
# One entry per sentence; the full stops themselves are removed by the split.
text = chaps.split(".")

# Free Searching!
Let's analyze the novel starting from a character or entity of your choice.<br>
Since you may not know which characters appear in the chosen chapters, let's compute a list first.<br>

In [None]:
# Gathering & computing values for each main character
warDict = {}
poliDict = {}
warRes = []
poliRes = []

for char in allList:
    # Whitespace split: a trailing or doubled space in a name no longer
    # yields empty parts, and a blank entry is skipped instead of crashing.
    pers = char.split()
    if not pers:
        continue

    # Only the first two words of the name are used, each with its first
    # letter uppercased; cNam is empty for single-word names.
    cSur = pers[0][0].upper() + pers[0][1:]
    cNam = pers[1][0].upper() + pers[1][1:] if len(pers) > 1 else ""
    fullName = cSur + " " + cNam
    print(fullName)

    charInfo = infoXchar(cSur, cNam, text)
    wordFreq, uniqueWords, wordCount = frequentWord(charInfo, uniqueWords=set(), wordCount={})

    # Estimate 'warriorness' and 'politicness' of the character
    warriorness, politicness = computeRole(wordFreq, wantData=True)

    warDict[fullName] = warriorness
    poliDict[fullName] = politicness

The following list tells you who appears in the chosen chapters and how likely he/she is to be a warrior or a politician.

In [None]:
# Sorting results. The rankings are rebuilt from the dictionaries on every
# run, so executing this cell twice does not duplicate the entries.
warRes = sorted(((value, key) for key, value in warDict.items()), reverse=True)
poliRes = sorted(((value, key) for key, value in poliDict.items()), reverse=True)

# Characters with a score of zero are left out of both rankings.
print("WARRIOR")
for val, char in warRes:
    if val != 0:
        print(char + ": " + str(val))

print("\nPOLITICIAN")
for val, char in poliRes:
    if val != 0:
        print(char + ": " + str(val))

## Selecting Character or Item

In [None]:
# Pick the character (or item) to study. Example: Sun Jian.
info = infoXchar(charSurn="Sun", charName="Jian", sourceText=text)
#info = infoXitem("ITEM", text);
# Word ranking computed from the blocks retrieved for that character.
wordFreq, uniqueWords, wordCount = frequentWord(
    info,
    uniqueWords=set(),
    wordCount={},
)

Now that we have info about the character we want, let's answer the following questions in order to enrich the ontology, keeping in mind that useful information may show up while looking at something else.
<ul>
    <li>What has he done? -> Which Events has he been involved in?</li>
    <li>What kind of relationships did he develop? -> With whom?</li>
    <li>What did he achieve? -> Which Title has he managed to get?</li>
</ul>
We could just read all the retrieved blocks, but there could be many of them, so let's proceed in order...

## WHEN and WHERE are we?

In [None]:
# Show the twenty highest-ranked (count, word) entries.
topEntries = wordFreq[:20]
for entry in topEntries:
    print(entry)

We can estimate the year and the event our character is acting in by looking at:
<ul>
    <li>Dates, if we are lucky enough to find one;</li>
    <li>Characters, by looking when they lived;</li>
    <li>Factions: if he/she is cited together with something we already know, we can use info about that other entity to better understand our character.</li>
</ul>
Were you able to answer the question above? Probably not, but don't be afraid: at least we should have got some names, right?

## Entity Analysis

In [None]:
# Among the first hundred entries, show the likely entities: multi-word
# entries and entries written entirely in uppercase.
for entry in wordFreq[:100]:
    label = entry[1]
    looksLikeEntity = " " in label or label.isupper()
    if looksLikeEntity:
        print(entry)

Now we definitely have something we can work on, maybe we can even answer the question above!<br>
We know who our character interacts with, so let's pick one person or item from the list above and try to answer the following questions.

In [None]:
# Placeholder: replace it with the person or item picked from the list above.
entity = "ENTITY"

## HOW and WHY did he/she interact with that particular character?
or
## HOW is that Item related to our character?

While you are reading those blocks, you should keep an eye on:
<ul>
    <li>Cities, because for us Europeans spotting a Chinese location just by looking at names can be hard;</li>
    <li>Dialogues, to understand the relationships between characters;</li>
    <li>Titles, but you probably have to guess it by context;</li>
    <li>Attitude towards Faction, to understand the course of the event.</li>
</ul>

In [None]:
# Print, with a running number, every retrieved block that mentions the entity.
i = 0
for block in (candidate for candidate in info if entity in candidate):
    i += 1
    print("CITATION " + str(i))
    print(block)

Were you able to understand something more about the character? I sincerely hope so!<br>
If not, keep doing this with other meaningful entities or words that could be related to a job, and in the end you'll learn a lot about your character, other entities, the events they're acting in, etc. All these little blocks hide what you need to enrich the ontology!<br>
If you really can't understand anything, you can always read all the blocks related to your character. Doing so would be faster than reading the whole chapters, but it could still take some time because you may have tons of blocks.

In [None]:
# Print every retrieved block, numbered from 1.
i = 0
for i, block in enumerate(info, start=1):
    print("Citation " + str(i))
    print(block)