In [None]:
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from sklearn import svm, datasets
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.utils.multiclass import unique_labels
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
import gensim
import numpy as np
import re
import pandas as pd
import nltk
%matplotlib inline

# Preprocessing the data

In [None]:
# Lists to store the different narrative parts.
sent = []
begin = []
mid = []
end = []
beginStory = []
midStory = []
endStory = []

# West African corpus: stories are separated by a blank line.
filename = "input_afr.txt"
with open(filename, 'r') as open_file:
    txt_afr = open_file.read()
# Drop whitespace-only chunks (e.g. from a trailing blank line in the file);
# each of them would otherwise add three empty, labelled documents.
afr = [story for story in txt_afr.split('\n\n') if story.strip()]

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Sentence-tokenize each story and store it with one sentence per line.
for story in afr:
    sent.append('\n'.join(tokenizer.tokenize(story)))

# Split at 25%: first quarter -> begin, middle half -> mid, last quarter -> end.
# NOTE(review): a story with only one or two lines gets an empty begin and end
# because both cut points round to the story boundaries.
for story in sent:
    # Split once and count the same list that is sliced, so the cut points
    # always refer to the lines actually being cut.
    lines = story.splitlines()
    num_sen = len(lines)
    first_cut = round(num_sen/10*2.5)
    second_cut = round(num_sen/10*7.5)
    begin.append(lines[:first_cut])
    mid.append(lines[first_cut:second_cut])
    end.append(lines[second_cut:])

# Run this instead of the previous block if you want to test the "split at 10%".
# for story in sent:
#     num_sen = len(story.split("\n"))
#     if round(num_sen/10*1) > 0:
#         begin.append(story.splitlines()[:round(num_sen/10*1)])
#     elif round(num_sen/10*1) == 0:
#         begin.append(story.splitlines()[:round(1)])
#     mid.append(story.splitlines()[round(num_sen/10*1):round(num_sen/10*9)])
#     end.append(story.splitlines()[round(num_sen/10*9):])

In [None]:
# Create new list of strings.

def stringList(oldList,newList):
    """Append every story of `oldList`, joined into one string, to `newList`.

    Each element of `oldList` is an iterable of strings; its items are joined
    with single spaces. `newList` is extended in place and nothing is returned.
    """
    newList.extend(' '.join(sentences) for sentences in oldList)
        
# Start from empty lists so that re-running this cell does not append a
# second copy of every story to the joined-story lists.
beginStory, midStory, endStory = [], [], []
stringList(begin, beginStory)
stringList(mid, midStory)
stringList(end, endStory)

In [None]:
# Steps to preprocess the texts from the corpus.

# Raw strings keep the regex escapes (\., \s, \-, ...) intact without relying
# on Python passing unknown string escapes through, which emits a warning.
# Punctuation that is deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
# Double HTML line breaks, hyphens and slashes, which are turned into a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_stories(stories):
    """Clean every text in `stories` and return the cleaned texts as a new list.

    Each text is lower-cased, stripped of the punctuation matched by
    REPLACE_NO_SPACE, has the matches of REPLACE_WITH_SPACE turned into a
    space, and finally loses its NLTK English stop words via RemoveStopWords.
    """
    english_stops = set(nltk.corpus.stopwords.words('english'))
    cleaned = []
    for line in stories:
        text = REPLACE_NO_SPACE.sub("", line.lower())
        text = REPLACE_WITH_SPACE.sub(" ", text)
        cleaned.append(RemoveStopWords(text, english_stops))
    return cleaned

def RemoveStopWords(line, stopwords):
    """Return `line` without stop words, empty tokens and bare "&" tokens.

    The line is split on single spaces, every piece is stripped of surrounding
    whitespace, and the pieces that survive the filter are re-joined with
    single spaces.
    """
    stripped = (piece.strip() for piece in line.split(" "))
    return " ".join(
        token for token in stripped
        if token != "" and token != "&" and token not in stopwords
    )

# Clean the three joined-story lists. Note that `begin`, `mid` and `end` are
# rebound here: from lists of sentence lists to lists of cleaned strings.
begin, mid, end = (
    preprocess_stories(part) for part in (beginStory, midStory, endStory)
)

In [None]:
# Define the labels and concatenate the lists of the narrative parts.
# Label encoding: 0 = begin, 1 = mid, 2 = end.

labels = [0] * len(begin) + [1] * len(mid) + [2] * len(end)
stories = begin + mid + end

# Store the words of the folk tales.
# The `for story` clause must come first: with the clauses the other way
# round, `story.split()` read a `story` variable left over from an earlier
# loop and the reported word count was wrong.

words = [word for story in stories for word in story.split()]
print("The total word count is:", len(words))

# Store folk tales and labels in dataframe.

df = pd.DataFrame({'stories': stories, 'labels': labels})

In [None]:
# Input: the cleaned story parts. Output: their integer part labels.

X, y = df["stories"], df["labels"]

# Hold out 20% of the samples as a test set; the fixed seed makes the split
# repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display names for the labels 0, 1 and 2, in that order.
my_tags = ["begin", "mid", "end"]

# Train Naive Bayes classifier

In [None]:
# Evaluate the NB classifier with 10 repetitions of 10-fold cross-validation.

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               # MultinomialNB restored: this cell had been left running a
               # RandomForestClassifier under the Naive Bayes heading.
               ('clf', MultinomialNB()),
              ])
# This fit does not influence the cross-validation scores below; it only
# leaves `nb` fitted on the full data set.
nb.fit(X, y)

scores_avg = []

for seed in range(10):
    # A fixed, different seed per repetition: every repetition shuffles the
    # folds differently, yet the reported accuracy is the same on every run.
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)
    scores = cross_val_score(nb, X, y, cv=cv, scoring='accuracy')
    scores_avg.append(scores.mean())

print("Accuracy: %0.4f" % (np.array(scores_avg).mean()))

In [None]:
# Fit NB on the training split and evaluate it on the held-out test split.
# The predictions in `y_pred` are what the confusion-matrix cell plots.

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

# scikit-learn metrics take their arguments as (y_true, y_pred).
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred,target_names=my_tags))

In [None]:
# Plot confusion matrix.

# np.asarray works whether y_test is still a pandas Series or has already
# been converted to an array, so this cell can be run more than once.
y_test = np.asarray(y_test)
class_names = np.array(["begin","mid","end"])
def plot_confusion_matrix(y_true, y_pred, classes,
                          title=None,
                          cmap=plt.cm.Blues,
                          save_path='temp.png',
                          dpi=1000):
    """
    Plot the confusion matrix of `y_pred` against `y_true` and return its Axes.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted integer labels.
    classes : numpy.ndarray
        Display names, indexed by integer label.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    save_path : str or None
        File the figure is written to; pass None to skip saving. The default
        keeps the previous behaviour of writing 'temp.png'.
    dpi : int
        Resolution used when saving the figure.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes the matrix was drawn on.
    """

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = 'd'
    # Cells darker than half of the maximum count get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path, dpi=dpi)
    return ax

# Show NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Draw the confusion matrix of the held-out predictions and display it.
plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    classes=class_names,
    title='Confusion matrix',
)
plt.show()

# Train the SVM and LR classifiers

In [None]:
# Evaluate the SVM classifier (SGDClassifier) with 10 repetitions of 10-fold
# cross-validation.

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                # Seeded so that the stochastic optimiser gives the same
                # model, and therefore the same accuracy, on every run.
                ('clf', SGDClassifier(random_state=42)),
               ])
# This fit does not influence the cross-validation scores below; it only
# leaves `sgd` fitted on the full data set.
sgd.fit(X, y)

scores_avg = []
for seed in range(10):
    # A fixed, different seed per repetition: every repetition shuffles the
    # folds differently, yet the reported accuracy is the same on every run.
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)
    scores = cross_val_score(sgd, X, y, cv=cv, scoring='accuracy')
    scores_avg.append(scores.mean())
print("Accuracy: %0.4f" % (np.array(scores_avg).mean()))

In [None]:
# Evaluate the LR classifier with 10 repetitions of 10-fold cross-validation.

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression()),
               ])
# This fit (on the training split) does not influence the cross-validation
# scores below, which are computed on the full data set.
logreg.fit(X_train, y_train)

scores_avg = []
for seed in range(10):
    # A fixed, different seed per repetition: every repetition shuffles the
    # folds differently, yet the reported accuracy is the same on every run.
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)
    scores = cross_val_score(logreg, X, y, cv=cv, scoring='accuracy')
    scores_avg.append(scores.mean())
print("Accuracy: %0.4f" % (np.array(scores_avg).mean()))

# Train the Word2Vec classifier with LR

In [None]:
# Load the pre-trained Google News Word2Vec vectors (binary word2vec format).

wv = gensim.models.KeyedVectors.load_word2vec_format(
    fname="GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)
# NOTE(review): init_sims(replace=True) is the gensim 3.x way of normalising
# the vectors in place - confirm against the installed gensim version.
wv.init_sims(replace=True)

def word_averaging(wv, words):
    """Return the unit-length mean vector of `words` as float32.

    Parameters
    ----------
    wv : keyed-vectors object
        Must provide `vocab`, `syn0norm` and `vector_size`.
        NOTE(review): `vocab` / `syn0norm` look like gensim 3.x attributes -
        confirm against the installed gensim version.
    words : iterable
        Tokens to average. An item that already is a NumPy array is used as a
        vector directly; a token missing from `wv.vocab` is skipped.

    Returns
    -------
    numpy.ndarray
        The normalised mean vector, or a zero vector of length
        `wv.vector_size` when no item contributed a vector.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    # Nothing to average: fall back to a zero vector.
    if not vectors:
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every token list in `text_list`.

    Returns a 2-D array with one row per entry of `text_list`, each row being
    the result of word_averaging for that entry.
    """
    averaged = [word_averaging(wv, tokens) for tokens in text_list]
    return np.vstack(averaged)

def w2v_tokenize_text(text):
    """Return the word tokens of `text`, dropping tokens shorter than two characters.

    The text is first split into sentences and each sentence into words with
    NLTK's English tokenizers; the tokens are returned in reading order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [None]:
# Use every sample for the Word2Vec experiment (it is evaluated with
# cross-validation), shuffled with a fixed seed.
# train_test_split(..., test_size=0.0) is rejected by current scikit-learn
# releases, so the frame is shuffled directly and `test` is kept as an
# empty frame with the same columns.

train = df.sample(frac=1.0, random_state=42)
test = df.iloc[0:0]
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['stories']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)

In [None]:
# Fit the LR classifier on the averaged word vectors and evaluate it with
# 10 repetitions of 10-fold cross-validation.

logreg2 = LogisticRegression(n_jobs=1, C=1e5)
# This fit does not influence the cross-validation scores below; it only
# leaves `logreg2` fitted on all averaged vectors.
logreg2 = logreg2.fit(X_train_word_average, train['labels'])


scores_avg = []
for seed in range(10):
    # A fixed, different seed per repetition: every repetition shuffles the
    # folds differently, yet the reported accuracy is the same on every run.
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)
    scores = cross_val_score(logreg2, X_train_word_average, train['labels'], cv=cv, scoring='accuracy')
    scores_avg.append(scores.mean())
print("Accuracy: %0.4f" % (np.array(scores_avg).mean()))

# Train the TF classifier with LR

In [None]:
# Articles removed by the term-frequency vectorizer in build_model.
stop_words = ['a', 'an', 'the']

def build_model(mode):
    """Build an unfitted vectorizer + logistic-regression pipeline.

    Parameters
    ----------
    mode : str
        'count' uses a CountVectorizer (raw token counts).
        'tf' uses a TfidfVectorizer with IDF weighting switched off, L2
        normalisation, and the module-level `stop_words` removed.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps 'vect' and 'clf'.

    Raises
    ------
    ValueError
        If `mode` is neither 'count' nor 'tf'.
    """
    if mode == 'count':
        vect = CountVectorizer()
    elif mode == 'tf':
        vect = TfidfVectorizer(use_idf=False, stop_words=stop_words, norm='l2')
    else:
        # The message names the modes that are actually accepted.
        raise ValueError("Mode should be either 'count' or 'tf'")

    return Pipeline([
        ('vect', vect),
        ('clf' , LogisticRegression(solver='newton-cg',n_jobs=-1))
    ])

def pipeline(x, y, mode, n_splits=10, n_repeats=10, seed=0):
    """Cross-validate the model built for `mode` and print its mean accuracy.

    Parameters
    ----------
    x, y : array-like
        Texts and their labels.
    mode : str
        Passed on to build_model.
    n_splits : int
        Number of folds per repetition.
    n_repeats : int
        Number of times the cross-validation is repeated with a new shuffle.
    seed : int
        Base seed; repetition `r` shuffles its folds with `seed + r`, so the
        printed accuracy is the same on every run.

    Returns
    -------
    The pipeline returned by build_model. This function never calls `fit`
    on it.
    """
    model_pipeline = build_model(mode)
    scores_avg = []
    for repeat in range(n_repeats):
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed + repeat)
        scores = cross_val_score(model_pipeline, x, y, cv=cv, scoring='accuracy')
        scores_avg.append(scores.mean())
    print("Accuracy: %0.4f" % (np.array(scores_avg).mean()))
    return model_pipeline

In [None]:
# Fit the count-vectorizer + logistic-regression pipeline on the full data set.
x = X
model_pipeline = build_model(mode='count')
# Last expression of the cell, so its return value is what the cell displays.
model_pipeline.fit(x, y)

In [None]:
import collections
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold

# Cross-validate both vectorizer variants on the training split; after the
# loop `model_pipeline` holds the pipeline of the last variant ('tf').
for banner, mode in (('Using Count Vectorizer------', 'count'),
                     ('Using TF Vectorizer------', 'tf')):
    print(banner)
    model_pipeline = pipeline(X_train, y_train, mode=mode)