In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
import numpy as np
%matplotlib inline
import re 
import math

In [4]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import spacy
# NOTE: this loads the small model (en_core_web_sm); the large model (en_core_web_lg) is needed to get real word vectors
import en_core_web_sm
nlp = en_core_web_sm.load()

In [6]:
import os
import pandas as pd

# Read the tab-separated summaries file (it has no header row) and name the columns.
read_file = pd.read_csv(r"booksummaries.txt", sep='\t', header=None)
read_file.columns = ['ID', 'm number', 'book name', 'author name', 'date', 'label', 'summary']

# Clean the label column. The patterns are regular expressions, so regex=True is
# passed explicitly: pandas >= 2.0 defaults to literal matching, which would
# silently leave the labels untouched.
label_cleanup_patterns = [
    r'/m/\S*\s',        # "/m/..." identifier token plus the whitespace after it
    r'[{}]',            # curly braces
    r'\\u00e0\s+clef',  # escaped "\u00e0 clef" text
]
for pattern in label_cleanup_patterns:
    read_file['label'] = read_file['label'].str.replace(pattern, '', regex=True)

# Keep only the columns used downstream (ordered book name, summary, label),
# drop every row that has a missing value in any of them, and renumber from 0.
new_file = (
    read_file.loc[:, ['book name', 'summary', 'label']]
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

# Output the cleaned data as CSV.
new_file.to_csv(r'./booksummries.csv', index=False)


In [7]:
new_file.head()

Unnamed: 0,book name,summary,label
0,Animal Farm,"Old Major, the old boar on the Manor Farm, ca...","""""Roman "", """"Satire"", """"Children's literature""..."
1,A Clockwork Orange,"Alex, a teenager living in near-future Englan...","""""Science Fiction"", """"Novella"", """"Speculative ..."
2,The Plague,The text of The Plague is divided into five p...,"""""Existentialism"", """"Fiction"", """"Absurdist fic..."
3,A Fire Upon the Deep,The novel posits that space around the Milky ...,"""""Hard science fiction"", """"Science Fiction"", ""..."
4,All Quiet on the Western Front,"The book tells the story of Paul Bäumer, a Ge...","""""War novel"", """"Roman """


In [8]:
def text_process(label_list):
    """Drop generic 'fiction' / 'novel' labels made redundant by specific ones.

    A label is "specific" when it contains the word 'novel' or 'fiction' but is
    not exactly that word (for example 'science fiction' or 'war novel').
    All comparisons ignore case.

    Rules:
      * if any specific label is present, the plain 'fiction' label and the
        plain 'novel' label are removed;
      * otherwise, if both plain labels are present, only 'fiction' is removed;
      * otherwise the list is left unchanged.

    Only the first occurrence of a redundant label is removed.

    Parameters
    ----------
    label_list : list of str
        Labels of one book. The list is modified in place.

    Returns
    -------
    list of str
        The same list object, after removal.
    """
    lowered = [label.lower() for label in label_list]
    has_fiction = 'fiction' in lowered
    has_novel = 'novel' in lowered
    has_specific = any(
        (generic in label and label != generic)
        for label in lowered
        for generic in ('novel', 'fiction')
    )

    if has_specific:
        redundant = ('fiction', 'novel')
    elif has_fiction and has_novel:
        redundant = ('fiction',)
    else:
        redundant = ()

    for generic in redundant:
        # Match case-insensitively, exactly like the detection above; a plain
        # list.remove('fiction') would raise ValueError for a label spelled 'Fiction'.
        for position, label in enumerate(label_list):
            if label.lower() == generic:
                del label_list[position]
                break
    return label_list

In [9]:
def _clean_labels(raw_label):
    """Turn one raw label string into a list of lower-case labels.

    Double quotes are stripped, the text is lower-cased and split on ', ',
    and the resulting list is passed through text_process.
    """
    return text_process(raw_label.replace('"', '').lower().split(', '))


# Replace the whole column in one assignment. Writing through
# `new_file.xs(index)['label'] = ...` sets a value on a temporary row object
# and is not guaranteed to reach `new_file` itself.
new_file['label'] = new_file['label'].map(_clean_labels)

In [10]:
# Write the current state of new_file to the CSV file, overwriting the earlier export.
new_file.to_csv(path_or_buf='./booksummries.csv', index=False)


In [11]:
new_file.tail()

Unnamed: 0,book name,summary,label
12836,The Third Lynx,The story starts with former government agent...,[science fiction]
12837,Remote Control,The series follows the character of Nick Ston...,"[thriller, fiction, suspense]"
12838,Transfer of Power,The reader first meets Rapp while he is doing...,"[thriller, fiction]"
12839,Decoded,The book follows very rough chronological ord...,[autobiography]
12840,Poor Folk,Makar Devushkin and Varvara Dobroselova are s...,"[epistolary novel, speculative fiction]"


## Encoding the Labels




In [12]:
# Collect every distinct label in order of first appearance
# (dict keys are unique and keep their insertion order).
label_list = list(dict.fromkeys(
    tag
    for book_tags in new_file['label']
    for tag in book_tags
))

In [13]:
len(label_list)

227

In [14]:
# One row per book and one column per distinct label, all initialised to 0.
# The shape is derived from the data instead of being hard-coded, so the table
# stays correct if the dataset or the label cleaning changes.
one_hot = pd.DataFrame(
    np.zeros((len(new_file), len(label_list)), dtype=int),
    columns=label_list,
)

In [15]:
# Flag each book's labels. `.at[row, column]` writes straight into `one_hot`;
# the chained form `one_hot[l][index] = 1` assigns through an intermediate
# Series and is not guaranteed to update the frame.
for index, object_label in enumerate(new_file['label']):
    for l in object_label:
        one_hot.at[index, l] = 1

In [16]:
one_hot

Unnamed: 0,roman,satire,children's literature,speculative fiction,science fiction,novella,utopian and dystopian fiction,existentialism,absurdist fiction,hard science fiction,...,encyclopedia,mashup,biopunk,popular culture,neuroscience,new york times best seller list,epic science fiction and fantasy,alien invasion,prose,pastiche
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12836,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12838,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split the summaries into words

In [24]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


def words_process(new_file):
    """Tokenize every book summary into lower-case, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    new_file : table-like object indexable by 'summary'
        Its 'summary' entry is iterated and each item is passed to
        word_tokenize (presumably a pandas DataFrame with a column of
        strings; confirm against the caller).

    Returns
    -------
    list of list of str
        One token list per summary, in input order.

    Side effects
    ------------
    Prints the number of summaries processed.
    """
    summary_list = list(new_file['summary'])
    print("the total number of books: {}\n".format(len(summary_list)))

    # Build the stop-word set once; it does not change between summaries.
    stop_words = set(stopwords.words('english'))

    all_docs = []
    for doc in summary_list:
        # Keep purely alphabetic tokens (this discards punctuation and numbers),
        # lower-case them, then filter out the stop words.
        words = [token.lower() for token in word_tokenize(doc) if token.isalpha()]
        all_docs.append([word for word in words if word not in stop_words])

    return all_docs

In [26]:
# Number of leading rows to tokenize while prototyping.
# TODO: set SAMPLE_SIZE to None to process every summary in new_file.
SAMPLE_SIZE = 3
sample_frame = new_file if SAMPLE_SIZE is None else new_file.iloc[:SAMPLE_SIZE, :]
all_words = words_process(sample_frame)
all_words

the total number of books: 3



[['old',
  'major',
  'old',
  'boar',
  'manor',
  'farm',
  'calls',
  'animals',
  'farm',
  'meeting',
  'compares',
  'humans',
  'parasites',
  'teaches',
  'animals',
  'revolutionary',
  'song',
  'england',
  'major',
  'dies',
  'two',
  'young',
  'pigs',
  'snowball',
  'napoleon',
  'assume',
  'command',
  'turn',
  'dream',
  'philosophy',
  'animals',
  'revolt',
  'drive',
  'drunken',
  'irresponsible',
  'mr',
  'jones',
  'farm',
  'renaming',
  'animal',
  'farm',
  'adopt',
  'seven',
  'commandments',
  'important',
  'animals',
  'equal',
  'snowball',
  'attempts',
  'teach',
  'animals',
  'reading',
  'writing',
  'food',
  'plentiful',
  'farm',
  'runs',
  'smoothly',
  'pigs',
  'elevate',
  'positions',
  'leadership',
  'set',
  'aside',
  'special',
  'food',
  'items',
  'ostensibly',
  'personal',
  'health',
  'napoleon',
  'takes',
  'pups',
  'farm',
  'dogs',
  'trains',
  'privately',
  'napoleon',
  'snowball',
  'struggle',
  'leadership',
  's

## Outliers

## Using a Pre-Trained Embedding Layer

In [31]:
# import Word2Vec loading capabilities
from gensim.models import KeyedVectors

# Load the pre-trained word2vec vectors from their binary file.
WORD2VEC_PATH = './CNN_Text_Classification/word2vec_model/word2vec-slim/GoogleNews-vectors-negative300.bin'
embed_lookup = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

# Tokenize summaries

In [37]:
def tokenize_all_sum(embed_lookup, docs=None):
    """Map every word of every tokenized summary to its embedding vocabulary index.

    Parameters
    ----------
    embed_lookup : word-vector lookup object
        Either exposes `key_to_index` (a word -> index mapping, the gensim 4
        attribute) or `vocab` (a word -> entry mapping whose entries carry an
        `.index` attribute, the gensim 3 attribute).
    docs : list of list of str, optional
        Tokenized summaries. When omitted, the notebook-level `all_words`
        variable is used, which is what the original implementation always did.

    Returns
    -------
    list of list of int
        One index list per summary. Words missing from the vocabulary map to 0.
    """
    if docs is None:
        docs = all_words

    if hasattr(embed_lookup, 'key_to_index'):
        key_to_index = embed_lookup.key_to_index

        def word_index(word):
            return key_to_index.get(word, 0)
    else:
        vocab = embed_lookup.vocab

        def word_index(word):
            # Only a missing word is expected here; any other error should
            # surface instead of silently turning every token into 0.
            try:
                return vocab[word].index
            except KeyError:
                return 0

    return [[word_index(word) for word in doc] for doc in docs]

In [38]:
# Convert every tokenized summary into its list of vocabulary indices.
tokenized_sum = tokenize_all_sum(embed_lookup=embed_lookup)

In [39]:
# sanity check: print the first tokenized summary
print(tokenized_sum[0])

[154, 338, 154, 86316, 63917, 2563, 926, 2418, 2563, 349, 8691, 5103, 32366, 7328, 2418, 8709, 2216, 146659, 338, 6119, 54, 533, 13250, 37391, 517953, 4315, 3929, 749, 2768, 5868, 2418, 14076, 817, 18500, 11132, 94620, 131280, 2563, 35097, 2801, 2563, 4922, 375, 60262, 396, 2418, 3325, 37391, 2687, 4016, 2418, 1772, 2009, 560, 18855, 2563, 623, 9841, 13250, 18603, 1784, 1274, 211, 3125, 605, 560, 1281, 22341, 764, 492, 517953, 920, 38308, 2563, 2766, 5325, 9223, 517953, 37391, 2894, 1274, 37391, 5513, 383, 857, 45282, 517953, 2766, 5148, 37391, 309, 15156, 614, 517953, 75114, 726, 6481, 2351, 2563, 5226, 1778, 956, 13250, 209, 2563, 527, 533, 13283, 945, 798371, 34858, 517953, 1059, 912, 45282, 931, 2418, 141, 3437, 3102, 2200, 870, 45282, 3148, 1896, 2418, 359, 45282, 66096, 517953, 798371, 5823, 2418, 37391, 3016, 1021, 34168, 0, 2470, 3064, 4406, 5160, 37391, 2540, 29160, 517953, 2219, 50487, 2563, 2766, 1423, 2418, 11092, 174391, 154, 2318, 13250, 2258, 325, 9174, 509, 36632, 12891

In [42]:
# Length of the longest tokenized summary (0 when there are none).
max_Length = max((len(token_ids) for token_ids in tokenized_sum), default=0)
max_Length

599

## Padding sequences

In [60]:
def pad_features(tokenized_sum, seq_length):
    """Left-pad (or truncate) every index sequence to exactly seq_length entries.

    Parameters
    ----------
    tokenized_sum : sequence of sequences of int
        One index sequence per summary.
    seq_length : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray of shape (len(tokenized_sum), seq_length), integer dtype
        Shorter sequences are right-aligned with zeros in front; longer ones
        keep only their first seq_length entries. Empty sequences give an
        all-zero row.
    """
    features = np.zeros((len(tokenized_sum), seq_length), dtype=int)

    for i, row in enumerate(tokenized_sum):
        kept = row[:seq_length]
        if len(kept) == 0:
            # Nothing to copy. Without this guard the slice `-0:` would select
            # the whole row and the assignment would fail to broadcast.
            continue
        features[i, -len(kept):] = kept

    return features

In [66]:
# Pad every tokenized summary to the length of the longest one.
features = pad_features(tokenized_sum, seq_length=max_Length)

# Sanity checks on the resulting matrix: row count and width of the first row.
row_count, first_row_width = len(features), len(features[0])
assert row_count == len(tokenized_sum), "Features should have as many rows as reviews."
assert first_row_width == max_Length, "Each feature row should contain seq_length values."

# Show the padded row at position 2.
print(features[2])

[   2986   17170    4730     161    1286     644       0    1009   14141
    2613   20840   25965     994    2736    1852   26879    7687     774
    8400    2733     290    3828     243    1291     892    5549     208
     869     555    2179   34977   14141    8881    2179   11278    1789
  900125   17170     828    1980 1040078       0     870   11029    2743
     473   24762     473   34562 1671245   58915    6119   10756       0
   41957    7383       0    4696     216    4523   17170    6378     644
    1247    1969    1912     644     892    5003    1702    3217    1166
      45     570     713    2164     966   40575    2540    4004   11185
     892     143   66377       0    1803    2013     768     981   54833
    2375     598     142     581    9616   26419     509    1448    1175
    1828     233    4358   59267   14358     768     605    9620    1046
     825    7676    2217     405      80     255     570    4137    2219
    1027    4983    1448     433     942   28196   

# Training, Validation, and Test Data

In [69]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

# Pair the features with the same number of label rows. The features may have
# been built from only the leading rows of the dataset while one_hot covers
# every book; slicing the full one_hot table with feature-based indices would
# leave the label splits far larger than the matching feature splits.
labels = one_hot.iloc[:len(features)]
assert len(labels) == len(features), "Not enough label rows for the feature rows."

# First split_frac of the rows is the training set.
split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = labels.iloc[:split_idx], labels.iloc[split_idx:]

# The remainder is cut in half: validation first, then test.
test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y.iloc[:test_idx], remaining_y.iloc[test_idx:]

# Every feature split must have exactly one label row per feature row.
assert len(train_x) == len(train_y)
assert len(val_x) == len(val_y)
assert len(test_x) == len(test_y)

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))


			Feature Shapes:
Train set: 		(2, 599) 
Validation set: 	(0, 599) 
Test set: 		(1, 599)


### Note: the splits above contain only 2, 0, and 1 rows because only 3 data samples were used for this run