In [1]:
import pandas as pd
import glob
import os
import re
from langdetect import detect
import nltk

In [3]:
extension = 'tsv'
filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [5]:
column_names = ['title', 'series', 'author', 'ratingValue', 'ratingCount', 'plots', 'reviewCount', 'date', 'characters', 'settings', 'url']

In [6]:
dataset = pd.concat([pd.read_csv(f, sep='\t', header=None, names = column_names) for f in filenames], axis = 0)

In [7]:
numbers = list(map(lambda x : int(re.search("[0-9]+" ,x).group(0)), filenames))

In [9]:
dataset["index"] = numbers

In [10]:
ds = dataset.sort_values(by=['index'])

In [11]:
ds.head()

Unnamed: 0,title,series,author,ratingValue,ratingCount,plots,reviewCount,date,characters,settings,url,index
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,6414062.0,"Could you survive on your own in the wild, wit...",374.0,September 14th 2008,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...","District 12 Panem, Capitol Panem, Panem",https://www.goodreads.com/book/show/2767052-th...,1
0,Harry Potter and the Order of the Phoenix,Harry Potter #5,J.K. Rowling,4.5,2528402.0,There is a door at the end of a silent corrido...,870.0,September 2004,"Sirius Black, Draco Malfoy, Ron Weasley, Petun...","Hogwarts School of Witchcraft and Wizardry, Lo...",https://www.goodreads.com/book/show/2.Harry_Po...,2
0,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,4532078.0,The unforgettable novel of a childhood in a sl...,324.0,May 23rd 2006,"Scout Finch, Atticus Finch, Jem Finch, Arthur ...",Maycomb Alabama,https://www.goodreads.com/book/show/2657.To_Ki...,3
0,Pride and Prejudice,,Jane Austen,4.26,3021524.0,Alternate cover edition of ISBN 9780679783268S...,279.0,October 10th 2000,"Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabet...","United Kingdom, Derbyshire England, England, H...",https://www.goodreads.com/book/show/1885.Pride...,4
0,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,4994637.0,About three things I was absolutely positive.F...,501.0,September 6th 2006,"Edward Cullen, Jacob Black, Laurent, Renee, Be...","Forks Washington, Phoenix Arizona, Washington ...",https://www.goodreads.com/book/show/41865.Twil...,5


In [12]:
#drop rows where there is not plot
ds = ds.dropna(subset = ['plots'])

In [14]:
#check if the plot is in english
def is_english(plot):
    try:
        result = (detect(plot) == 'en')
    except:
        #where the plot is empty
        result = False
    return(result)

In [15]:
#discard not eglish plot
df = ds[list(map(lambda x : is_english(x), list(ds['plots'])))]

In [78]:
plots = df[['plots', 'index']]

In [79]:
plots.tail()

Unnamed: 0,plots,index
0,This is the first complete collection of Joan ...,29995
0,Not everyone can take on the things that go bu...,29996
0,Which are the logical tricks that will let you...,29997
0,Jess Mackey is living a meticulously fabricate...,29999
0,The 10 authors in this bundle would like to th...,30000


## Remove Stop Word

In [34]:
from nltk.corpus import stopwords

In [35]:
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alessandra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [95]:
from nltk.tokenize import RegexpTokenizer

In [40]:
 nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [36]:
stop_words = set(stopwords.words('english'))

In [178]:
def remove_stop_word(phrases):
    tokenizer = RegexpTokenizer(r'[a-z]+')
    word = tokenizer.tokenize(phrases.lower())
    return [w for w in word if w not in stop_words]

In [179]:
plots['words'] = plots.apply(lambda x : remove_stop_word(x['plots']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plots['words'] = plots.apply(lambda x : remove_stop_word(x['plots']), axis=1)


In [180]:
plots.head()

Unnamed: 0,plots,index,words
0,"Could you survive on your own in the wild, wit...",1,"[could, survive, wild, every, one, make, sure,..."
0,There is a door at the end of a silent corrido...,2,"[door, end, silent, corridor, haunting, harry,..."
0,The unforgettable novel of a childhood in a sl...,3,"[unforgettable, novel, childhood, sleepy, sout..."
0,"Since its immediate success in 1813, Pride and...",4,"[since, immediate, success, pride, prejudice, ..."
0,About three things I was absolutely positive.F...,5,"[three, things, absolutely, positive, first, e..."


In [181]:
words = plots[['words', 'index']]

In [182]:
words.head()

Unnamed: 0,words,index
0,"[could, survive, wild, every, one, make, sure,...",1
0,"[door, end, silent, corridor, haunting, harry,...",2
0,"[unforgettable, novel, childhood, sleepy, sout...",3
0,"[since, immediate, success, pride, prejudice, ...",4
0,"[three, things, absolutely, positive, first, e...",5


In [183]:
words = words.explode(column='words')

In [196]:
vocabulary = words.groupby('words')['index'].apply(list).to_dict()

In [239]:
keys = list(vocabulary.keys())

In [241]:
for k in keys[500:510]:
    print(k, vocabulary[k])

adamantine [23332]
adamantly [28435]
adamat [6384]
adams [156, 283, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 658, 768, 814, 1945, 2923, 3001, 3410, 3464, 3860, 4313, 4639, 4639, 4639, 4639, 4764, 5270, 5790, 6353, 6353, 7057, 7085, 7314, 7461, 8373, 9067, 9272, 9272, 9272, 9564, 9576, 9995, 20632, 20670, 20670, 20905, 20905, 21797, 22517, 22622, 22738, 22831, 23286, 23306, 23390, 23948, 24385, 25093, 25093, 25093, 25093, 25357, 25383, 25795, 25999, 26514, 26775, 27273, 27273, 27273, 27273, 27273, 27273, 27273, 27301, 28436, 28670, 28879, 29428, 29620, 29825]
adamsdebated [9067]
adamson [6414, 6414, 8530]
adan [26775, 27515]
adana [22622]
adapt [300, 1039, 1445, 1880, 2813, 2962, 2962, 3154, 3545, 4262, 4330, 5949, 5980, 6431, 6785, 7071, 8265, 8297, 9140, 9204, 9207, 9628, 9908, 20875, 21009, 21419, 22532, 23188, 23848, 23927, 24134, 24509, 24897, 26022, 26759, 27454, 27824, 28509, 28549, 28742, 28742, 28890, 28912, 29106, 29829]
adaptability [200

In [197]:
#save on file
import json
with open('result.json', 'w') as fp:
    json.dump(vocabulary, fp)