In [128]:
import os
import json
import pandas as pd
from collections import defaultdict
from rapidfuzz import fuzz
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
df = pd.read_csv('../CSVs/8_filtered_genres.csv',index_col=0).drop('index',axis=1)

In [3]:
df.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,year,genres,num final genres
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,2020.0,"['Science Fiction', 'Fantasy', 'Adult']",3
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,2017.0,['Science Fiction'],1
2,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,1940.0,"['Mystery', 'Crime', 'Classics']",3
3,Wildoak,C. C. Harrington,55602.0,74.34,6.92,3.04,1.16,1.87,2022.0,"['Historical Fiction', 'Young Adult']",2
4,The Holiday,T. M. Logan,101767.0,50.3,8.02,3.06,1.12,1.93,2019.0,"['Thriller', 'Mystery', 'Crime', 'Suspense']",4


In [4]:
def get_word_counts_from_json(author, title):
    file_path = f'../word-counts/{author}/{title}/word-counts.json'
    word_counts = read_json(file_path)
    return word_counts

In [5]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

The formatting of the directory is as follows: Within the 'word-counts' folder, there are about 15,000 folders representing authors. In the folder of each author, there is a subfolder for each of their books. And in each book folder, there is one file: A JSON containing the frequency of each word that appears in the book. 

In [6]:
def process_directory_structure(root_dir
    authors = set()
    books = defaultdict(set)
    df = pd.DataFrame()


    for author in os.listdir(root_dir):
        if os.path.isdir(f'{root_dir}/{author}'):
            authors.add(author)
            author_dir = os.path.join(root_dir, author)

            for book in os.listdir(author_dir):
                book_dir = os.path.join(author_dir, book)
                books[author].add(book)

    return books

In [7]:
root_dir = '../word-counts/'
books = process_directory_structure(root_dir)

In [11]:
def add_data_to_dataframe(df, author, book, word_counts):
    word_counts['Author'] = author
    word_counts['Book'] = book
    df = pd.concat(word_counts, ignore_index=True)
    return df

In [27]:
def clean(name):
    to_remove = [':',',','’','?','/']
    for char in to_remove:
        name = name.replace(char, '')
    name = name.lower().replace(' ','-').replace('&','and').replace('.-','-').replace('.','-').strip('-')
    return name

In [42]:
def get_wordcounts(orig_title, orig_author, correct_books, threshold=80):
    # Reformat title and author's name to match folder names
    title = clean(orig_title)
    author = clean(orig_author)
    correct_books_copy = correct_books.copy()
    
    
    # Check if the author's name is in the list of folders at the author level
    if author in correct_books_copy.keys():
        # If found, check the author's folder for the title folder. 
        if title in correct_books[author]:
            return get_word_counts_from_json(author,title)
        else: 
            # Check for title names that are close to the cleaned title name
            for correct_title in correct_books[author]:
                if fuzz.ratio(title, correct_title) > threshold:
                    return get_wordcounts(correct_title, author, correct_books, threshold)
        return -1
    else: 
        # If the exact author's name wasn't found, check for extremely similar authors' names. 
        for correct_author in correct_books_copy.keys():
            if fuzz.ratio(author, correct_author) > 95:
                return get_wordcounts(title, correct_author, correct_books, threshold)
        # If that doesn't work, check for moderately similar authors' names. 
        for correct_author in correct_books_copy.keys():
            if fuzz.ratio(author, correct_author) > threshold:
                return get_wordcounts(title, correct_author, correct_books, threshold)
        return -1

In [14]:
books = process_directory_structure(root_dir)

In [29]:
for i in tqdm(range(len(df))):
    returned =  get_wordcounts(df.iloc[i]['title'],df.iloc[i]['author'],books,80)
    if returned == -1:
        print(f"Did not work for {df.iloc[i]['title']}, by {df.iloc[i]['author']}")

  0%|          | 0/20490 [00:00<?, ?it/s]

Did not work for Queer Intentions: A (Personal) journey through LGBTQ+ culture, by Amelia Abraham


The only book I wasn't able to find was a book whose title in the dataframe includes a subtitle, and whose matching folder does not! An easy fix. 

In [34]:
df.loc[df['author'] == 'Amelia Abraham', 'title'] = "Queer Intentions"

In [35]:
df.loc[df['author'] == 'Amelia Abraham']

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,year,genres,num final genres
18673,Queer Intentions,Amelia Abraham,86726.0,29.31,8.8,3.12,1.13,1.98,2019.0,['Memoir'],1


In [57]:
test_df = df.loc[df['title']=='Return of the Thief']

In [58]:
test_book = get_wordcounts('Return of the Thief','Megan Whalen Turner',books)

In [47]:
type(test_book)

dict

In [64]:
pd.DataFrame(test_book,index=range(1))

Unnamed: 0,the,to,and,of,I,he,a,was,his,in,...,writer,wrung,xanthe,y,yard,yarn,yellowed,youth,zigzagging,Author
0,7395,3039,2696,2136,2115,1800,1695,1474,1416,1383,...,1,1,1,1,1,1,1,1,1,Return of the Thief


In [71]:
test_book['the']

7395

In [73]:
test_book2 = get_wordcounts('The Vanished Birds','Simon Jimenez',books)

In [78]:
pd.DataFrame(test_book2,index=[0])

Unnamed: 0,the,of,to,and,she,her,a,was,he,in,...,yelp,yet-uncharted,yogic,youngest,zeroed-out,zigzags,zipper,zone,zoned,zucar
0,9651,3215,2869,2862,2509,2313,2304,2202,2165,1809,...,1,1,1,1,1,1,1,1,1,1


In [79]:
test_df1 = pd.DataFrame(test_book,index=[0])

In [80]:
test_df2 = pd.DataFrame(test_book2,index=[0])

In [82]:
pd.concat([test_df1,test_df2],ignore_index=True)

Unnamed: 0,the,to,and,of,I,he,a,was,his,in,...,yelp,yet-uncharted,yogic,youngest,zeroed-out,zigzags,zipper,zone,zoned,zucar
0,7395,3039,2696,2136,2115,1800,1695,1474,1416,1383,...,,,,,,,,,,
1,9651,2869,2862,3215,936,2165,2304,2202,1389,1809,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [83]:
test_dict = {'book_title': 'Return of the Thief','book_author':'Megan Whalen Turner'}

In [84]:
test_dict.update(get_wordcounts(test_dict['book_title'],test_dict['book_author'],books))

In [91]:
test_dict['year']

13

In [97]:
df.columns = ['book title', 'book author', 'total words', 'vividness score', 'passive voice',
       'all adverbs', 'ly-adverbs', 'non-ly-adverbs', 'publication year', 'book genres',
       'num genres']

In [98]:
df.head()

Unnamed: 0,book title,book author,total words,vividness score,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,publication year,book genres,num genres
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,2020.0,"['Science Fiction', 'Fantasy', 'Adult']",3
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,2017.0,['Science Fiction'],1
2,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,1940.0,"['Mystery', 'Crime', 'Classics']",3
3,Wildoak,C. C. Harrington,55602.0,74.34,6.92,3.04,1.16,1.87,2022.0,"['Historical Fiction', 'Young Adult']",2
4,The Holiday,T. M. Logan,101767.0,50.3,8.02,3.06,1.12,1.93,2019.0,"['Thriller', 'Mystery', 'Crime', 'Suspense']",4


In [99]:
test_df = df.head()

In [106]:
test_df = test_df.join(test_df.apply(lambda x:  pd.Series(get_wordcounts(x['book title'], x['book author'],books)),axis=1))

In [107]:
test_df

Unnamed: 0,book title,book author,total words,vividness score,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,publication year,book genres,...,zone,zoned,zoo,zoom,zoomed,zooming,zucar,émigré,émigrés,über
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,2020.0,"['Science Fiction', 'Fantasy', 'Adult']",...,1.0,1.0,,,,,1.0,,,
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,2017.0,['Science Fiction'],...,3.0,,,2.0,6.0,,,,,
2,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,1940.0,"['Mystery', 'Crime', 'Classics']",...,,,,,,,,7.0,1.0,1.0
3,Wildoak,C. C. Harrington,55602.0,74.34,6.92,3.04,1.16,1.87,2022.0,"['Historical Fiction', 'Young Adult']",...,,,5.0,,,,,,,
4,The Holiday,T. M. Logan,101767.0,50.3,8.02,3.06,1.12,1.93,2019.0,"['Thriller', 'Mystery', 'Crime', 'Suspense']",...,1.0,,,2.0,7.0,4.0,,,,


In [116]:
len(df)

20490

In [None]:
words_df.tail()

In [130]:
first_df = df.iloc[:1000]

In [131]:
first_df = first_df.join(first_df.progress_apply(lambda x: pd.Series(get_wordcounts(x['book title'], x['book author'], books)), axis=1))


  0%|          | 0/1000 [00:00<?, ?it/s]

In [134]:
df_chunks = []
for i in range(20):
    df_chunks.append(df.iloc[1000*i:1000*(i+1)])

In [138]:
df_chunks.append(df.iloc[20000:len(df)])

In [146]:
complete_chunks = []

In [151]:
for chunk in df_chunks[len(complete_chunks):len(df_chunks)]:
    complete_chunks.append(chunk.join(chunk.progress_apply(lambda x: pd.Series(get_wordcounts(x['book title'], x['book author'], books)), axis=1)))

  0%|          | 0/1000 [00:00<?, ?it/s]

MemoryError: Unable to allocate 2.88 MiB for an array with shape (377410,) and data type float64

In [153]:
for chunk in complete_chunks:
    print(chunk.memory_usage)

<bound method DataFrame.memory_usage of                                   book title                      book author  \
0                         The Vanished Birds                    Simon Jimenez   
1                         The Price of Honor               Jonathan P. Brazee   
2    The Case of the Baker Street Irregulars                  Anthony Boucher   
3                                    Wildoak                 C. C. Harrington   
4                                The Holiday                      T. M. Logan   
..                                       ...                              ...   
995                       All Over the Place               Geraldine DeRuiter   
996                             Peacekeepers    James Rosone & Miranda Watson   
997                                  Stamped  Jason Reynolds & Ibram X. Kendi   
998                          The Other Emily                      Dean Koontz   
999                       All Roads End Here                      Dav

In [None]:
second_df = second_df.join(second_df.progress_apply(lambda x: pd.Series(get_wordcounts(x['book title'], x['book author'], books)), axis=1))