In [35]:
import os
import json
import dask
import dask.dataframe as dd
from collections import defaultdict
from rapidfuzz import fuzz
import pandas as pd

In [2]:
df = dd.read_csv('../CSVs/8_filtered_genres.csv').drop(['Unnamed: 0','index'],axis=1)
df.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,year,genres,num final genres
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,2020.0,"['Science Fiction', 'Fantasy', 'Adult']",3
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,2017.0,['Science Fiction'],1
2,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,1940.0,"['Mystery', 'Crime', 'Classics']",3
3,Wildoak,C. C. Harrington,55602.0,74.34,6.92,3.04,1.16,1.87,2022.0,"['Historical Fiction', 'Young Adult']",2
4,The Holiday,T. M. Logan,101767.0,50.3,8.02,3.06,1.12,1.93,2019.0,"['Thriller', 'Mystery', 'Crime', 'Suspense']",4


In [3]:
def get_word_counts_from_json(author, title):
    file_path = f'../word-counts/{author}/{title}/word-counts.json'
    word_counts = read_json(file_path)
    return word_counts

In [4]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

In [5]:
def process_directory_structure(root_dir):
    authors = set()
    books = defaultdict(set)

    for author in os.listdir(root_dir):
        if os.path.isdir(f'{root_dir}/{author}'):
            authors.add(author)
            author_dir = os.path.join(root_dir, author)

            for book in os.listdir(author_dir):
                book_dir = os.path.join(author_dir, book)
                books[author].add(book)

    return books

In [6]:
root_dir = '../word-counts/'
books = process_directory_structure(root_dir)

In [7]:
def add_data_to_dataframe(df, author, book, word_counts):
    word_counts['Author'] = author
    word_counts['Book'] = book
    df = dd.concat(word_counts)
    return df

In [8]:
dd.concat?

In [9]:
def clean(name):
    to_remove = [':',',','’','?','/']
    for char in to_remove:
        name = name.replace(char, '')
    name = name.lower().replace(' ','-').replace('&','and').replace('.-','-').replace('.','-').strip('-')
    return name

In [10]:
def get_wordcounts(orig_title, orig_author, correct_books, threshold=80):
    # Reformat title and author's name to match folder names
    title = clean(orig_title)
    author = clean(orig_author)
    correct_books_copy = correct_books.copy()
    
    
    # Check if the author's name is in the list of folders at the author level
    if author in correct_books_copy.keys():
        # If found, check the author's folder for the title folder. 
        if title in correct_books[author]:
            return get_word_counts_from_json(author,title)
        else: 
            # Check for title names that are close to the cleaned title name
            for correct_title in correct_books[author]:
                if fuzz.ratio(title, correct_title) > threshold:
                    return get_wordcounts(correct_title, author, correct_books, threshold)
        return -1
    else: 
        # If the exact author's name wasn't found, check for extremely similar authors' names. 
        for correct_author in correct_books_copy.keys():
            if fuzz.ratio(author, correct_author) > 95:
                return get_wordcounts(title, correct_author, correct_books, threshold)
        # If that doesn't work, check for moderately similar authors' names. 
        for correct_author in correct_books_copy.keys():
            if fuzz.ratio(author, correct_author) > threshold:
                return get_wordcounts(title, correct_author, correct_books, threshold)
        return -1

In [11]:
df.loc[df['author'] == 'Amelia Abraham'].compute()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,year,genres,num final genres
18673,Queer Intentions,Amelia Abraham,86726.0,29.31,8.8,3.12,1.13,1.98,2019.0,['Memoir'],1


In [12]:
test_df = df.head()
test_df

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,year,genres,num final genres
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,2020.0,"['Science Fiction', 'Fantasy', 'Adult']",3
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,2017.0,['Science Fiction'],1
2,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,1940.0,"['Mystery', 'Crime', 'Classics']",3
3,Wildoak,C. C. Harrington,55602.0,74.34,6.92,3.04,1.16,1.87,2022.0,"['Historical Fiction', 'Young Adult']",2
4,The Holiday,T. M. Logan,101767.0,50.3,8.02,3.06,1.12,1.93,2019.0,"['Thriller', 'Mystery', 'Crime', 'Suspense']",4


In [13]:
# Some of the columns have names that could be words in books. Better make each column name two words.
df.columns = ['book title', 'book author', 'total words', 'vividness score', 'passive voice',
       'all adverbs', 'ly-adverbs', 'non-ly-adverbs', 'publication year', 'book genres',
       'num genres']

In [70]:
test_df.apply(lambda x:  dd.from_dict(get_wordcounts(x['book title'],x['book author'],books),1,dd.DataFrame),axis=1)


AttributeError: module 'dask.dataframe' has no attribute 'from_dict'

In [58]:
test_df.apply(lambda x: get_wordcounts(x['book title'], x['book author'],books), 1, dd.DataFrame)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [49]:
test_dict[0] = get_wordcounts('Return of the Thief','Megan Whalen Turner',books)

In [50]:
test_dict[1] = get_wordcounts('Thick as Thieves','Megan Whalen Turner',books)

In [53]:
test_list = []
test_list.append(get_wordcounts('Return of the Thief','Megan Whalen Turner',books))
test_list.append(get_wordcounts('Thick as Thieves','Megan Whalen Turner',books))

In [59]:
dd.from_pandas(pd.DataFrame(test_list))

ValueError: Exactly one of npartitions and chunksize must be specified.