Compiling the wordcount data into one big dataframe technically worked, but the dataframes were too big to fit onto a pandas dataframe, with each chunk of 1000 books having about 38,000 columns--and the columns didn't match up, so combining them would make the dfs even bigger. This might be worth switching from pandas to dask if all of those words had a predictive value to our model, but it seems clear that a word that appears in a book only 1 time, or that appears in only 1 book, is unlikely to help our model. 

In this notebook, I've dropped from each given row every word that appears in a book only once, and then experimented with dropping words that appeared in less than 0.5% of each chunk. 

In [1]:
import os
import json
import pandas as pd
from collections import defaultdict
from rapidfuzz import fuzz
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
df = pd.read_csv('../CSVs/8_filtered_genres.csv',index_col=0).drop('index',axis=1)

In [3]:
df.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,year,genres,num final genres
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,2020.0,"['Science Fiction', 'Fantasy', 'Adult']",3
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,2017.0,['Science Fiction'],1
2,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,1940.0,"['Mystery', 'Crime', 'Classics']",3
3,Wildoak,C. C. Harrington,55602.0,74.34,6.92,3.04,1.16,1.87,2022.0,"['Historical Fiction', 'Young Adult']",2
4,The Holiday,T. M. Logan,101767.0,50.3,8.02,3.06,1.12,1.93,2019.0,"['Thriller', 'Mystery', 'Crime', 'Suspense']",4


In [32]:
def get_word_counts_from_json(author, title):
    file_path = f'../word-counts/{author}/{title}/word-counts.json'
    word_counts = read_json(file_path)
    # Remove words that only appear once
    pruned_word_counts = {word: number for word, number in word_counts.items() if number > 1}
    return pruned_word_counts

In [5]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

In [6]:
def process_directory_structure(root_dir):
    authors = set()
    books = defaultdict(set)
    #df = pd.DataFrame()


    for author in os.listdir(root_dir):
        if os.path.isdir(f'{root_dir}/{author}'):
            authors.add(author)
            author_dir = os.path.join(root_dir, author)

            for book in os.listdir(author_dir):
                book_dir = os.path.join(author_dir, book)
                books[author].add(book)

    return books

In [7]:
root_dir = '../word-counts/'
books = process_directory_structure(root_dir)

In [8]:
def add_data_to_dataframe(df, author, book, word_counts):
    word_counts['Author'] = author
    word_counts['Book'] = book
    df = pd.concat(word_counts, ignore_index=True)
    return df

In [9]:
def clean(name):
    to_remove = [':',',','’','?','/']
    for char in to_remove:
        name = name.replace(char, '')
    name = name.lower().replace(' ','-').replace('&','and').replace('.-','-').replace('.','-').strip('-')
    return name

In [10]:
def get_wordcounts(orig_title, orig_author, correct_books, threshold=80):
    # Reformat title and author's name to match folder names
    title = clean(orig_title)
    author = clean(orig_author)
    correct_books_copy = correct_books.copy()
    
    
    # Check if the author's name is in the list of folders at the author level
    if author in correct_books_copy.keys():
        # If found, check the author's folder for the title folder. 
        if title in correct_books[author]:
            return get_word_counts_from_json(author,title)
        else: 
            # Check for title names that are close to the cleaned title name
            for correct_title in correct_books[author]:
                if fuzz.ratio(title, correct_title) > threshold:
                    return get_wordcounts(correct_title, author, correct_books, threshold)
        return -1
    else: 
        # If the exact author's name wasn't found, check for extremely similar authors' names. 
        for correct_author in correct_books_copy.keys():
            if fuzz.ratio(author, correct_author) > 95:
                return get_wordcounts(title, correct_author, correct_books, threshold)
        # If that doesn't work, check for moderately similar authors' names. 
        for correct_author in correct_books_copy.keys():
            if fuzz.ratio(author, correct_author) > threshold:
                return get_wordcounts(title, correct_author, correct_books, threshold)
        return -1

In [35]:
# Some of the columns have names that could be words in books. Better make each column name two words.
df.columns = ['book title', 'book author', 'total words', 'vividness score', 'passive voice',
       'all adverbs', 'ly-adverbs', 'non-ly-adverbs', 'publication year', 'book genres',
       'num genres']

In [37]:
df_chunk = df[:1000]

In [38]:
test_df = df_chunk.join(df_chunk.progress_apply\
                        (lambda x: pd.Series(get_wordcounts(x['book title'], x['book author'], books)), axis=1))


  0%|          | 0/1000 [00:00<?, ?it/s]

In [57]:
test_df.shape

(1000, 162978)

It appears that dropping words that appear only once in each book has taken the number of features for the first 1000 books from 394352 to only 162978. This means that nearly 60% of the words in each book, on average, appear only once. 

Next, let's try removing words that appear in less than 0.1%, 0.2% ... 0.5% of books in the chunk of 1000 books.

We'll take a look at the kind of words we're excluding, and the number of words that are left.

In [99]:
smaller_dfs = [test_df]

In [100]:
for i in range(2,6):
    smaller_dfs.append(test_df.dropna(axis=1,thresh=i))

In [104]:
for i in range(5):
    print(f"A list of words that appear at least {i+1} times includes {len(smaller_dfs[i].columns)-10} words.")
    print(f"The first 10 words are {list(smaller_dfs[i].columns[11:21])}")
    print(f"The last 10 words are {list(smaller_dfs[i].columns[-10:])}\n\n")

A list of words that appear at least 1 times includes 162968 words.
The first 10 words are ['0', '0-10', '0-2-1-7', '0-3', '0-99', '00', '000', '000-Year', '000-a-year', '000-acre']
The last 10 words are ["ó'cuinn", 'óig', 'ôi', 'ösana', 'ù', 'ü', 'über', 'übersecret', 'Ōfuna', 'Ōtsuka']


A list of words that appear at least 2 times includes 76563 words.
The first 10 words are ['0', '00', '000', '000-acre', '000-foot', '000-mile', '000-pound', '000-ton', '002', '007']
The last 10 words are ['è', 'é', 'éclair', 'écus', 'émigré', 'émigrés', 'était', 'étoile', 'êtes', 'über']


A list of words that appear at least 3 times includes 59206 words.
The first 10 words are ['0', '00', '000', '01', '02', '03', '04', '0400', '045', '05']
The last 10 words are ['École', 'Émile', 'Île', 'à', 'ça', 'è', 'éclair', 'émigré', 'émigrés', 'êtes']


A list of words that appear at least 4 times includes 50531 words.
The first 10 words are ['0', '00', '000', '01', '02', '03', '04', '05', '06', '0600']
The l

It appears that nothing of likely predictive value is being lost, and we've reduced the number of features by about a factor of 10. Time to create the complete dataframe!

In [105]:
df_chunks = []
for i in range(20):
    df_chunks.append(df.iloc[1000*i:1000*(i+1)])
df_chunks.append(df.iloc[20000:len(df)])

In [108]:
df_chunks[20]

Unnamed: 0,book title,book author,total words,vividness score,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,publication year,book genres,num genres
20000,Saint Death's Daughter,C. S. E. Cooney,193320.0,64.23,6.36,3.28,1.24,2.04,2022.0,"['Fantasy', 'Young Adult', 'Adult']",3
20001,Resonant Abyss,Chaney Hopper,90352.0,43.71,7.04,3.04,0.89,2.14,2019.0,['Science Fiction'],1
20002,Creation,Gore Vidal,234303.0,35.99,8.73,3.68,1.35,2.32,1981.0,"['Historical Fiction', 'Historical', 'Classics']",3
20003,Dark Island,Matt James,77153.0,50.86,7.38,3.40,1.51,1.88,2018.0,"['Thriller', 'Adventure', 'Horror', 'Science F...",4
20004,Sword of Shadows,Jeri Westerson,83121.0,48.51,7.96,2.89,0.96,1.93,2020.0,"['Mystery', 'Historical Fiction', 'Historical']",3
...,...,...,...,...,...,...,...,...,...,...,...
20485,Blessing in Disguise,Danielle Steel,82682.0,25.47,10.01,3.24,1.03,2.22,2019.0,['Romance'],1
20486,Moth,James Sallis,56267.0,52.27,7.19,3.37,1.10,2.27,1993.0,"['Mystery', 'Crime', 'Thriller']",3
20487,True Blue,Jane Smiley,73746.0,50.61,8.61,3.21,0.56,2.65,2011.0,"['Young Adult', 'Historical Fiction']",2
20488,Cage of Glass,Genevieve Crownson,70143.0,49.93,7.62,3.11,1.23,1.88,,"['Young Adult', 'Science Fiction']",2


In [109]:
complete_chunks = []

In [110]:
for chunk in df_chunks[len(complete_chunks):len(df_chunks)]:
    pruned_chunk = chunk.join(chunk.progress_apply(lambda x: 
                                                   pd.Series(get_wordcounts(x['book title'], x['book author'], 
                                                                            books)), axis=1)).dropna(axis=1,thresh=5)
    complete_chunks.append(pruned_chunk)
    
    

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/490 [00:00<?, ?it/s]

In [111]:
for chunk in complete_chunks:
    print(chunk.shape)

(1000, 45023)
(1000, 45217)
(1000, 42853)
(1000, 42734)
(1000, 43667)
(1000, 44051)
(1000, 42776)
(1000, 43373)
(1000, 44100)
(1000, 43991)
(1000, 44139)
(1000, 44299)
(1000, 44267)
(1000, 43423)
(1000, 43539)
(1000, 44323)
(1000, 44273)
(1000, 44390)
(1000, 43432)
(1000, 39630)
(490, 29334)


In [112]:
complete_df = pd.concat(complete_chunks)

MemoryError: Unable to allocate 160. KiB for an array with shape (1, 20490) and data type float64

Still too big! Time to switch to Dask! 

Managed to save them as csvs before the memory error.