## Answer to Research Question 4

In [None]:
# Mount Google Drive so the data files stored there are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the HW2 folder on the mounted drive.
%cd /content/drive/MyDrive/HW2
# Paths of the JSON files read by the chunked loaders further down.
# NOTE(review): these paths use 'My Drive' while the %cd above uses 'MyDrive' — confirm both resolve to the same folder.
authors_set = '/content/drive/My Drive/HW2/lighter_authors.json'
books_set = '/content/drive/My Drive/HW2/lighter_books.json'


Mounted at /content/drive
/content/drive/MyDrive/HW2


In [None]:
import pandas as pd
import json
import numpy as np

**Chunking and loading the authors dataset**

In [None]:
# Number of JSON lines parsed per chunk
chunk_size = 200000
# Columns needed from the authors file; everything else is discarded per chunk
author_columns = ["average_rating","ratings_count","id","name","work_ids","works_count"]
# Collect the trimmed chunks in a list and concatenate once at the end:
# concatenating inside the loop re-copies the accumulated frame on every pass.
author_chunks = [
    chunk[author_columns]
    for chunk in pd.read_json(authors_set, lines=True, chunksize=chunk_size)
]
# pd.concat raises on an empty list, so fall back to an empty frame in that case
authors = pd.concat(author_chunks, ignore_index=True) if author_chunks else pd.DataFrame()
# Release the per-chunk frames; only the combined frame is used from here on
del author_chunks

In [None]:
# Preview the first five rows to sanity-check the selected columns.
authors.head(5)

Unnamed: 0,average_rating,ratings_count,id,name,work_ids,works_count
0,4.19,2862064,4,Douglas Adams,"[3078186, 135328, 1877624, 74123, 3078120, 104...",106
1,4.02,1417316,7,Bill Bryson,"[613469, 2305997, 940892, 2611786, 7800569, 31...",75
2,4.53,56159,10,Jude Fisher,"[17150, 808427, 20487307, 90550, 25460625, 171...",14
3,3.79,3302,12,James Hamilton-Paterson,"[4417, 14300808, 14780, 3796968, 44703121, 103...",45
4,3.6,7979,14,Mark Watson,"[13330815, 19109351, 42306244, 72694240, 26291...",61


In [None]:
# Work on an independent copy: plain assignment would only alias `authors`,
# so the in-place cleaning below would also alter the raw frame that is
# later used as the pre-cleaning baseline.
authors_df = authors.copy()

**Data cleaning for authors dataset**

In [None]:
# Drop, in place, every row that is missing a value in any of the listed columns.
authors_df.dropna(subset=["average_rating","ratings_count","id","name","work_ids","works_count"], inplace=True)

In [None]:
# Trim leading/trailing whitespace so whitespace-only names become empty strings.
authors_df['name'] = authors_df['name'].str.strip()

In [None]:
# Discard the rows whose name is an empty string.
has_name = authors_df['name'].ne("")
authors_df = authors_df[has_name]

In [None]:
# Display the rows carrying a negative average rating or a negative ratings count.
has_negative_value = authors_df['average_rating'].lt(0) | authors_df['ratings_count'].lt(0)
authors_df[has_negative_value]

Unnamed: 0,average_rating,ratings_count,id,name,work_ids,works_count
253327,-31.0,-2,7159944,cloudyjenn,"[42967430, 42991072, 25516741, 42954314, 25756...",22
337953,2.98,-41,15649273,Iincho,"[52260174, 52912736, 52260220, 70529262, 52260...",13
338249,2.33,-3,15679447,Sein Anji,"[52359028, 52358835]",2
343284,3.0,-5,16050319,Moonlit Stardust,[53488797],1


In [None]:
# Keep only the rows where both the average rating and the ratings count are non-negative.
is_non_negative = authors_df['average_rating'].ge(0) & authors_df['ratings_count'].ge(0)
authors_df = authors_df[is_non_negative]

In [None]:
# Keep only authors whose number of listed work ids agrees with works_count.
# Mapping len over the single column replaces the row-wise apply(axis=1),
# which builds a Series object per row, and states the condition positively.
listed_work_counts = authors_df['work_ids'].map(len)
authors_df = authors_df.loc[listed_work_counts == authors_df['works_count']]

In [None]:
# Compare total row counts. Series.count() ignores NaN, so using it on the raw
# frame would leave out rows that were removed for a missing average_rating.
print("Number of Junk Rows cleaned: "+ str(len(authors) - len(authors_df)))

Number of Junk Rows cleaned: 448


**Loading the Books Dataset with chunking**

In [None]:
# Number of JSON lines parsed per chunk
chunk_size = 200000
# Columns needed from the books file; everything else is discarded per chunk
book_columns = ["author_id","title"]
# Collect the trimmed chunks in a list and concatenate once at the end:
# concatenating inside the loop re-copies the accumulated frame on every pass.
book_chunks = [
    chunk[book_columns]
    for chunk in pd.read_json(books_set, lines=True, chunksize=chunk_size)
]
# pd.concat raises on an empty list, so fall back to an empty frame in that case
books = pd.concat(book_chunks, ignore_index=True) if book_chunks else pd.DataFrame()
# Release the per-chunk frames; only the combined frame is used from here on
del book_chunks

In [None]:
# Preview the first five rows to sanity-check the selected columns.
books.head(5)

Unnamed: 0,author_id,title
0,1077326,Harry Potter and the Order of the Phoenix (Har...
1,1077326,Harry Potter and the Sorcerer's Stone (Harry P...
2,1077326,Harry Potter and the Chamber of Secrets (Harry...
3,1077326,Harry Potter and the Prisoner of Azkaban (Harr...
4,1077326,Harry Potter and the Goblet of Fire (Harry Pot...


In [None]:
# Work on an independent copy: plain assignment would only alias `books`,
# so the in-place cleaning below would also alter the raw frame that is
# later used as the pre-cleaning baseline.
books_df = books.copy()

**Data cleaning process for Books Dataset**

In [None]:
# Drop, in place, every row that is missing an author id or a title.
books_df.dropna(subset=['author_id', 'title'], inplace=True)

In [None]:
# Trim leading/trailing whitespace so whitespace-only titles become empty strings.
books_df['title'] = books_df['title'].str.strip()

In [None]:
# Compare total row counts. Series.count() ignores NaN, so using it on the raw
# frame would leave out rows that were removed for a missing author_id.
print("Number of Junk Rows cleaned: "+ str(len(books) - len(books_df)))

Number of Junk Rows cleaned: 0


**Check for Eponymous Authors**

In [None]:
# Mark every row whose name occurs more than once (keep=False flags all occurrences).
name_is_shared = authors_df['name'].duplicated(keep=False)
eponymous_authors = authors_df[name_is_shared]
# Only report when at least one shared name exists.
if not eponymous_authors.empty:
    print("Eponymous authors found: ")
    print(eponymous_authors['name'])

Eponymous authors found: 
1569             Peter King
4792            David Yates
6414            Paul Graham
7193          Peter  Davies
8639        Peter  Marshall
                ...        
314855        Dimitar Dimov
315853    James C.L. Carson
319671        Erin  Bedford
337525              Cicerón
345485        Erin  Bedford
Name: name, Length: 74, dtype: object


**Function to get author books**

In [None]:
def get_author_books(author_ids, authors_frame=None, books_frame=None):
    """Map each requested author's name to the list of their book titles.

    Parameters
    ----------
    author_ids : iterable
        Author ids to look up. Repeated ids are processed once.
    authors_frame : pandas.DataFrame, optional
        Frame with 'id' and 'name' columns. Defaults to the notebook-level
        ``authors_df``.
    books_frame : pandas.DataFrame, optional
        Frame with 'author_id' and 'title' columns. Defaults to the
        notebook-level ``books_df``.

    Returns
    -------
    dict
        ``{author name: [titles]}`` in the order the ids were given. An author
        without any book gets an empty list. Distinct ids that share the same
        name have their titles merged under that name instead of the later
        author silently replacing the earlier one.

    Raises
    ------
    IndexError
        If an id is not present in ``authors_frame``.
    """
    if authors_frame is None:
        authors_frame = authors_df
    if books_frame is None:
        books_frame = books_df

    # Preserve the caller's order while dropping repeated ids.
    unique_ids = list(dict.fromkeys(author_ids))
    # Narrow both frames once so the per-author lookups only scan relevant rows.
    candidate_authors = authors_frame[authors_frame['id'].isin(unique_ids)]
    candidate_books = books_frame[books_frame['author_id'].isin(unique_ids)]

    author_books = {}
    for author_id in unique_ids:
        names = candidate_authors.loc[candidate_authors['id'] == author_id, 'name']
        if names.empty:
            raise IndexError(f"author id {author_id!r} not found in the authors data")
        titles = candidate_books.loc[candidate_books['author_id'] == author_id, 'title'].tolist()
        # setdefault/extend keeps the titles of same-named authors together.
        author_books.setdefault(names.iloc[0], []).extend(titles)
    return author_books

**Top 20 authors by average rating**

In [None]:
# Select the 20 rows with the highest average rating and keep their ids for the book lookup.
top_20_authors = authors_df.nlargest(n=20, columns='average_rating')
top_20_author_ids = top_20_authors['id'].to_list()

*Get books of the top 20 authors*

In [None]:
# Build the {author name: [titles]} mapping for the selected author ids.
top_20_author_books = get_author_books(top_20_author_ids)

*Find the longest book title among the books of the top 20 authors*


In [None]:
# Consider every title of every top-20 author. Taking max(..., key=len) over the
# per-author lists first would select the author with the MOST books and only
# then the longest title within that one list.
top_20_titles = [title for titles in top_20_author_books.values() for title in titles]
# default="" keeps the cell from raising when none of the authors has a book.
longest_title = max(top_20_titles, key=len, default="")

*Find the overall longest book title*

In [None]:
# Title with the greatest character count across all books in books_df.
overall_longest_title = max(books_df['title'], key=len)

*Find the shortest book title overall*

In [None]:
# Title with the smallest character count across all books in books_df
# (an empty-string title, if present, wins here).
shortest_title = min(books_df['title'], key=len)

In [None]:
print("Top 20 authors' books:")
# The loop variable is named `titles`, not `books`: at notebook level a loop
# variable called `books` would overwrite the `books` DataFrame loaded earlier.
for author, titles in top_20_author_books.items():
    print(f"{author}: {titles}")

Top 20 authors' books:
James T. Holmes: []
Georges Wellers: ['Un Juif Sous Vichy', 'From Drancy to Auschwitz']
Jessie (Pierce) Trebesch: []
Staci Mauney: []
D.S.  Brown: []
George  Johnston: []
Patience Normoyle: []
Robert Sidney: ['The Poems of Robert Sidney', 'The Poems of Robert Sidney', 'The Poems of Robert Sidney']
Christine K. Fields: []
Ondeane Lourens: []
Bonnie Kelso: []
Nikos Dimitriou: []
J.M. van Zuiden: []
Archimandrite Gabriel: []
Giada Nizzoli: []
Carlo de Incontrera: []
Ayush Ashish: []
Lavelle Carlson: ['Eek! I Hear a Squeak and the Scurrying of Little Feet [With Audio CD]', 'The Frog Who Could Not Croak: Phonemic Awareness Tale #4 (Phonemic Awareness Tales)']
Tiffany Post: []
Brenda DeMoss Lanz: []


In [None]:
# Report the longest title found among the top-20 authors' books.
print(f"Longest book title among the top 20 authors: {longest_title}")

Longest book title among the top 20 authors: The Poems of Robert Sidney


In [None]:
# Report whether the top-20 longest title coincides with the longest title overall.
titles_differ = longest_title != overall_longest_title
if titles_differ:
    print("The longest book title among the top 20 authors is not the same as the longest book title overall.")
else:
    print("The longest book title among the top 20 authors is the same as the longest book title overall.")

The longest book title among the top 20 authors is not the same as the longest book title overall.


In [None]:
# Report the longest title across the whole books dataset.
print(f"Overall Longest book title is: {overall_longest_title}")

Overall Longest book title is: The New England Primer Issued Prior to 1830: A Bibliographical Checklist for the More Easy Attaining the True Knowledge of This Book, Embellished with a Hundred Cuts and Now Revised, Greatly Improved and Arranged in Two Alaphabets; With Preface, Introd...


In [None]:
# Report the shortest title across the whole books dataset.
print(f"Shortest book title overall: {shortest_title}")

Shortest book title overall: 


***Strangely, the shortest book title comes out blank. Let us remove the rows whose title is blank and check the result again.***

---



In [None]:
# Keep only the books whose title is not an empty string.
has_title = books_df['title'].ne('')
books_df_filtered = books_df[has_title]

In [None]:
# Shortest title once the empty-string titles have been filtered out.
shortest_title_checked = min(books_df_filtered['title'], key=len)

In [None]:
# Report the shortest non-blank title.
print(f"Shortest book title overall: {shortest_title_checked}")

Shortest book title overall: a


***Now the shortest title contains at least one letter, although it is still an odd title.***

---

