In [7]:
import pandas as pd
df = pd.read_csv('books.csv')
print(df.head(10))
print(df.tail(10))



   Identifier              Edition Statement  \
0         206                            NaN   
1         216                            NaN   
2         218                            NaN   
3         472                            NaN   
4         480   A new edition, revised, etc.   
5         481  Fourth edition, revised, etc.   
6         519                            NaN   
7         667                            NaN   
8         874                            NaN   
9        1143                            NaN   

                  Place of Publication Date of Publication  \
0                               London         1879 [1878]   
1             London; Virtue & Yorston                1868   
2                               London                1869   
3                               London                1851   
4                               London                1857   
5                               London                1875   
6                               Londo

In [11]:
list(df.columns)

['Identifier',
 'Edition Statement',
 'Place of Publication',
 'Date of Publication',
 'Publisher',
 'Title',
 'Author',
 'Contributors',
 'Corporate Author',
 'Corporate Contributors',
 'Former owner',
 'Engraver',
 'Issuance type',
 'Flickr URL',
 'Shelfmarks']

In [46]:
total_books = len(df)
print("Total number of books:", total_books)

Total number of books: 8287


In [44]:
print(df.isnull().sum())

Identifier                   0
Edition Statement         7514
Place of Publication         0
Date of Publication       1759
Publisher                 4195
Title                        0
Author                    1778
Contributors                 0
Corporate Author          8287
Corporate Contributors    8287
Former owner              8286
Engraver                  8287
Issuance type                0
Flickr URL                   0
Shelfmarks                   0
dtype: int64


In [25]:
df[['Identifier', 'Title', 'Author', 'Place of Publication', 'Date of Publication']] 

Unnamed: 0,Identifier,Title,Author,Place of Publication,Date of Publication
0,206,Walter Forbes. [A novel.] By A. A,A. A.,London,1879 [1878]
1,216,All for Greed. [A novel. The dedication signed...,"A., A. A.",London; Virtue & Yorston,1868
2,218,Love the Avenger. By the author of “All for Gr...,"A., A. A.",London,1869
3,472,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",London,1851
4,480,"[The World in which I live, and my place in it...","A., E. S.",London,1857
...,...,...,...,...,...
8282,4158088,"The Parochial History of Cornwall, founded on,...","GIDDY, afterwards GILBERT, Davies.",London,1838
8283,4158128,The History and Gazetteer of the County of Der...,"GLOVER, Stephen - of Derby",Derby,"1831, 32"
8284,4159563,Magna Britannia; being a concise topographical...,"LYSONS, Daniel - M.A., F.R.S., and LYSONS (Sam...",London,[1806]-22
8285,4159587,"An historical, topographical and descriptive v...","Mackenzie, E. (Eneas)",Newcastle upon Tyne,1834


In [61]:
import numpy as np

# Convert the data to numberic
df['Date of Publication'] = pd.to_numeric(df['Date of Publication'], errors='coerce')

# Add a new column to the df called Decade Column
df['Decade'] = (df['Date of Publication'] // 10) * 10

# Updated DataFrame
print("Updated DataFrame with Decade:")
print(df)


Updated DataFrame with Decade:
      Identifier             Edition Statement      Place of Publication  \
0            206                           nan                    London   
1            216                           nan  London; Virtue & Yorston   
2            218                           nan                    London   
3            472                           nan                    London   
4            480  A new edition, revised, etc.                    London   
...          ...                           ...                       ...   
8282     4158088                           nan                    London   
8283     4158128                           nan                     Derby   
8284     4159563                           nan                    London   
8285     4159587                           nan       Newcastle upon Tyne   
8286     4160339                           nan                    London   

      Date of Publication                Publisher  \
0 

In [49]:
history_books = df[df['Title'].str.contains('histor', case=False, na=False)][['Title', 'Author', 'Place of Publication', 'Date of Publication']]
print("Books on history (containing 'history' in the title):")
print(history_books)

Books on history (containing 'history' in the title):
                                                  Title  \
11    Erindringer som Bidrag til Norges Historie fra...   
18    [Historia geográfica, civil y politica de la ...   
22    [Abdollatiphi Historiæ Ægypti compendium. [Wit...   
23    The Comic History of England ... With ... colo...   
24    [The comic history of England ... With twenty ...   
...                                                 ...   
8280  The New Chronicles of England and France, in t...   
8281  Local Records; or, Historical Register of rema...   
8282  The Parochial History of Cornwall, founded on,...   
8283  The History and Gazetteer of the County of Der...   
8285  An historical, topographical and descriptive v...   

                                                 Author Place of Publication  \
11                                         AALL, Jacob.          Christiania   
18    ABBAD Y LASIERRA, Agustín Íñigo - Bishop of...          Puerto-Rico 

In [65]:
# Count the average number of books published by decade
# Group book by decades
book_count_by_decade = df.groupby('Decade')['Title'].count().reset_index()

 # rename the columns
book_count_by_decade.columns = ['Decade', 'Book Count']

# Calculate the average number of books published by decade
average_book_count = book_count_by_decade['Book Count'].mean()

# Display the results
print("\nBook Count by Decade:")
print(book_count_by_decade)

print(f"\nAverage number of books published per decade: {average_book_count:.2f}")


Book Count by Decade:
    Decade  Book Count
0   1540.0           1
1   1590.0           1
2   1600.0           4
3   1620.0           2
4   1630.0          13
5   1640.0           6
6   1650.0           7
7   1660.0          14
8   1670.0          24
9   1680.0          17
10  1690.0          30
11  1700.0          13
12  1710.0          12
13  1720.0          12
14  1730.0          30
15  1740.0          21
16  1750.0          28
17  1760.0          39
18  1770.0          63
19  1780.0          55
20  1790.0          79
21  1800.0         215
22  1810.0         283
23  1820.0         333
24  1830.0         291
25  1840.0         438
26  1850.0         659
27  1860.0         765
28  1870.0         835
29  1880.0         952
30  1890.0        1285
31  1910.0           1

Average number of books published per decade: 204.00


In [77]:
# Filter the rows where 'ValueColumn' > 90

# decades where the number of published books is over 90
filtered_decades = book_count_by_decade[book_count_by_decade['Book Count'] > 90]

# Display the filtered results
print("\nDecades with more than 90 published books:")
print(filtered_decades)




Decades with more than 90 published books:
    Decade  Book Count
21  1800.0         215
22  1810.0         283
23  1820.0         333
24  1830.0         291
25  1840.0         438
26  1850.0         659
27  1860.0         765
28  1870.0         835
29  1880.0         952
30  1890.0        1285


In [87]:
# Sort the filtered results in ascending order by 'Book Count'
sorted_filtered_decades = filtered_decades.sort_values(by='Book Count', ascending=True)

# Display the sorted filtered results
print("\nDecades with more than 90 published books (sorted in ascending order):")
print(sorted_filtered_decades)


Decades with more than 90 published books (sorted in ascending order):
    Decade  Book Count
21  1800.0         215
22  1810.0         283
24  1830.0         291
23  1820.0         333
25  1840.0         438
26  1850.0         659
27  1860.0         765
28  1870.0         835
29  1880.0         952
30  1890.0        1285


In [99]:
# Substitute NaN with 'M'
df.replace('nan', 'M', inplace=True)
print(df)

      Identifier             Edition Statement      Place of Publication  \
0            206                             M                    London   
1            216                             M  London; Virtue & Yorston   
2            218                             M                    London   
3            472                             M                    London   
4            480  A new edition, revised, etc.                    London   
...          ...                           ...                       ...   
8282     4158088                             M                    London   
8283     4158128                             M                     Derby   
8284     4159563                             M                    London   
8285     4159587                             M       Newcastle upon Tyne   
8286     4160339                             M                    London   

     Date of Publication                Publisher  \
0                      M         S

In [101]:
# Save the modified DataFrame to a CSV file
df.to_csv('modified_books.csv', index=False)

print("Modified DataFrame saved as 'modified_books.csv'.")

Modified DataFrame saved as 'modified_books.csv'.


In [103]:
# Zip the CSV file
import zipfile
with zipfile.ZipFile('modified_books.zip', 'w') as zipf:
    zipf.write('modified_books.csv')
