# Identifikation von Texten mit geteilter Autorenschaft

## Data Exploration: booksummaries

Basic data exploration of the booksummaries dataset.

In [175]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [176]:
# Loading the data
data = pd.read_csv(
    './datasets/booksummaries/booksummaries.txt', 
    header=None,
    sep='\t',
    names=['wiki_id', 'firebase_id', 'title', 'author', 'pub_date', 'genres', 'plot']
)

In [177]:
# Number of entries
data.shape

(16559, 7)

In [178]:
# Entries are unique books by ids...
print("Unique Wikipedia IDs:\t", data.wiki_id.unique().size)
print("Unique Firebase IDs:\t", data.firebase_id.unique().size)
# ...but not by title
print("Unique Titles:\t\t", data.title.unique().size)

Unique Wikipedia IDs:	 16559
Unique Firebase IDs:	 16559
Unique Titles:		 16277


In [179]:
# We have some missing values
data.isnull().sum()

wiki_id           0
firebase_id       0
title             0
author         2382
pub_date       5610
genres         3718
plot              0
dtype: int64

### Extracting Genres

In [180]:
# Subset original data
temp = data.loc[:, ['wiki_id', 'genres']]
# drop rows without genres
temp = temp.dropna()
# extract `id: genre` pairs to lists
temp.genres = temp.genres.str.replace('[{}"]', '', regex=True).str.split(', ')
# map each genre <=> book relation to a seperate row
genre_tags = []
for key, row in temp.iterrows():
    book_id = row[0]
    tags = pd.Series(row[1]).str.split(': ')
    for genre_id, genre_name in tags:
        genre_tags += [[book_id, genre_id, genre_name]]


genre_tags = pd.DataFrame(genre_tags)
genre_tags.columns = ['wiki_id', 'genre_id', 'genre_name']

In [181]:
# Extract unique genres
genres = genre_tags.loc[:,['genre_id', 'genre_name']].drop_duplicates()
genres.describe() # Note that there are more unique ids than names ("Mystery" genre)

Unnamed: 0,genre_id,genre_name
count,228,228
unique,228,227
top,/m/08322,Mystery
freq,1,2


In [182]:
# Top 10 Genres
genre_tags.genre_name.value_counts()[0:10]

Fiction                   4747
Speculative fiction       4314
Science Fiction           2870
Novel                     2463
Fantasy                   2413
Children's literature     2122
Mystery                   1396
Young adult literature     825
Suspense                   765
Crime Fiction              753
Name: genre_name, dtype: int64

In [186]:
def genres_for(wiki_id):
    return genre_tags.loc[genre_tags['wiki_id'] == wiki_id]

In [187]:
genres_for(data.wiki_id[0])

Unnamed: 0,wiki_id,genre_id,genre_name
0,620,/m/016lj8,Roman \u00e0 clef
1,620,/m/06nbt,Satire
2,620,/m/0dwly,Children's literature
3,620,/m/014dfn,Speculative fiction
4,620,/m/02xlf,Fiction


### Authors

## Data Exploration: goodbooks-10k

Basic data exploration of the goodbooks-10k dataset.

In [189]:
# Loading the data
r = pd.read_csv( './datasets/goodbooks-10k/ratings.csv' )
tr = pd.read_csv( './datasets/goodbooks-10k/to_read.csv' )
b = pd.read_csv( './datasets/goodbooks-10k/books.csv' )

t = pd.read_csv( './datasets/goodbooks-10k/tags.csv' )
bt = pd.read_csv( './datasets/goodbooks-10k/book_tags.csv')



## Combining the datasets

In [203]:
merged_data = data.merge(b, left_on="title", right_on="original_title")
merged_data.shape

(2939, 30)