In [3]:
import pandas as pd
import numpy as np
import re

def ascii_check(string):
    """Return True when ``string`` can be encoded as ASCII, False otherwise.

    The check is done by attempting ``string.encode('ascii')``; a
    ``UnicodeEncodeError`` means at least one character is outside ASCII.
    Any other exception raised by the encode call is not caught.
    """
    try:
        string.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False
    
# Load every book, keep only the id and title columns, order the rows by
# book_id and discard rows that have a NaN in either remaining column.
books = (
    pd.read_csv('datasets/goodbooks-10k/books.csv')
    .filter(items=['book_id', 'original_title'])
    .sort_values('book_id')
    .dropna()
)

# Load the tags associated with each book.
books_tags = pd.read_csv('datasets/goodbooks-10k/book_tags.csv')

# Left join of the two tables (books.book_id == book_tags.goodreads_book_id),
# keeping only the book id, the title and the tag id.
joined_books_tags = books.merge(
    books_tags,
    how='left',
    left_on=['book_id'],
    right_on=['goodreads_book_id'],
).filter(items=['book_id', 'original_title', 'tag_id'])

# Load the tag table and filter it in two passes.
tags = pd.read_csv('datasets/goodbooks-10k/tags.csv')
# Pass 1: keep only tag names made of ASCII characters.
tags = tags[tags['tag_name'].map(ascii_check)]
# Pass 2: keep only tag names that contain no digit.
tags = tags[~tags['tag_name'].str.contains(r'\d')]

# Attach the tag names to the (book, tag id) rows. No key is given, so the
# merge joins on whichever columns the two frames have in common.
books_tags_names = joined_books_tags.merge(tags).filter(
    items=['book_id', 'original_title', 'tag_name']
)
# Cell output: the rows ordered by book_id (the sorted copy is not stored).
books_tags_names.sort_values('book_id')


Unnamed: 0,book_id,original_title,tag_name
0,1,Harry Potter and the Half-Blood Prince,to-read
79107,1,Harry Potter and the Half-Blood Prince,owned-books
13441,1,Harry Potter and the Half-Blood Prince,favorites
176160,1,Harry Potter and the Half-Blood Prince,british
256838,1,Harry Potter and the Half-Blood Prince,rereads
256914,1,Harry Potter and the Half-Blood Prince,contemporary
152571,1,Harry Potter and the Half-Blood Prince,audiobooks
151718,1,Harry Potter and the Half-Blood Prince,middle-grade
87867,1,Harry Potter and the Half-Blood Prince,re-read
77840,1,Harry Potter and the Half-Blood Prince,childrens


In [5]:
# One-hot encode the tag names: one `tag_<name>` indicator column per distinct
# value of `tag_name`, with one row per row of `books_tags_names`.
# sparse=True asks pandas to back the indicator columns with sparse arrays.
df = pd.get_dummies(
    books_tags_names['tag_name'],
    prefix='tag',
    sparse=True,
)
# Cell output: the encoded frame.
df

Unnamed: 0,tag_-,tag_--available-at-raspberrys--,tag_-calif--,tag_-d-c--,tag_-dean,tag_-england-,tag_-fiction,tag_-fictional,tag_-fictitious,tag_-football-,...,tag_ztemp,tag_ztf-tolkien,tag_zts-uk,tag_zuckerberg,tag_zuhause,tag_zweig,tag_zz-ol-cb-read,tag_zz-to-read-fantasy,tag_zzz-to-read-graphic-novels,tag_zzz-to-read-retell-adapt-etc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Build the per-row feature table: the book_id and original_title columns of
# `books_tags_names` followed by the one-hot tag columns held in `df`.
#
# The text column `tag_name` is removed from `books_tags_names` *before* the
# concat instead of being dropped from the combined frame afterwards.
# get_dummies(prefix='tag') labels its columns `tag_<value>`, so a tag that is
# literally called "name" produces an indicator column also labelled
# `tag_name`; a label-based drop on the combined frame would silently discard
# that indicator together with the text column.
books_with_tags_encodings = pd.concat(
    [books_tags_names.drop(columns=['tag_name']), df],
    axis=1,
)

In [10]:
# Load the ratings table.
ratings = pd.read_csv('datasets/goodbooks-10k/ratings.csv')

# Inner-join the encoded book/tag rows with the ratings. No key is given, so
# pandas joins on every column name the two frames share.
# NOTE(review): presumably the only shared column is `book_id` -- confirm, and
# confirm that ratings.csv uses the same id space for `book_id` as books.csv.
books_tags_ratings = books_with_tags_encodings.merge(ratings)

Unnamed: 0,book_id,original_title,tag_-,tag_--available-at-raspberrys--,tag_-calif--,tag_-d-c--,tag_-dean,tag_-england-,tag_-fiction,tag_-fictional,...,tag_ztemp,tag_ztf-tolkien,tag_zts-uk,tag_zuckerberg,tag_zuhause,tag_zweig,tag_zz-ol-cb-read,tag_zz-to-read-fantasy,tag_zzz-to-read-graphic-novels,tag_zzz-to-read-retell-adapt-etc
0,1,Harry Potter and the Half-Blood Prince,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Harry Potter and the Order of the Phoenix,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Harry Potter and the Philosopher's Stone,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,Harry Potter and the Prisoner of Azkaban,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,Harry Potter and the Goblet of Fire,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,10,"Harry Potter Collection (Harry Potter, #1-6)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,11,The Hitchhiker's Guide to the Galaxy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,13,The Ultimate Hitchhiker's Guide: Five Complete...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,21,A Short History of Nearly Everything,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,24,Down Under,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Cell output: show the merged frame built from the tag encodings and ratings.
books_tags_ratings