In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
# All three raw BX tables are parsed with the same settings: ';' delimiter,
# malformed rows skipped, and undecodable bytes replaced instead of raising.
_read_options = {
    'encoding': 'utf_8',
    'sep': ';',
    'on_bad_lines': 'skip',
    'low_memory': False,
    'encoding_errors': 'replace',
}
books = pd.read_csv('./BX-Books.csv', **_read_options)
ratings = pd.read_csv('./BX-Book-Ratings.csv', **_read_options)
users = pd.read_csv('./BX-Users.csv', **_read_options)

# Books

In [16]:
# Select every book whose title is not made up entirely of printable ASCII
# characters (not (ascii and printable) == not ascii or not printable).
books_wrong_recs = books.loc[
    books['Book-Title'].apply(lambda title: not (title.isascii() and title.isprintable()))
]
books_wrong_recs

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
177,3257203659,Der illustrierte Mann. Erz�?¤hlungen.,Ray Bradbury,2002,Diogenes,http://images.amazon.com/images/P/3257203659.0...,http://images.amazon.com/images/P/3257203659.0...,http://images.amazon.com/images/P/3257203659.0...
178,3257207522,Der K�?¶nig in Gelb.,Raymond Chandler,1980,Diogenes Verlag,http://images.amazon.com/images/P/3257207522.0...,http://images.amazon.com/images/P/3257207522.0...,http://images.amazon.com/images/P/3257207522.0...
180,3257208634,Die Mars- Chroniken. Roman in Erz�?¤hlungen.,Ray Bradbury,1981,Diogenes Verlag,http://images.amazon.com/images/P/3257208634.0...,http://images.amazon.com/images/P/3257208634.0...,http://images.amazon.com/images/P/3257208634.0...
181,3257208669,Das B�?¶se kommt auf leisen Sohlen.,Ray Bradbury,2003,Diogenes,http://images.amazon.com/images/P/3257208669.0...,http://images.amazon.com/images/P/3257208669.0...,http://images.amazon.com/images/P/3257208669.0...
182,3257210450,L�?¶wenzahnwein. Roman.,Ray Bradbury,1999,Diogenes Verlag,http://images.amazon.com/images/P/3257210450.0...,http://images.amazon.com/images/P/3257210450.0...,http://images.amazon.com/images/P/3257210450.0...
...,...,...,...,...,...,...,...,...
271330,3320016822,Urteil ohne Prozess: Margot Honecker gegen Oss...,Jörn Kalkbrenner,1990,Dietz,http://images.amazon.com/images/P/3320016822.0...,http://images.amazon.com/images/P/3320016822.0...,http://images.amazon.com/images/P/3320016822.0...
271331,3423200944,"Hokuspokus, liebe mich. Und f�?¼nfzig andere Z...",Helen Glisic,1997,Dtv,http://images.amazon.com/images/P/3423200944.0...,http://images.amazon.com/images/P/3423200944.0...,http://images.amazon.com/images/P/3423200944.0...
271334,3548740146,Wicca. Eine Einf�?¼hrung in wei�?�?e Magie.,Scott Cunningham,2001,"Ullstein TB-Vlg., B.",http://images.amazon.com/images/P/3548740146.0...,http://images.amazon.com/images/P/3548740146.0...,http://images.amazon.com/images/P/3548740146.0...
271335,381440176X,"Ein Fall f�?¼r TKKG, Bd.50, Sklaven f�?¼r Wutawia",Stefan Wolf,1989,Pelikan,http://images.amazon.com/images/P/381440176X.0...,http://images.amazon.com/images/P/381440176X.0...,http://images.amazon.com/images/P/381440176X.0...


In [17]:
# Remove the flagged titles. Rebinding `books` (rather than inplace=True) and
# passing errors='ignore' keeps this cell safe to re-run: labels that were
# already dropped are skipped instead of raising KeyError.
books = books.drop(index=books_wrong_recs.index, errors='ignore')

In [18]:
# Sanity check for the previous drop: this selection is expected to be empty.
title_still_bad = books['Book-Title'].apply(
    lambda title: not (title.isascii() and title.isprintable())
)
books[title_still_bad]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L


In [19]:
# Show rows whose publication year is not a purely numeric string.
non_numeric_year = books['Year-Of-Publication'].apply(lambda year: not year.isnumeric())
books[non_numeric_year]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,


In [20]:
# Repair rows that were parsed one column short. In those rows Book-Author
# holds the year, Year-Of-Publication holds the publisher, Publisher holds an
# image URL and Image-URL-L is empty, i.e. every field after Book-Title sits
# one column too far left. Swapping only author and year would leave the
# publisher name in Book-Author and a URL in Publisher, so the whole tail is
# shifted right instead.
#
# The assignment goes through books.loc[...] rather than
# books[col].replace(..., inplace=True): an in-place method on a column
# selection is a chained assignment, which pandas warns about and does not
# propagate to the parent frame under Copy-on-Write.
_tail_cols = ['Book-Author', 'Year-Of-Publication', 'Publisher',
              'Image-URL-S', 'Image-URL-M', 'Image-URL-L']

# Signature of a shifted row: a numeric "author" next to a non-numeric "year".
_author_is_year = books['Book-Author'].apply(lambda v: isinstance(v, str) and v.isnumeric())
_year_is_text = books['Year-Of-Publication'].apply(lambda v: not str(v).isnumeric())
_shifted = _author_is_year & _year_is_text

if _shifted.any():
    # Move Book-Author..Image-URL-M one column to the right. .to_numpy() drops
    # the column labels so the values are assigned by position, not by name.
    books.loc[_shifted, _tail_cols[1:]] = books.loc[_shifted, _tail_cols[:-1]].to_numpy()

    for _label in books.index[_shifted]:
        # NOTE(review): presumably the real author was swallowed into the title
        # as '<title>;<author>' (an unsplit separator would explain the shift);
        # verify against the raw file. Without a ';' the author is left missing.
        _title, _sep, _author = str(books.at[_label, 'Book-Title']).rpartition(';')
        if _sep:
            books.at[_label, 'Book-Title'] = _title.rstrip('\\"').strip()
            books.at[_label, 'Book-Author'] = _author.strip('" ')
        else:
            books.at[_label, 'Book-Author'] = np.nan

In [21]:
# Re-run the non-numeric year check; after the fix above it should list no rows.
books.loc[books['Year-Of-Publication'].apply(lambda year: not year.isnumeric())]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L


In [22]:
# Flag books whose publication year lies outside the accepted 1900-2005 range.
wrong_years = books.loc[
    books['Year-Of-Publication'].apply(lambda year: not 1900 <= int(year) <= 2005)
]
wrong_years

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
176,3150000335,Kabale Und Liebe,Schiller,0,"Philipp Reclam, Jun Verlag GmbH",http://images.amazon.com/images/P/3150000335.0...,http://images.amazon.com/images/P/3150000335.0...,http://images.amazon.com/images/P/3150000335.0...
188,342311360X,Die Liebe in Den Zelten,Gabriel Garcia Marquez,0,Deutscher Taschenbuch Verlag (DTV),http://images.amazon.com/images/P/342311360X.0...,http://images.amazon.com/images/P/342311360X.0...,http://images.amazon.com/images/P/342311360X.0...
288,0571197639,Poisonwood Bible Edition Uk,Barbara Kingsolver,0,Faber Faber Inc,http://images.amazon.com/images/P/0571197639.0...,http://images.amazon.com/images/P/0571197639.0...,http://images.amazon.com/images/P/0571197639.0...
351,3596214629,"Herr Der Fliegen (Fiction, Poetry and Drama)",Golding,0,Fischer Taschenbuch Verlag GmbH,http://images.amazon.com/images/P/3596214629.0...,http://images.amazon.com/images/P/3596214629.0...,http://images.amazon.com/images/P/3596214629.0...
542,8845229041,Biblioteca Universale Rizzoli: Sulla Sponda De...,P Coelho,0,Fabbri - RCS Libri,http://images.amazon.com/images/P/8845229041.0...,http://images.amazon.com/images/P/8845229041.0...,http://images.amazon.com/images/P/8845229041.0...
...,...,...,...,...,...,...,...,...
270794,014029953X,Foe (Essential.penguin S.),J.M. Coetzee,0,Penguin Books Ltd,http://images.amazon.com/images/P/014029953X.0...,http://images.amazon.com/images/P/014029953X.0...,http://images.amazon.com/images/P/014029953X.0...
270913,0340571187,Postmens House,Maggie Hemingway,0,Trafalgar Square,http://images.amazon.com/images/P/0340571187.0...,http://images.amazon.com/images/P/0340571187.0...,http://images.amazon.com/images/P/0340571187.0...
271094,8427201079,El Misterio De Sittaford,Agatha Christie,0,Editorial Molino,http://images.amazon.com/images/P/8427201079.0...,http://images.amazon.com/images/P/8427201079.0...,http://images.amazon.com/images/P/8427201079.0...
271182,0887781721,Tom Penny,Tony German,0,P. Martin Associates,http://images.amazon.com/images/P/0887781721.0...,http://images.amazon.com/images/P/0887781721.0...,http://images.amazon.com/images/P/0887781721.0...


In [23]:
# Remove the rows with out-of-range years. Rebinding `books` with
# errors='ignore' makes the cell re-runnable: labels that are already gone
# are skipped instead of raising KeyError.
books = books.drop(index=wrong_years.index, errors='ignore')

In [24]:
# Sanity check: no remaining year should fall outside 1900-2005.
year_out_of_range = books['Year-Of-Publication'].apply(
    lambda year: not 1900 <= int(year) <= 2005
)
books[year_out_of_range]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L


In [25]:
# Save the cleaned books table: ';'-separated, UTF-8, without the index column.
books.to_csv(
    'BX-Books-cleaned.csv',
    sep=';',
    encoding='utf_8',
    index=False,
)

# Ratings

In [27]:
# Select ratings whose ISBN is not made up entirely of printable ASCII characters.
ratings_wrong_recs = ratings.loc[
    ratings['ISBN'].apply(lambda isbn: not (isbn.isascii() and isbn.isprintable()))
]
ratings_wrong_recs

Unnamed: 0,User-ID,ISBN,Book-Rating
58208,11676,8475560806�,6
124045,28537,349800924�,8
164558,36369,8475560806�,7
235644,54828,393�704064,10
252687,57850,225315398�,3
292012,69558,7544223434/I�,8
353525,85250,�423350229,0
357163,86103,�3499128624,8
358139,86219,349800924�,8
359006,86337,L�BBE01720/2,0


In [28]:
# Remove the ratings with garbled ISBNs. Rebinding `ratings` with
# errors='ignore' makes the cell re-runnable: labels that are already gone
# are skipped instead of raising KeyError.
ratings = ratings.drop(index=ratings_wrong_recs.index, errors='ignore')

In [29]:
# Sanity check for the previous drop: this selection is expected to be empty.
isbn_still_bad = ratings['ISBN'].apply(
    lambda isbn: not (isbn.isascii() and isbn.isprintable())
)
ratings[isbn_still_bad]

Unnamed: 0,User-ID,ISBN,Book-Rating


In [30]:
# Save the cleaned ratings table: ';'-separated, UTF-8, without the index column.
ratings.to_csv(
    'BX-Book-Ratings-cleaned.csv',
    sep=';',
    encoding='utf_8',
    index=False,
)

# Users

In [32]:
# Select users whose location is not made up entirely of printable ASCII characters.
users_wrong_recs = users.loc[
    users['Location'].apply(lambda place: not (place.isascii() and place.isprintable()))
]
users_wrong_recs

Unnamed: 0,User-ID,Location,Age
42,43,"m�xico, m�xico city, distrito federal",
170,171,"ita�, s�o paulo, brazil",29.0
181,182,"buitenpost, frysl�n, netherlands",36.0
250,251,"valladolid, castilla y le�n, spain",54.0
267,268,"�rhus, \n/a\"", denmark""",
...,...,...,...
278746,278747,"sundsvall, v�sternorrland, sweden",20.0
278777,278778,"k�ln, nordrhein-westfalen, germany",42.0
278810,278811,"����, ����, china",19.0
278844,278845,"j�rvenp��, uusimaa, finland",


In [33]:
# Remove the users with garbled locations. Rebinding `users` with
# errors='ignore' makes the cell re-runnable: labels that are already gone
# are skipped instead of raising KeyError.
users = users.drop(index=users_wrong_recs.index, errors='ignore')

In [34]:
# Sanity check for the previous drop: this selection is expected to be empty.
location_still_bad = users['Location'].apply(
    lambda place: not (place.isascii() and place.isprintable())
)
users[location_still_bad]

Unnamed: 0,User-ID,Location,Age


In [35]:
# Save the cleaned users table: ';'-separated, UTF-8, without the index column.
users.to_csv(
    'BX-Users-cleaned.csv',
    sep=';',
    encoding='utf_8',
    index=False,
)