# HarperDB Book Recommender CSV Cleaner

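This notebook cleans the original dataset from [Kaggle](https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset) to make it more useful for this project: it reads the raw `Users`, `Books`, and `Ratings` CSV files and writes simplified `users.csv`, `books.csv`, and `ratings.csv` files, merging books that share a title and author under a single ISBN.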

In [None]:
import csv
# Root data folder. The cells below read the raw CSVs from the "original"
# sub-folder and write the cleaned CSVs into the "clean" sub-folder.
base_dir = '/home/ubuntu/data'
original_dir, clean_dir = (
    '{}/{}'.format(base_dir, sub_folder) for sub_folder in ('original', 'clean')
)

In [None]:
# Show the first line of each raw CSV so the available columns can be inspected.
objectNames = ['Users', 'Books', 'Ratings']
for objectName in objectNames:
    raw_path = '{}/{}.csv'.format(original_dir, objectName)
    print(objectName)
    with open(raw_path) as raw_file:
        first_line = raw_file.readline()
    # readline() keeps the trailing newline, so this prints an extra blank line.
    print(first_line)
    print()

In [None]:
# Clean Users.csv -> users.csv with the columns user_id, location, age.
#   * user_id is converted to int.
#   * location keeps only the last comma-separated component, stripped and
#     lower-cased.
#   * age is truncated to an int; a blank age is written as 0.
objectName = 'Users'
# newline='' is what the csv module requires for files handed to csv.reader /
# csv.writer, so that line endings (including those inside quoted fields) are
# handled by the csv module rather than translated by the file object.
with open('{}/{}.csv'.format(clean_dir, objectName.lower()), 'w', newline='') as out_file:
    with open('{}/{}.csv'.format(original_dir, objectName), newline='') as in_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        row = ['user_id', 'location', 'age']
        writer.writerow(row)
        next(reader, None)  # skip the original header; tolerate an empty file
        c = 0  # running count of user rows written
        for line in reader:
            user_id = int(line[0])
            location = line[1].split(',')[-1].strip().lower()
            # The age column may be blank or float-formatted (hence int(float(...))).
            age = int(float(line[2])) if len(line[2]) else 0
            row = [user_id, location, age]
            writer.writerow(row)
            c += 1


In [None]:
# In-memory dry run of the Books de-duplication; nothing is written to disk.
# Each book is keyed by title + author; only the first row seen for a given key
# is kept. The result is:
#   book_title_isbn : title+author -> ISBN of the first row with that key
#   books           : [isbn, title, search, author, year, publisher] rows,
#                     one per unique title+author
# Rows that cannot be parsed (too few columns, non-integer year) are printed
# together with the error and skipped.
objectName = 'Books'
book_title_isbn = {}
books = []
# newline='' lets the csv module handle line endings inside quoted fields.
with open('{}/{}.csv'.format(original_dir, objectName), newline='') as in_file:
    reader = csv.reader(in_file)
    next(reader, None)  # skip the header; tolerate an empty file
    for line in reader:
        try:
            isbn = line[0]
            title = line[1]
            author = line[2]
            search = title.lower() + ' ' + author.lower()
            year = int(line[3])
            publisher = line[4]
        except (IndexError, ValueError) as exception:
            print(line)
            print(exception)
            continue
        title_author = title + author
        if title_author not in book_title_isbn:
            book_title_isbn[title_author] = isbn
            books.append([isbn, title, search, author, year, publisher])


In [None]:
# Clean Books.csv -> books.csv with the columns
# isbn, title, search, author, year, publisher.
# Books are de-duplicated on title + author: only the first row seen for a
# given key is written. Side products kept for later cells:
#   book_title_isbn : title+author -> ISBN of the row that was written
#   isbn_map        : every parsed ISBN -> the ISBN written for its title+author
#   isbns_all       : every ISBN read, in file order (appended before the rest
#                     of the row is validated)
# Rows that cannot be parsed (too few columns, non-integer year) are printed
# together with the error and skipped.
objectName = 'Books'
book_title_isbn = {}
isbn_map = {}
isbns_all = []
# newline='' is what the csv module requires for files handed to csv.reader /
# csv.writer.
with open('{}/{}.csv'.format(clean_dir, objectName.lower()), 'w', newline='') as out_file:
    with open('{}/{}.csv'.format(original_dir, objectName), newline='') as in_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        row = ['isbn', 'title', 'search', 'author', 'year', 'publisher']
        writer.writerow(row)
        next(reader, None)  # skip the original header; tolerate an empty file
        for line in reader:
            try:
                isbn = line[0]
                isbns_all.append(isbn)
                title = line[1]
                author = line[2]
                search = title.lower() + ' ' + author.lower()
                year = int(line[3])
                publisher = line[4]
            except (IndexError, ValueError) as exception:
                # Only malformed rows are skipped; I/O errors are left to surface.
                print(line)
                print(exception)
                continue
            title_author = title + author
            if title_author in book_title_isbn:
                # Duplicate edition: point this ISBN at the one already written.
                isbn_map[isbn] = book_title_isbn[title_author]
            else:
                # First time this title+author is seen: it maps to itself.
                book_title_isbn[title_author] = isbn
                isbn_map[isbn] = isbn
                row = [isbn, title, search, author, year, publisher]
                writer.writerow(row)


In [None]:
# Clean Ratings.csv -> ratings.csv with the columns user_id, isbn, rating.
# Every rating's ISBN is translated through isbn_map (built by the Books cell),
# so ratings of duplicate editions point at the single ISBN kept in books.csv.
#   n_good    : rows written
#   n_bad     : rows skipped (unknown ISBN or unparseable row)
#   not_isbns : the ISBNs that were not found in isbn_map
objectName = 'Ratings'
n_good = 0
n_bad = 0
not_isbns = []
# newline='' is what the csv module requires for files handed to csv.reader /
# csv.writer.
with open('{}/{}.csv'.format(clean_dir, objectName.lower()), 'w', newline='') as out_file:
    with open('{}/{}.csv'.format(original_dir, objectName), newline='') as in_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        row = ['user_id', 'isbn', 'rating']
        writer.writerow(row)
        next(reader, None)  # skip the original header; tolerate an empty file
        for line in reader:
            try:
                user_id = int(line[0])
                o_isbn = line[1]
                rating = int(line[2])
            except (IndexError, ValueError):
                # Malformed row: count it, but do not record an ISBN for it --
                # o_isbn may be missing or left over from the previous row.
                n_bad += 1
                continue
            isbn = isbn_map.get(o_isbn)
            if isbn is None:
                # Rating refers to a book that is not in the cleaned Books data.
                not_isbns.append(o_isbn)
                n_bad += 1
                continue
            row = [user_id, isbn, rating]
            writer.writerow(row)
            n_good += 1
print('n_good', n_good)
print('n_bad', n_bad)
