# Imports

In [158]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset
from tqdm.auto import tqdm
from datetime import datetime
import re
tqdm.pandas()

# Data Cleaning

Clean the data and remove extra columns.

## 1

### Authors

In [2]:
authors = pd.read_csv("1/authors.csv")
authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654021 entries, 0 to 654020
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   author_id    654021 non-null  int64 
 1   author_name  654019 non-null  object
dtypes: int64(1), object(1)
memory usage: 10.0+ MB


`author_id` is sequential.

In [7]:
authors.sort_values("author_id", inplace=True)
authors.reset_index(drop=True, inplace=True)

In [43]:
authors.to_csv("1_clean/authors.csv", index=False)

### Categories

In [21]:
categories = pd.read_csv("1/categories.csv")
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2775 entries, 0 to 2774
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   category_id    2775 non-null   int64 
 1   category_name  2775 non-null   object
dtypes: int64(1), object(1)
memory usage: 43.5+ KB


In [22]:
categories["category_id"].min(), categories["category_id"].max()

(2, 3390)

Nothing to change.

In [24]:
categories.to_csv("1_clean/categories.csv", index=False)

### Dataset

In [25]:
dataset = load_dataset("csv", data_files="1/dataset.csv", delimiter=",")

Found cached dataset csv (C:/Users/harik/.cache/huggingface/datasets/csv/default-b1d2ec91dd99d993/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
dataset["train"].column_names

['authors',
 'bestsellers-rank',
 'categories',
 'description',
 'dimension-x',
 'dimension-y',
 'dimension-z',
 'edition',
 'edition-statement',
 'for-ages',
 'format',
 'id',
 'illustrations-note',
 'image-checksum',
 'image-path',
 'image-url',
 'imprint',
 'index-date',
 'isbn10',
 'isbn13',
 'lang',
 'publication-date',
 'publication-place',
 'rating-avg',
 'rating-count',
 'title',
 'url',
 'weight']

In [30]:
columns_take = [
    "authors",
    "categories",
    # "isbn13",
    "isbn10",
    "title",
    "lang",
    "publication-date",
    "format",
]
all_columns = dataset["train"].column_names
columns_drop = [column for column in all_columns if column not in columns_take]
dataset["train"] = dataset["train"].remove_columns(columns_drop)

In [33]:
dataset["train"].to_parquet("1_clean/dataset.parquet")

Creating parquet from Arrow format:   0%|          | 0/1110 [00:00<?, ?ba/s]

167608449

In [38]:
dataset["train"][1000]

{'authors': '[1384, 1385]',
 'categories': '[253, 272, 314, 787, 3332, 833, 3378, 834, 3379]',
 'format': 2.0,
 'isbn10': '1847697909',
 'lang': 'en',
 'publication-date': '2012-09-15 00:00:00',
 'title': 'Researching Language Teacher Cognition and Practice : International Case Studies'}

### Formats

In [39]:
formats = pd.read_csv("1/formats.csv")
formats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   format_id    49 non-null     int64 
 1   format_name  49 non-null     object
dtypes: int64(1), object(1)
memory usage: 912.0+ bytes


In [40]:
formats["format_id"].min(), formats["format_id"].max()

(1, 49)

In [42]:
formats.sort_values("format_id", inplace=True)
formats.reset_index(drop=True, inplace=True)

In [44]:
formats.to_csv("1_clean/formats.csv", index=False)

## 2

### Books

```python
columns_take = [
    "authors",
    "categories",
    # "isbn13",
    "isbn10",
    "title",
    "lang",
    "publication-date",
    "format",
]
```

In [55]:
books1 = pd.read_csv("2/book1-100k.csv")
books1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58292 entries, 0 to 58291
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               58292 non-null  int64  
 1   Name             58292 non-null  object 
 2   RatingDist1      58292 non-null  object 
 3   pagesNumber      58292 non-null  int64  
 4   RatingDist4      58292 non-null  object 
 5   RatingDistTotal  58292 non-null  object 
 6   PublishMonth     58292 non-null  int64  
 7   PublishDay       58292 non-null  int64  
 8   Publisher        57798 non-null  object 
 9   CountsOfReview   58292 non-null  int64  
 10  PublishYear      58292 non-null  int64  
 11  Language         20294 non-null  object 
 12  Authors          58292 non-null  object 
 13  Rating           58292 non-null  float64
 14  RatingDist2      58292 non-null  object 
 15  RatingDist5      58292 non-null  object 
 16  ISBN             57746 non-null  object 
 17  RatingDist3 

In [142]:
def clean_rating(rating):
    if pd.isna(rating):
        return rating
    return int(rating.split(":")[-1])

def publish_date(row):
    month, day, year = row["PublishDay"], row["PublishMonth"], row["PublishYear"]
    try:
        date = datetime(year, month=month, day=day)
    except ValueError as e:
        if "year" in str(e) and "out of range" in str(e):
            return pd.NaT
        if "day" in str(e) and "out of range" in str(e):
            day = day - 1
        if "month must be" in str(e):
            month = 12
        try:
            date = datetime(year, month=month, day=day)
        except:
            return pd.NaT
    return date

rename_map = {
    "Id": "id",
    "Name": "title",
    "Authors": "authors",
    "pagesNumber": "pages",
    "PagesNumber": "pages",
    "Language": "language",
    "RatingDist1": "star_1",
    "RatingDist2": "star_2",
    "RatingDist3": "star_3",
    "RatingDist4": "star_4",
    "RatingDist5": "star_5",
    "Rating": "rating",
    "RatingDistTotal": "total_ratings",
    "CountsOfReview": "total_reviews",
    "ISBN": "isbn",
    "publication_date": "publication_date",
}

In [143]:
def preprocess_one(file_path, save = True):
    file_name = file_path.split("/")[-1]
    books = pd.read_csv(file_path)

    for i in range(1, 6):
        books[f"RatingDist{i}"] = books[f"RatingDist{i}"].apply(clean_rating)

    books["RatingDistTotal"] = books["RatingDistTotal"].apply(clean_rating)
    books["publication_date"] = books.apply(publish_date, axis = 1)
    
    books = books.rename(columns=rename_map)
    correct_order = list(rename_map.values())
    books = books[correct_order]
    if save:
        books.to_csv(f"2_clean/{file_name}", index=False)
        return None
    else:
        return books

In [144]:
books_path = os.listdir("2")
books_path = [f"2/{f}" for f in books_path if "book" in f]
len(books_path)

23

In [146]:
for file in tqdm(books_path[14:], desc="Preprocessing..."):
    preprocess_one(file)

Preprocessing...:   0%|          | 0/9 [00:00<?, ?it/s]

### Ratings

Not relavant.

## 3

### Books

In [147]:
books3 = pd.read_csv("3/Books.csv")
books3.info()

  books3 = pd.read_csv("3/Books.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [150]:
books3.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [152]:
rename_map = {
    "ISBN": "isbn",
    "Book-Title": "title",
    "Book-Author": "authors",
    "Year-Of-Publication": "publication_year"
}

books3 = books3.rename(columns=rename_map)
correct_order = list(rename_map.values())
books3 = books3[correct_order]

In [160]:
books3.sample(5)

Unnamed: 0,isbn,title,authors,publication_year
156156,316955116,City Boy,Herman Wouk,1992
135390,373121598,Bedroom Business (Presents Passion) (Harlequin...,Sandra Marton,2001
261976,789473968,Eyewitness: Everest (Eyewitness Books),Rebecca Stephens,2001
118566,333452984,Sphere,Michael Crichton,0
92031,345324145,From Doon with Death,Ruth Rendell,1985


In [185]:
def find_year(stamp):
    if pd.isna(stamp) or str(stamp) == '0':
        return 1000
    stamp = str(stamp)
    stamp = stamp.strip()
    # print(stamp)
    regex = re.compile(r"(\d{4})")
    match_ = regex.findall(stamp)
    if len(match_):
        return int(match_[0])
    return 1000

In [187]:
books3["publication_year"] = books3["publication_year"].apply(find_year)

In [189]:
books3.dtypes

isbn                object
title               object
authors             object
publication_year     int64
dtype: object

In [199]:
books3.to_csv("3_clean/Books.csv", index=False)

### Users

In [191]:
users3 = pd.read_csv("3/Users.csv")
users3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [192]:
users3.sample(10)

Unnamed: 0,User-ID,Location,Age
111123,111124,"stayner, ontario, canada",
129214,129215,"mafraq, mafraq, jordan",30.0
236025,236026,"marina del rey, california, usa",
262225,262226,"apt, vaucluse, france",62.0
261896,261897,"oakland, california, usa",24.0
175547,175548,"greensborough, victoria, australia",46.0
111265,111266,", ,",26.0
206737,206738,"basel, basel, switzerland",
131730,131731,"dortmund, nordrhein-westfalen, germany",
96742,96743,"baldwin, new york, usa",


In [193]:
users3["User-ID"].min(), users3["User-ID"].max()

(1, 278858)

`User-ID` is sequential.

In [196]:
users3.sort_values("User-ID", inplace=True)
rename_map = {
    "User-ID": "user_id",
    "Location": "location",
    "Age": "age",
}

users3 = users3.rename(columns=rename_map)
correct_order = list(rename_map.values())
users3 = users3[correct_order]

In [198]:
users3.to_csv("3_clean/Users.csv", index=False)

### Ratings

In [200]:
ratings3 = pd.read_csv("3/Ratings.csv")
ratings3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [201]:
ratings3.sample(10)

Unnamed: 0,User-ID,ISBN,Book-Rating
740180,178941,3681305929,0
1046842,250405,0140296549,0
996274,238961,0451162072,0
1070926,255943,0752208497,0
817385,198711,0373095600,0
493417,118228,0486295060,0
990006,238120,0451185110,0
579836,139630,014014773X,5
400885,96448,0590863703,6
729301,175984,3446151982,6


In [204]:
ratings3.sort_values("User-ID", inplace=True)
rename_map = {
    "User-ID": "user_id",
    "ISBN": "isbn",
    "Book-Rating": "provided_rating",
}

ratings3 = ratings3.rename(columns=rename_map)
correct_order = list(rename_map.values())
ratings3 = ratings3[correct_order]

In [205]:
ratings3.sample(5)

Unnamed: 0,user_id,isbn,provided_rating
460760,110608,449225046,0
748821,181165,375703632,7
1053982,251613,877849390,0
616323,148966,373123973,6
470532,112559,373242867,0


In [206]:
ratings3.to_csv("3_clean/Ratings.csv", index=False)