# 📚 Literatura.mk - Book Dataset Cleaning & Preprocessing

In [43]:
import pandas as pd
import numpy as np

In [44]:
# Load the book dataset; the 'utf-8-sig' codec tolerates a leading BOM.
df = pd.read_csv(
    'literatura_books.csv',
    encoding='utf-8-sig',
)

In [45]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,title,author,category,price,retrieved_at
0,1,Разговори со ќерка ми за економијата или како ...,Јанис Варуфакис,ПОЛИТИКА,400.0,2025-04-07
1,2,Semper Idem,Ѓорѓе Лебовиќ,ИСТОРИСКА ФИКЦИЈА,500.0,2025-04-07
2,3,На дното: раскази,Максим Горки,РАСКАЗИ,400.0,2025-04-07
3,4,Панонскиот адмирал: емоционален водич низ диск...,Иван Ивачковиќ,БИОГРАФИИ И МЕМОАРИ,490.0,2025-04-07
4,5,Изгубени германци,Дејан Дуковски,ДРАМИ,299.0,2025-04-07


### 1. Drop the redundant `id` column

In [46]:
# Drop the 'id' column. errors='ignore' keeps this cell re-runnable: the final
# cell overwrites literatura_books.csv without 'id', so on a later
# "Restart & Run All" the column is already gone and a strict drop would
# raise KeyError.
# No intermediate write/re-read of the CSV is done here: that round trip did
# not change df, and the last cell of the notebook persists the cleaned data.
df = df.drop(columns=['id'], errors='ignore')

### 2. Convert the `retrieved_at` column to datetime format

In [47]:
# Parse the date strings into pandas datetime values, then store them back.
retrieved_at_parsed = pd.to_datetime(df['retrieved_at'])
df['retrieved_at'] = retrieved_at_parsed

In [48]:
# Preview the first five rows after dropping 'id' and parsing the dates.
df.head(n=5)

Unnamed: 0,title,author,category,price,retrieved_at
0,Разговори со ќерка ми за економијата или како ...,Јанис Варуфакис,ПОЛИТИКА,400.0,2025-04-07
1,Semper Idem,Ѓорѓе Лебовиќ,ИСТОРИСКА ФИКЦИЈА,500.0,2025-04-07
2,На дното: раскази,Максим Горки,РАСКАЗИ,400.0,2025-04-07
3,Панонскиот адмирал: емоционален водич низ диск...,Иван Ивачковиќ,БИОГРАФИИ И МЕМОАРИ,490.0,2025-04-07
4,Изгубени германци,Дејан Дуковски,ДРАМИ,299.0,2025-04-07


### 3. Check for Duplicates
- There are no duplicate entries

In [56]:
# keep=False flags every member of a duplicated group, not only the repeats.
is_duplicated_row = df.duplicated(keep=False)
# Sort so that identical rows would appear next to each other.
duplicates = df.loc[is_duplicated_row].sort_values(by=['title', 'author'])
print("Length of duplicates: ", len(duplicates))

Length of duplicates:  0


### 4. Define a function to normalize author names  
- Inconsistent Formats:
    - Some names are written in the `"LastName, FirstName"` format.
    - Others are written in `"FirstName LastName"` or even as a comma-separated list of multiple authors.

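- Goal: normalize names into a consistent `"FirstName LastName"` format. Note that the comma-separated parts are joined in reverse order, so a multi-author list `"Author1, Author2, Author3"` becomes `"Author3 Author2 Author1"`.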

In [50]:
def normalize_author(author):
    """Normalize an author value to a comma-free, space-separated string.

    A string containing commas is split on them, each piece is trimmed, empty
    pieces (from leading, trailing, or doubled commas) are discarded, and the
    remaining pieces are joined with single spaces in reverse order, so
    ``"LastName, FirstName"`` becomes ``"FirstName LastName"``. A string
    without commas is returned with surrounding whitespace removed.

    Parameters
    ----------
    author : object
        Raw value taken from the ``author`` column.

    Returns
    -------
    object
        The normalized string, or ``author`` itself when it is not a string
        (NaN, None, numbers), so missing values pass through unchanged.

    Notes
    -----
    The reversal applies to every comma-separated value, so a list of several
    authors also comes out in reverse order.
    """
    # Non-string values (including NaN/None) cannot be split; return them as is.
    if not isinstance(author, str):
        return author
    if ',' not in author:
        return author.strip()
    # Skip blank pieces so stray commas do not leave leading or doubled spaces.
    parts = [piece.strip() for piece in author.split(',') if piece.strip()]
    return ' '.join(reversed(parts))

In [51]:
# Run every author value through normalize_author, element by element.
df['author'] = df['author'].map(normalize_author)

### 5. Replace authors consisting only of '?' or '=' characters with null

In [52]:
# Flag authors made up solely of '?' and/or '=' characters; na=False makes
# missing authors count as "no match" instead of propagating NaN into the mask.
is_placeholder = df['author'].str.match(r'^[\?=]+$', na=False)
df.loc[is_placeholder, 'author'] = np.nan

In [53]:
# Select any rows whose author is still a '?'/'=' placeholder string.
still_placeholder = df['author'].str.match(r'^[\?=]+$', na=False)
filtered_df = df.loc[still_placeholder]

### 6. Replace authors that are exactly '/' with null

In [54]:
# Only an author value that is exactly '/' is treated as missing.
is_slash_only = df['author'].eq('/')
df.loc[is_slash_only, 'author'] = np.nan

In [None]:
# Persist the cleaned frame back to the same CSV, without the index column.
df.to_csv(
    "literatura_books.csv",
    index=False,
    encoding='utf-8-sig',
)