# 📚 Ikona.mk - Book Dataset Cleaning & Preprocessing

In [41]:
import pandas as pd
import numpy as np

In [42]:
# Load the raw scrape; the 'utf-8-sig' codec skips a leading byte-order mark if present.
df = pd.read_csv(
    '../data/original_datasets/ikona_books.csv',
    encoding='utf-8-sig',
)

In [43]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,title,author,real_price,sale_price,sale,category,retrieved_at
0,1,АУТИСТОТ И ГУЛАБОТ ПИСМОНОСЕЦ,Родан Ал Галиди,449,399 ден,1,Романса,2025-05-17
1,2,БЕЗНАДЕЖНО,Колин Хувер,450 ден,,0,Романса,2025-05-17
2,3,ВИРТУОЗ,Маргрит де Мор,449 ден,,0,Романса,2025-05-17
3,4,ГОСПОЃА АТАТУРК Првата дама на модерна Турција,Ипек Чалишлар,799 ден,,0,Романса,2025-05-17
4,5,ДАВЕНИК,Маргрит де Мор,559 ден,,0,Романса,2025-05-17


In [44]:
# Discard the 'id' column; none of the visible downstream cells reference it.
df = df.drop(columns=['id'])

### 1. Convert the 'sale' column to boolean and rename it

In [45]:
# NaN is truthy, so a bare astype(bool) would flag rows with a missing value as
# being on sale. Fill missing flags with 0 first so they become False.
# NOTE(review): assumes a missing flag means "not on sale" — confirm against the scraper.
df["sale"] = df["sale"].fillna(0).astype(bool)

In [46]:
# Give the boolean flag a self-describing name.
df = df.rename({"sale": "IsOnSale"}, axis="columns")

In [47]:
# Check the result of the boolean conversion and rename on the first five rows.
df.head(n=5)

Unnamed: 0,title,author,real_price,sale_price,IsOnSale,category,retrieved_at
0,АУТИСТОТ И ГУЛАБОТ ПИСМОНОСЕЦ,Родан Ал Галиди,449,399 ден,True,Романса,2025-05-17
1,БЕЗНАДЕЖНО,Колин Хувер,450 ден,,False,Романса,2025-05-17
2,ВИРТУОЗ,Маргрит де Мор,449 ден,,False,Романса,2025-05-17
3,ГОСПОЃА АТАТУРК Првата дама на модерна Турција,Ипек Чалишлар,799 ден,,False,Романса,2025-05-17
4,ДАВЕНИК,Маргрит де Мор,559 ден,,False,Романса,2025-05-17


### 2. Convert the 'Retrieved At' column to datetime format.

In [48]:
# Parse the scrape date strings into datetime values, replacing the column in place.
df = df.assign(retrieved_at=pd.to_datetime(df['retrieved_at']))

### 3. Prices
- Before converting to numeric, remove "ден", remove commas, and strip whitespace.

In [49]:
def _parse_price(raw):
    """Turn a column of scraped price values into numbers.

    The column is cast to text first so the ``.str`` accessor also works when
    pandas inferred a numeric dtype (for example an all-NaN ``sale_price``);
    without the cast such a column raises ``AttributeError``.

    Parameters
    ----------
    raw : pandas.Series
        Price values such as ``"399 ден"``, ``"449"`` or NaN.

    Returns
    -------
    pandas.Series
        Numeric prices; anything that cannot be parsed (including missing
        values) becomes NaN.
    """
    cleaned = (
        raw.astype(str)                       # NaN becomes "nan", coerced back to NaN below
        .str.replace("ден", "", regex=False)  # currency suffix seen in the raw values
        .str.replace(",", "", regex=False)    # commas, if any
        .str.strip()
    )
    return pd.to_numeric(cleaned, errors="coerce")


# Apply the same cleaning to both price columns instead of duplicating the chain.
for price_col in ("real_price", "sale_price"):
    df[price_col] = _parse_price(df[price_col])

### 4. Drop rows without a valid price and remove duplicates

In [50]:
# Keep only rows whose list price is present after numeric conversion.
df = df.dropna(axis=0, how='any', subset=['real_price'])

In [51]:
# Gather every row that has an exact twin (keep=False flags all copies, not
# just the repeats) so they can be inspected side by side.
duplicates = df[df.duplicated(keep=False)].sort_values(by=['title', 'author'])
print("Length of duplicates: ", len(duplicates))

# The section is meant to remove duplicates, not only report them: keep the
# first occurrence of each fully identical row and discard the rest.
df = df.drop_duplicates(keep='first')

Length of duplicates:  0


### 5. Replace "-" with spaces only in the 'Author' column, e.g. "Maria Herbert-Liew" → "Maria Herbert Liew"

In [52]:
# Literal (non-regex) swap of every hyphen in the author names for a space;
# missing authors stay missing.
df = df.assign(author=df['author'].str.replace('-', ' ', regex=False))

### 6. Create the discount percentage column (%)
- If `IsOnSale` is True, calculate the discount percentage; otherwise the discount stays 0.

In [53]:
# Every book starts with no discount; only valid sale rows are overwritten below.
df['Discount (%)'] = 0.0

# A discount can only be computed when the row is on sale, a sale price is
# actually present, and the list price is positive (avoids division by zero).
# Without the notna() guard a missing sale price yields NaN, and the former
# .astype(int) cast raised on it.
mask = (
    df['IsOnSale'].eq(True)
    & df['sale_price'].notna()
    & (df['real_price'] > 0)
)

# Whole-number percentage saved relative to the list price. The column is
# float, so the rounded values are stored as e.g. 11.0 — no int cast needed.
df.loc[mask, 'Discount (%)'] = (
    (df.loc[mask, 'real_price'] - df.loc[mask, 'sale_price'])
    / df.loc[mask, 'real_price']
    * 100
).round()


In [54]:
# Inspect the new discount column on the first five rows.
df.head(n=5)

Unnamed: 0,title,author,real_price,sale_price,IsOnSale,category,retrieved_at,Discount (%)
0,АУТИСТОТ И ГУЛАБОТ ПИСМОНОСЕЦ,Родан Ал Галиди,449.0,399.0,True,Романса,2025-05-17,11.0
1,БЕЗНАДЕЖНО,Колин Хувер,450.0,,False,Романса,2025-05-17,0.0
2,ВИРТУОЗ,Маргрит де Мор,449.0,,False,Романса,2025-05-17,0.0
3,ГОСПОЃА АТАТУРК Првата дама на модерна Турција,Ипек Чалишлар,799.0,,False,Романса,2025-05-17,0.0
4,ДАВЕНИК,Маргрит де Мор,559.0,,False,Романса,2025-05-17,0.0


In [55]:
# Books without a sale price get 0 as a placeholder so the column has no NaN left.
df['sale_price'] = df['sale_price'].fillna(0)

### 7. Missing values

In [56]:
# Share of missing values per column, as a two-column table in percent
# rounded to two decimals.
missing_percent = (
    df.isnull()
    .sum()
    .div(len(df))
    .mul(100)
    .round(2)
    .reset_index()
    .rename(columns={"index": "column", 0: "percent missing"})
)
missing_percent

Unnamed: 0,column,percent missing
0,title,0.0
1,author,13.82
2,real_price,0.0
3,sale_price,0.0
4,IsOnSale,0.0
5,category,0.0
6,retrieved_at,0.0
7,Discount (%),0.0


In [57]:
# Break down how many books have / lack an author after cleaning.
missing_authors = df['author'].isna().sum()
non_missing_authors = df['author'].notna().sum()
total_rows = len(df)

# One print call with a newline separator emits the same four lines as before.
print(
    "\n'Author' column:",
    f"Non-missing values: {non_missing_authors}",
    f"Missing values: {missing_authors}",
    f"Percentage missing: {round((missing_authors / total_rows) * 100, 2)}%",
    sep="\n",
)



'Author' column:
Non-missing values: 948
Missing values: 152
Percentage missing: 13.82%


In [58]:
# Persist the cleaned frame without the index column, using the same
# BOM-prefixed UTF-8 encoding the raw file was read with.
df.to_csv(
    "../data/preprocessed_datasets/ikona_books.csv",
    index=False,
    encoding='utf-8-sig',
)

### Preprocessing Output Summary

Number of Books Scraped

In [59]:
# Number of rows (books) left in the cleaned frame.
len(df)

1100

Number of categories

In [60]:
# Count the distinct, non-missing category labels.
total_categories = len(df['category'].dropna().unique())
print(total_categories)

13


Number of Discounted Books

In [61]:
# Distinct titles among the rows flagged as on sale (repeated titles count once).
on_sale_rows = df['IsOnSale'] == True
num_distinct_on_sale = df.loc[on_sale_rows, 'title'].nunique()
print(num_distinct_on_sale)

32


In [62]:
# Final look at the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,title,author,real_price,sale_price,IsOnSale,category,retrieved_at,Discount (%)
0,АУТИСТОТ И ГУЛАБОТ ПИСМОНОСЕЦ,Родан Ал Галиди,449.0,399.0,True,Романса,2025-05-17,11.0
1,БЕЗНАДЕЖНО,Колин Хувер,450.0,0.0,False,Романса,2025-05-17,0.0
2,ВИРТУОЗ,Маргрит де Мор,449.0,0.0,False,Романса,2025-05-17,0.0
3,ГОСПОЃА АТАТУРК Првата дама на модерна Турција,Ипек Чалишлар,799.0,0.0,False,Романса,2025-05-17,0.0
4,ДАВЕНИК,Маргрит де Мор,559.0,0.0,False,Романса,2025-05-17,0.0
