# 📚 Ikona.mk - Book Dataset Cleaning & Preprocessing

In [191]:
import pandas as pd
import numpy as np

In [192]:
# Load the scraped book listing; "utf-8-sig" drops a leading BOM if the file has one.
df = pd.read_csv("ikona_books.csv", encoding="utf-8-sig")

In [193]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Title,Author,Real Price,Sale Price,Sale,Category,Retrieved At
0,АУТИСТОТ И ГУЛАБОТ ПИСМОНОСЕЦ,Родан Ал Галиди,449,399 ден,1,Романса,2025-05-17
1,БЕЗНАДЕЖНО,Колин Хувер,450 ден,,0,Романса,2025-05-17
2,ВИРТУОЗ,Маргрит де Мор,449 ден,,0,Романса,2025-05-17
3,ГОСПОЃА АТАТУРК Првата дама на модерна Турција,Ипек Чалишлар,799 ден,,0,Романса,2025-05-17
4,ДАВЕНИК,Маргрит де Мор,559 ден,,0,Романса,2025-05-17


### 1. Convert the 'Sale' column to Boolean and rename it to 'IsOnSale'

In [194]:
# Cast the 'Sale' flag column to boolean.
df["Sale"] = df["Sale"].astype("bool")

In [195]:
# Give the flag column a name that reads as a yes/no question.
df = df.rename({"Sale": "IsOnSale"}, axis="columns")

### 2. Convert the 'Retrieved At' column to datetime format.

In [196]:
# Parse the 'Retrieved At' values into pandas datetimes.
df["Retrieved At"] = pd.to_datetime(df["Retrieved At"])

### 3. Prices
- Before converting to numeric, remove "ден", remove commas, and strip whitespace

In [197]:
def clean_price(prices: pd.Series) -> pd.Series:
    """Convert raw price text such as "450 ден" into numbers.

    Removes the "ден" currency suffix, commas and surrounding whitespace,
    then parses what is left. Values that cannot be parsed (including
    missing ones) come back as NaN.

    Parameters
    ----------
    prices : pd.Series
        Raw price column; may hold strings, missing values, or numbers.

    Returns
    -------
    pd.Series
        Numeric prices aligned with the input index.
    """
    cleaned = (
        # Cast to text first: .str raises on a numeric column, which would
        # make this cell fail if it is run a second time in the same kernel.
        prices.astype(str)
        .str.replace("ден", "", regex=False)
        .str.replace(",", "", regex=False)  # literal comma, never a regex pattern
        .str.strip()
    )
    return pd.to_numeric(cleaned, errors="coerce")


# Same cleaning for both price columns, so they cannot drift apart.
for price_column in ("Real Price", "Sale Price"):
    df[price_column] = clean_price(df[price_column])

### 4. Check for duplicate rows

In [198]:
# Collect every row that has an exact copy elsewhere (all copies kept),
# ordered so that matching rows sit next to each other.
duplicates = df.loc[df.duplicated(keep=False)].sort_values(by=["Title", "Author"])
print("Length of duplicates: ", len(duplicates))

Length of duplicates:  0


### 5. Replace "-" with spaces only in the 'Author' column, e.g. "Maria Herbert-Liew" → "Maria Herbert Liew"

In [199]:
# Literal (non-regex) replacement of every hyphen in 'Author' with a space.
df["Author"] = df["Author"].str.replace("-", " ", regex=False)

### 6. Create the 'Discount (%)' column
- If IsOnSale is True, calculate the discount percentage; otherwise the discount stays 0

In [200]:
# Every book starts at 0% discount; only valid on-sale rows are overwritten below.
df["Discount (%)"] = 0.0

# A discount can only be computed when the book is on sale, its sale price
# was parsed successfully, and the real price is positive (avoids division
# by zero). Rows with an unparseable sale price keep the 0.0 default instead
# of crashing the cell.
mask = df["IsOnSale"] & df["Sale Price"].notna() & (df["Real Price"] > 0)

real_price = df.loc[mask, "Real Price"]
sale_price = df.loc[mask, "Sale Price"]

# Rounded to whole percent. No integer cast: the column is float anyway,
# and casting to int raises as soon as a NaN is present.
df.loc[mask, "Discount (%)"] = ((real_price - sale_price) / real_price * 100).round()


In [201]:
# Preview the first five rows with the new 'Discount (%)' column.
df.head(n=5)

Unnamed: 0,Title,Author,Real Price,Sale Price,IsOnSale,Category,Retrieved At,Discount (%)
0,АУТИСТОТ И ГУЛАБОТ ПИСМОНОСЕЦ,Родан Ал Галиди,449.0,399.0,True,Романса,2025-05-17,11.0
1,БЕЗНАДЕЖНО,Колин Хувер,450.0,,False,Романса,2025-05-17,0.0
2,ВИРТУОЗ,Маргрит де Мор,449.0,,False,Романса,2025-05-17,0.0
3,ГОСПОЃА АТАТУРК Првата дама на модерна Турција,Ипек Чалишлар,799.0,,False,Романса,2025-05-17,0.0
4,ДАВЕНИК,Маргрит де Мор,559.0,,False,Романса,2025-05-17,0.0


In [202]:
# Missing sale prices are stored as 0.
df["Sale Price"] = df["Sale Price"].fillna(0)

### 7. Missing values

In [203]:
# Share of missing values per column, as a two-column table
# ("column", "percent missing") rounded to two decimals.
missing_percent = (
    (df.isna().sum() / len(df) * 100)
    .round(2)
    .reset_index()
    .rename(columns={"index": "column", 0: "percent missing"})
)
missing_percent

Unnamed: 0,column,percent missing
0,Title,0.0
1,Author,13.82
2,Real Price,0.0
3,Sale Price,0.0
4,IsOnSale,0.0
5,Category,0.0
6,Retrieved At,0.0
7,Discount (%),0.0


In [204]:
# Break down how many rows have an author and how many do not.
author_is_missing = df["Author"].isna()

total_rows = len(df)
missing_authors = author_is_missing.sum()
non_missing_authors = total_rows - missing_authors

print("\n'Author' column:")
print(f"Non-missing values: {non_missing_authors}")
print(f"Missing values: {missing_authors}")
print(f"Percentage missing: {round((missing_authors / total_rows) * 100, 2)}%")



'Author' column:
Non-missing values: 948
Missing values: 152
Percentage missing: 13.82%


In [205]:
# Save the cleaned data to its own file. Writing over "ikona_books.csv"
# would destroy the raw scrape and break a fresh re-run of this notebook:
# the load cell would read a file that no longer has the "Sale" column
# and whose price columns are already numeric.
# NOTE(review): anything that reads the cleaned data must point at this file.
df.to_csv("ikona_books_clean.csv", encoding="utf-8-sig", index=False)

### Preprocessing Output Summary

Number of Books Scraped

In [206]:
# Number of rows (books) in the cleaned dataset.
len(df)

1100

Number of categories

In [207]:
# Count distinct categories, ignoring missing values.
total_categories = df["Category"].nunique(dropna=True)
print(total_categories)

13


Number of Discounted Books

In [208]:
# Count distinct titles among the rows flagged as on sale.
on_sale_titles = df.loc[df["IsOnSale"].eq(True), "Title"]
num_distinct_on_sale = on_sale_titles.nunique()
print(num_distinct_on_sale)

32
