# 📚 AkademskaKniga.mk - Book Dataset Cleaning & Preprocessing

In [82]:
import pandas as pd
import numpy as np

In [83]:
# Load the scraped book listing; the 'utf-8-sig' codec drops a leading BOM if one is present.
df = pd.read_csv('akademska_books.csv', encoding='utf-8-sig')

In [84]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,Title,Author,Real Price,Sale Price,Sale,Category,Retrieved At
0,Accounting,"Warren, Carl S.",6199.0,4959 Мкд.,1,Accounting & Finance,2025-05-15
1,Pricing in General I,"Parodi, Pietro",5399.0,4319 Мкд.,1,Accounting & Finance,2025-05-15
2,Behavioral Finance i,"Kubinska, Elzbieta",9399.0,7519 Мкд.,1,Accounting & Finance,2025-05-15
3,Handbook of Alternat,-,10099.0,8079 Мкд.,1,Accounting & Finance,2025-05-15
4,Using QuickBooks (R),"Barbara), Glenn Owen",4999.0,3999 Мкд.,1,Accounting & Finance,2025-05-15


### 1. Converting the 'Sale' Column to Boolean & rename it

In [85]:
# Cast the 'Sale' flag column to a boolean dtype.
df = df.astype({"Sale": bool})

In [86]:
# Give the boolean flag a self-describing name.
sale_flag_rename = {"Sale": "IsOnSale"}
df = df.rename(columns=sale_flag_rename)

### 2. Convert the 'Retrieved At' column to datetime format.

In [87]:
# Parse the retrieval date strings into pandas datetimes.
retrieved_at = pd.to_datetime(df['Retrieved At'])
df['Retrieved At'] = retrieved_at

### 3. Prices
- Before converting to numeric, remove the "Мкд." suffix, remove commas, and strip whitespace.


In [88]:
# Strip the currency suffix and comma separators, then parse both price columns as numbers.
# Casting to str first keeps this cell re-runnable: once 'Sale Price' is numeric,
# the .str accessor would otherwise raise. Missing values become the text "nan",
# which pd.to_numeric(errors="coerce") turns back into NaN.
sale_price_text = (
    df['Sale Price']
    .astype(str)
    .str.replace("Мкд.", "", regex=False)
    # regex=False is explicit so the result does not depend on the pandas version's default.
    .str.replace(",", "", regex=False)
    .str.strip()
)

# Anything that still fails to parse is coerced to NaN rather than raising.
df["Real Price"] = pd.to_numeric(df["Real Price"], errors="coerce")
df["Sale Price"] = pd.to_numeric(sale_price_text, errors="coerce").astype(float)

### 4. Remove duplicates

In [89]:
# Collect every row that has at least one exact copy (all copies kept), ordered for inspection.
dup_mask = df.duplicated(keep=False)
duplicates = df.loc[dup_mask].sort_values(by=['Title', 'Author'])
print("Length of duplicates: ", len(duplicates))

Length of duplicates:  5122


In [90]:
# Report the row count before and after removing exact duplicate rows.
row_count_before = len(df)
print("Original:", row_count_before)
df = df.drop_duplicates(keep='first')
print("After drop_duplicates:", len(df))

Original: 24536
After drop_duplicates: 21393


### 4. Define a function to normalize author names  
- Inconsistent Formats:
    - Some names are written in the `"LastName, FirstName"` format.
    - Others are written in `"FirstName LastName"` or even as a comma-separated list of multiple authors.

- Normalizing names into a consistent format like `"FirstName LastName"` or `"Author1 Author2 Author3"`.

In [91]:
def normalize_author(author):
    """Rewrite a comma-separated author string in reversed, space-joined order.

    "Warren, Carl S." becomes "Carl S. Warren". The comma-separated parts are
    stripped, reversed and joined with single spaces, so a comma-separated list
    of several names also comes out in reverse order.

    Parameters
    ----------
    author : object
        The raw author value. Strings containing a comma are rewritten;
        everything else (strings without a comma, NaN/None, other non-string
        values) is returned unchanged.

    Returns
    -------
    object
        The normalized string, or the input itself when nothing applies.
    """
    # Missing values and any other non-text input pass through untouched
    # instead of failing on the substring test below.
    if not isinstance(author, str):
        return author
    if ',' not in author:
        return author
    parts = [part.strip() for part in author.split(',')]
    # Skip empty fragments (from a trailing or doubled comma) so the result
    # carries no leading, trailing or repeated spaces.
    return ' '.join(part for part in reversed(parts) if part)

In [92]:
# Normalize every author entry element-wise.
df['Author'] = df['Author'].map(normalize_author)

In [93]:
# Preview the first five rows after author normalization.
df.head(5)

Unnamed: 0,Title,Author,Real Price,Sale Price,IsOnSale,Category,Retrieved At
0,Accounting,Carl S. Warren,6199.0,4959.0,True,Accounting & Finance,2025-05-15
1,Pricing in General I,Pietro Parodi,5399.0,4319.0,True,Accounting & Finance,2025-05-15
2,Behavioral Finance i,Elzbieta Kubinska,9399.0,7519.0,True,Accounting & Finance,2025-05-15
3,Handbook of Alternat,-,10099.0,8079.0,True,Accounting & Finance,2025-05-15
4,Using QuickBooks (R),Glenn Owen Barbara),4999.0,3999.0,True,Accounting & Finance,2025-05-15


### 5. Replace authors consisting only of '?' or '=' characters with null

In [94]:
# Authors made up solely of '?' and/or '=' characters are placeholders; mark them missing.
placeholder_mask = df['Author'].str.match(r'^[\?=]+$', na=False)
df.loc[placeholder_mask, 'Author'] = np.nan

### 6. Replace '-' in author with null

In [95]:
# A lone '-' stands for "no author"; mark those rows missing.
is_dash_only = df['Author'].eq('-')
df.loc[is_dash_only, 'Author'] = np.nan

### 7. Replace "-" with spaces only in the 'Author' column, e.g. "Maria Herbert-Liew" → "Maria Herbert Liew"

In [96]:
# Turn every literal hyphen inside an author name into a space.
dehyphenated = df['Author'].str.replace('-', ' ', regex=False)
df['Author'] = dehyphenated

### 8. Remove numbers from the 'Author' column, e.g. '1854 19 Oscar Wilde'

In [97]:
# Drop digit runs, trim the ends, and collapse repeated whitespace to one space.
author_no_digits = (
    df['Author']
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)
# An entry that held nothing but digits, hyphens (already turned into spaces) or
# whitespace is now an empty string; store it as missing so it is counted with
# the other placeholder authors instead of surviving as ''.
df['Author'] = author_no_digits.mask(author_no_digits.eq(''))

### 9. Remove all parentheses
- Remove matched parentheses together with their content, and strip any leftover unmatched parenthesis characters.
- This simplifies the data, since the content inside parentheses is usually additional info like (editor) or (illustrator), or an incomplete fragment with an unmatched parenthesis such as "(author" or "author)".

In [98]:
# Flag every author that contains an opening or closing parenthesis.
# `mask` is reused by the following cleaning cell.
has_paren = df['Author'].str.contains(r'[\(\)]', regex=True, na=False)
mask = has_paren

print("Before cleaning:")
df.loc[mask, 'Author'].to_frame()

Before cleaning:


Unnamed: 0,Author
4,Glenn Owen Barbara)
45,Financial (author)
88,Parveen Za (editor)
143,Melan (illustrator)
173,Arse (photographer)
...,...
24489,Syuhe (illustrator)
24522,GODSS (illustrator)
24526,Tracy Yard (artist)
24527,Tomo (illustrator)


In [99]:
# On the flagged rows: first delete balanced "(...)" groups with their content,
# then delete any leftover single parenthesis characters, then trim the ends.
author_with_parens = df.loc[mask, 'Author']
df.loc[mask, 'Author'] = (
    author_with_parens
    .str.replace(r'\([^)]*\)', '', regex=True)
    .str.replace(r'[\(\)]', '', regex=True)
    .str.strip()
)

print("\nAfter cleaning:")
df.loc[mask, 'Author'].to_frame()


After cleaning:


Unnamed: 0,Author
4,Glenn Owen Barbara
45,Financial
88,Parveen Za
143,Melan
173,Arse
...,...
24489,Syuhe
24522,GODSS
24526,Tracy Yard
24527,Tomo


### 10. Create Discount Percentage Column(%):
- If `IsOnSale` is True and `Real Price` is positive, compute the discount percentage (rounded to a whole number); otherwise the value stays 0.

In [104]:
# Every row starts at 0.0; only qualifying rows are overwritten below.
df['Discount (%)'] = 0.0

# A discount is computed only for rows that are on sale, have a positive list
# price (avoids division by zero) and have a parsed sale price. A NaN
# 'Real Price' already fails the `> 0` test; the explicit notna() check does
# the same for 'Sale Price', which would otherwise make astype(int) raise.
mask = (
    (df['IsOnSale'] == True)
    & (df['Real Price'] > 0)
    & df['Sale Price'].notna()
)
real_price = df.loc[mask, 'Real Price']
sale_price = df.loc[mask, 'Sale Price']

# Percentage off the list price, rounded to a whole number. The column itself
# stays float because it was initialised with 0.0.
df.loc[mask, 'Discount (%)'] = (
    ((real_price - sale_price) / real_price) * 100
).round().astype(int)


In [105]:
# Preview the first five rows including the new discount column.
df.head(5)

Unnamed: 0,Title,Author,Real Price,Sale Price,IsOnSale,Category,Retrieved At,Discount (%)
0,Accounting,Carl S. Warren,6199.0,4959.0,True,Accounting & Finance,2025-05-15,20.0
1,Pricing in General I,Pietro Parodi,5399.0,4319.0,True,Accounting & Finance,2025-05-15,20.0
2,Behavioral Finance i,Elzbieta Kubinska,9399.0,7519.0,True,Accounting & Finance,2025-05-15,20.0
3,Handbook of Alternat,,10099.0,8079.0,True,Accounting & Finance,2025-05-15,20.0
4,Using QuickBooks (R),Glenn Owen Barbara,4999.0,3999.0,True,Accounting & Finance,2025-05-15,20.0


### 11. Missing values

In [106]:
# Share of missing values per column, as a percentage rounded to two decimals,
# laid out as a two-column table.
null_counts = df.isna().sum()
missing_percent = (
    null_counts
    .div(len(df))
    .mul(100)
    .round(2)
    .reset_index()
    .rename(columns={"index": "column", 0: "percent missing"})
)
missing_percent

Unnamed: 0,column,percent missing
0,Title,0.0
1,Author,1.22
2,Real Price,0.0
3,Sale Price,0.0
4,IsOnSale,0.0
5,Category,0.0
6,Retrieved At,0.0
7,Discount (%),0.0


In [107]:
# Summarize how many rows have / lack an author.
author_is_missing = df['Author'].isna()
missing_authors = author_is_missing.sum()
total_rows = len(df)
non_missing_authors = total_rows - missing_authors

print(f"\n'Author' column:")
print(f"Non-missing values: {non_missing_authors}")
print(f"Missing values: {missing_authors}")
print(f"Percentage missing: {round((missing_authors / total_rows) * 100, 2)}%")



'Author' column:
Non-missing values: 21131
Missing values: 262
Percentage missing: 1.22%


In [None]:
# Persist the cleaned frame without the index column.
# NOTE(review): this is the same path the notebook loads from at the top, so the
# raw scrape is overwritten. A later Restart & Run All would then read the
# already-cleaned file (which has 'IsOnSale' instead of 'Sale'). Consider
# writing to a separate output file — confirm what downstream consumers expect.
output_path = "akademska_books.csv"
df.to_csv(output_path, index=False, encoding='utf-8-sig')

### Preprocessing Output Summary

Number of Books Scraped

In [108]:
# Number of rows left in the cleaned frame.
len(df)

21393

Number of categories

In [109]:
# Count the distinct non-null category labels.
distinct_categories = df['Category'].dropna().unique()
total_categories = len(distinct_categories)
print(total_categories)

37


Number of Discounted Books (distinct titles on sale)

In [110]:
# Count distinct titles among the rows flagged as on sale.
on_sale_titles = df.loc[df['IsOnSale'] == True, 'Title']
num_distinct_on_sale = on_sale_titles.nunique()
print(num_distinct_on_sale)

19228
