# 1. Imports

In [25]:
import pandas as pd

# 2. Load dataset

In [26]:
df = pd.read_csv('data/books.csv',encoding="utf-8-sig")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92000 entries, 0 to 91999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         91999 non-null  object 
 1   author        92000 non-null  object 
 2   desc          85772 non-null  object 
 3   genre         82359 non-null  object 
 4   img           89202 non-null  object 
 5   rating        92000 non-null  float64
 6   totalratings  92000 non-null  int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 4.9+ MB


# 3. Clean text columns from weird characters

In [27]:
text_cols = ['title','author', 'desc', 'genre']

for col in text_cols:
    # Remove non-ASCII characters and strip leading/trailing spaces
    df[col] = df[col].str.replace(r'[^\x00-\x7F]+', '', regex=True).str.strip()

# 4. Clean numeric columns

In [28]:
numeric_cols = [ 'rating', 'totalratings']

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Handle missing values

In [29]:
# fill numeric values with the median of each column
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# fill categorical columns with empty string
for col in text_cols:
    df[col] = df[col].fillna('')

# 6. remove duplicate genres

In [30]:
df['genre'] = df['genre'].apply(lambda x: ','.join(dict.fromkeys([g.strip() for g in x.split(',')])) if pd.notna(x) else '')

# 7. Check data after cleaning

In [31]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92000 entries, 0 to 91999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         92000 non-null  object 
 1   author        92000 non-null  object 
 2   desc          92000 non-null  object 
 3   genre         92000 non-null  object 
 4   img           89202 non-null  object 
 5   rating        92000 non-null  float64
 6   totalratings  92000 non-null  int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 4.9+ MB
None


Unnamed: 0,title,author,desc,genre,img,rating,totalratings
0,Under the Moons of Mars: New Adventures on Bar...,"John Joseph Adams,Tamora Pierce,Joe R. Lansdal...",Celebrate 100 years of John Carter of Mars wit...,"Science Fiction,Short Stories,Anthologies,Fant...",https://i.gr-assets.com/images/S/compressed.ph...,3.69,213
1,Tank Girl Collection,"Alan C. Martin,Jamie Hewlett","In Visions of Booga, we join Tank Girl and her...","Sequential Art,Comics,Graphic Novels,Graphic N...",https://i.gr-assets.com/images/S/compressed.ph...,4.18,856
2,La dou zile distan,Marin Mlaicu-Hondrari,"Vocea care rostete aceste poeme, unele ca nite...","European Literature,Romanian Literature,Poetry",https://i.gr-assets.com/images/S/compressed.ph...,4.32,25
3,Introduction to Algorithms,"Thomas H. Cormen,Charles E. Leiserson,Ronald L...",This title covers a broad range of algorithms ...,"Science,Computer Science,Programming,Algorithm...",https://i.gr-assets.com/images/S/compressed.ph...,4.34,8077
4,Hopeless Magic,Rachel Higginson,",YOU'VE BEEN LOOKING FOR SOMETHING MAGICAL... ...","Fantasy,Paranormal,Young Adult,Magic,Romance,P...",https://i.gr-assets.com/images/S/compressed.ph...,4.27,7566


# 8. Save clean data into csv file

In [32]:
df.to_csv('data/cleaned_books.csv', index=False, encoding='utf-8')