In [1]:
import os
import pandas as pd

# Relative directory holding the input `books.csv`; the cleaned CSV produced
# at the end of the notebook is written into this same directory.
DATASET_DIR = 'dataset'

### 1. Loading Dataset

In [2]:
# Load the raw books table. A missing file surfaces as FileNotFoundError
# (FileExistsError is only raised when *creating* a path that already exists),
# so that is the exception to intercept for the friendly hint below.
try:
    df = pd.read_csv(os.path.join(DATASET_DIR, 'books.csv'))
    print("Dataset loaded Successfully.")
except FileNotFoundError:
    print("Dataset not found. Please ensure the dataset is in the correct directory.")
    # Re-raise so a Restart & Run All stops here instead of failing later on an undefined `df`.
    raise

print(f"\nTotal Number of books (rows): {len(df)}")
print("-"*40)

Dataset loaded Successfully.

Total Number of books (rows): 6810
----------------------------------------


### 2. Selecting Data

In [3]:

# Columns carried forward into the text-combination step.
features = [
    'title',
    'authors',
    'categories',
    'description',
]

# Work on an independent copy so later column edits never touch `df`.
df_features = df.loc[:, features].copy()

print("\nSelected Features: ")
print(df_features.head(3))
print("-" * 170)


Selected Features: 
          title                          authors  \
0        Gilead               Marilynne Robinson   
1  Spider's Web  Charles Osborne;Agatha Christie   
2  The One Tree             Stephen R. Donaldson   

                      categories  \
0                        Fiction   
1  Detective and mystery stories   
2               American fiction   

                                         description  
0  A NOVEL THAT READERS and critics have been eag...  
1  A new 'Christie for Christmas' -- a full-lengt...  
2  Volume Two of Stephen Donaldson's acclaimed se...  
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [4]:
# Report how many values are missing per selected column before cleaning.
print("\nMissing Data in Each Feature: ")
print(df_features.isna().sum())
print("-" * 50)

# Replace missing values with a single-space placeholder in every selected
# column at once (the placeholder is ' ', not an empty string).
df_features[features] = df_features[features].fillna(' ')

# Re-run the same count to confirm nothing is left missing.
print("\n Handled Data:")
print(df_features.isna().sum())
print("-" * 50)


Missing Data in Each Feature: 
title            0
authors         72
categories      99
description    262
dtype: int64
--------------------------------------------------

 Handled Data:
title          0
authors        0
categories     0
description    0
dtype: int64
--------------------------------------------------


In [5]:
def combine_text_weight(row):
    """
    Build one weighted text string from a book row.

    Weighting is done by repetition: `categories` appears 4 times, `authors`
    and `title` twice each, and `description` once, so the repeated fields
    carry more influence in a later TF-IDF representation.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'title', 'authors', 'categories' and
        'description'. Values are converted with ``str()``.

    Returns
    -------
    str
        Title, authors, categories and description (in that order),
        lower-cased and stripped of leading/trailing whitespace.
    """
    # Repetition count per weighted field; each repetition is followed by a space.
    repeat_counts = {'categories': 4, 'authors': 2, 'title': 2}

    pieces = {
        column: (str(row[column]) + ' ') * times
        for column, times in repeat_counts.items()
    }
    # The description is included exactly once, with no trailing separator.
    pieces['description'] = str(row['description'])

    output_order = ('title', 'authors', 'categories', 'description')
    combined = ''.join(pieces[column] for column in output_order)
    return combined.lower().strip()
# Build the combined feature column by applying the weighting function to
# every row (axis=1 passes each row to the function).
df_features['combined_features'] = df_features.apply(combine_text_weight, axis=1)

# Announce success only after the column exists, so the message is never
# shown when the apply step fails.
print("\nCombined Text Features Successfully.")

# Preview the first book's combined text, truncated to 500 characters.
print("\nExample of the combined text features (only first book): ")
print(df_features['combined_features'].iloc[0][:500] + "...")
print("-"*170)



Combined Text Features Successfully.

Example of the combined text features (only first book): 
gilead gilead marilynne robinson marilynne robinson fiction fiction fiction fiction a novel that readers and critics have been eagerly anticipating for over a decade, gilead is an astonishingly imagined story of remarkable lives. john ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. it’s 1956 in gilead, iowa, towards the end of the reverend ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will neve...
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


### 5. Saving the Cleaned Data

In [6]:
# Destination of the cleaned export, alongside the input data.
output_filename = os.path.join(DATASET_DIR, "books_cleaned.csv")

if 'isbn13' not in df.columns:
    # Without the identifier column nothing is written; only a message is shown.
    print("Error: 'isbn13' column not found in the original dataset")
else:
    # Attach the identifier from the raw frame (aligned on the shared index).
    df_features['isbn13'] = df['isbn13']
    # Export only the identifier, the title and the combined text, in that order.
    df_features.to_csv(
        output_filename,
        columns=['isbn13', 'title', 'combined_features'],
        index=False,
    )
    print(f"\nCleaned data saved to {output_filename}")


Cleaned data saved to dataset\books_cleaned.csv
