In [1]:
import pandas as pd
from ftfy import fix_text
from langdetect import detect

In [27]:
data = pd.read_csv('books_dataset.csv')
df = pd.DataFrame(data)
print(df)

                                                  title             author  \
0                       The Girl with the Dragon Tattoo      Stieg Larsson   
1                              And Then There Were None    Agatha Christie   
2                                       Angels & Demons       Dan    Brown   
3                                               Rebecca  Daphne du Maurier   
4                                         In Cold Blood      Truman Capote   
...                                                 ...                ...   
3205                                        Moon Palace        Paul Auster   
3206                               The Seat of the Soul         Gary Zukav   
3207  The History of Sexuality, Volume 1: An Introdu...    Michel Foucault   
3208                               A Fire Upon the Deep       Vernor Vinge   
3209                                      The Blind Owl     Sadegh Hedayat   

      rating                                            summary

In [28]:
df.shape

(3210, 7)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3210 entries, 0 to 3209
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      3190 non-null   object 
 1   author     3190 non-null   object 
 2   rating     3190 non-null   float64
 3   summary    3190 non-null   object 
 4   genres     3160 non-null   object 
 5   cover_url  3190 non-null   object 
 6   url        3210 non-null   object 
dtypes: float64(1), object(6)
memory usage: 175.7+ KB


In [30]:
df.isna().sum()

title        20
author       20
rating       20
summary      20
genres       50
cover_url    20
url           0
dtype: int64

In [31]:
def clean_name(name):
    if isinstance(name, str):  # Check if the entry is a string
        try:
            # Fix encoding issues
            name = fix_text(name)
            # Ensure it's English
            if detect(name) == 'en':
                return name
        except:
            return None  # Return None for entries that fail detection
    return None  # Return None for non-string entries

In [32]:
df['title'] = df['title'].apply(clean_name)

# Remove rows with None (non-English or invalid names)
df = df.dropna(subset=['title']).reset_index(drop=True)

In [33]:
def clean_book_data(df):
    # Make a copy of the dataframe to avoid modifying the original
    df = df.copy()
    
    # Drop any rows with null values
    df = df.dropna()
    
    # Remove duplicate rows based on all columns
    df = df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)

    text_columns = ['title', 'author', 'summary']
    for col in text_columns:
        df[col] = df[col].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii') if isinstance(x, str) else x)
        df[col] = df[col].str.strip()
    
    # Convert rating to float
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    
    # Remove rows where rating is not between 0 and 5
    df = df[df['rating'].between(0, 5)]
    
    # Clean up URLs by removing any invalid ones
    def is_valid_url(url):
        return isinstance(url, str) and url.startswith('http')

    df = df[df['url'].apply(is_valid_url) & df['cover_url'].apply(is_valid_url)]
    
    # Reset index after all the cleaning
    df = df.reset_index(drop=True)
    
    return df

In [34]:
cleaned_df = clean_book_data(df)
print(cleaned_df.head())

                             title           author  rating  \
0  The Girl with the Dragon Tattoo    Stieg Larsson    4.17   
1         And Then There Were None  Agatha Christie    4.28   
2                    In Cold Blood    Truman Capote    4.09   
3                    The Godfather       Mario Puzo    4.39   
4                 The Lovely Bones     Alice Sebold    3.86   

                                             summary  \
0  Harriet Vanger, a scion of one of Swedens weal...   
1  First, there were tena curious assortment of s...   
2  On November 15, 1959, in the small town of Hol...   
3  The Godfatherthe epic tale of crime and betray...   
4  "My name was Salmon, like the fish; first name...   

                                              genres  \
0  ['Fiction', 'Mystery', 'Thriller', 'Crime', 'M...   
1  ['Mystery', 'Fiction', 'Thriller', 'Crime', 'M...   
2  ['Nonfiction', 'Classics', 'True Crime', 'Crim...   
3  ['Fiction', 'Classics', 'Crime', 'Thriller', '...   
4  [

In [35]:
duplicates = cleaned_df[cleaned_df.duplicated(subset=['title'], keep=False)]  # Use keep=False to see all duplicates

# Display duplicate rows
if not duplicates.empty:
    print("Duplicate rows found:")
    print(duplicates)
else:
    print("No duplicate rows found.")

No duplicate rows found.


In [36]:
cleaned_df.isna().sum()

title        0
author       0
rating       0
summary      0
genres       0
cover_url    0
url          0
dtype: int64

In [37]:
cleaned_df.shape

(1968, 7)

In [38]:
cleaned_df.to_csv('cleaned_books_data.csv', index=False)