# Data Cleaning and Preprocessing Pipeline
This notebook implements data cleaning for the scraped books data.

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
df = pd.read_csv('../data_collection/books_data.csv')
print(f"Original dataset shape: {df.shape}")
df.head()

Original dataset shape: (1000, 4)


Unnamed: 0,title,price,rating,availability
0,A Light in the Attic,51.77,3,In stock
1,Tipping the Velvet,53.74,1,In stock
2,Soumission,50.1,1,In stock
3,Sharp Objects,47.82,4,In stock
4,Sapiens: A Brief History of Humankind,54.23,5,In stock


In [5]:
print(f"Number of duplicates: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)
print(f"Dataset shape after removing duplicates: {df.shape}")

Number of duplicates: 0
Dataset shape after removing duplicates: (1000, 4)


In [6]:
print("Data types before standardization:")
print(df.dtypes)
print("\nColumn names:")
print(df.columns.tolist())

Data types before standardization:
title            object
price           float64
rating            int64
availability     object
dtype: object

Column names:
['title', 'price', 'rating', 'availability']


In [7]:
for col in df.columns:
    if 'price' in col.lower():
        df[col] = pd.to_numeric(df[col], errors='coerce')
    elif 'rating' in col.lower():
        df[col] = pd.to_numeric(df[col], errors='coerce')
    elif 'date' in col.lower() or 'published' in col.lower():
        df[col] = pd.to_datetime(df[col], errors='coerce')

print("Data types after standardization:")
print(df.dtypes)

Data types after standardization:
title            object
price           float64
rating            int64
availability     object
dtype: object


In [8]:
print("Missing values per column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

Missing values per column:
Series([], dtype: int64)


In [9]:
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col].fillna(df[col].median(), inplace=True)
    elif df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown', inplace=True)

print("Missing values after imputation:")
print(df.isnull().sum().sum())

Missing values after imputation:
0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [10]:
text_columns = df.select_dtypes(include=['object']).columns

for col in text_columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(str)
        df[col] = df[col].str.lower()
        df[col] = df[col].str.replace('[^a-zA-Z0-9\s]', '', regex=True)
        df[col] = df[col].str.strip()

print("Text preprocessing completed")

Text preprocessing completed


  df[col] = df[col].str.replace('[^a-zA-Z0-9\s]', '', regex=True)


In [11]:
validation_issues = []

for col in df.columns:
    if 'price' in col.lower():
        negative_prices = df[df[col] < 0]
        if len(negative_prices) > 0:
            validation_issues.append(f"Found {len(negative_prices)} negative prices in {col}")
            df[col] = df[col].abs()

    elif 'rating' in col.lower():
        invalid_ratings = df[(df[col] < 1) | (df[col] > 5)]
        if len(invalid_ratings) > 0:
            validation_issues.append(f"Found {len(invalid_ratings)} invalid ratings in {col}")
            df[col] = df[col].clip(1, 5)

if validation_issues:
    print("Validation issues found and corrected:")
    for issue in validation_issues:
        print(f"- {issue}")
else:
    print("No validation issues found")

No validation issues found


In [12]:
print("Final dataset summary:")
print(f"Shape: {df.shape}")
print(f"Data types:\n{df.dtypes}")
print(f"Missing values: {df.isnull().sum().sum()}")
df.head()


Final dataset summary:
Shape: (1000, 4)
Data types:
title            object
price           float64
rating            int64
availability     object
dtype: object
Missing values: 0


Unnamed: 0,title,price,rating,availability
0,a light in the attic,51.77,3,in stock
1,tipping the velvet,53.74,1,in stock
2,soumission,50.1,1,in stock
3,sharp objects,47.82,4,in stock
4,sapiens a brief history of humankind,54.23,5,in stock
