In [15]:
import pandas as pd
import chardet

# STEP 1: Detect encoding
with open("RawData.csv", "rb") as f:
    raw_data = f.read(50000)
    result = chardet.detect(raw_data)
    detected_encoding = result['encoding']
    print(f"✅ Detected encoding: {detected_encoding}")

# STEP 2: Load dataset using detected encoding
df = pd.read_csv("RawData.csv", encoding=detected_encoding, on_bad_lines='skip')

# STEP 3: Ensure required columns exist
required_columns = ['ItemNo.', 'Author', 'Title', 'Edition', 'Imprint', 'Date', 'Call No.', 'ISBN']
for col in required_columns:
    if col not in df.columns:
        df[col] = ''

df = df[required_columns]

# STEP 4: Drop rows missing essential info
df.dropna(subset=['Title', 'Author'], inplace=True)

# STEP 5: Fill missing optional fields
df.fillna('', inplace=True)

# STEP 6: Remove duplicates
df.drop_duplicates(subset=['ItemNo.', 'Title', 'Author'], inplace=True)

# STEP 7: Save cleaned file
df.to_csv("cleaning_book_data.csv", index=False, encoding=detected_encoding)
print("✅ Cleaned dataset saved as 'cleaning_book_data.csv' with original language and formatting preserved.")


✅ Detected encoding: UTF-8-SIG


  df = pd.read_csv("RawData.csv", encoding=detected_encoding, on_bad_lines='skip')
  df.fillna('', inplace=True)


✅ Cleaned dataset saved as 'cleaning_book_data.csv' with original language and formatting preserved.
