In [7]:
import pandas as pd

# Step 1: Load dataset with error handling
def load_data(file_path):
    """Read a CSV file into a DataFrame, reporting common load failures.

    Args:
        file_path: Location of the CSV file; passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, empty,
        or cannot be parsed. A message is printed in every case.
    """
    loaded = None
    try:
        loaded = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"❌ Error: File '{file_path}' not found.")
    except pd.errors.EmptyDataError:
        print("❌ Error: File is empty.")
    except pd.errors.ParserError:
        print("❌ Error: File is corrupted or not properly formatted.")
    else:
        # Only announce success when read_csv finished without raising.
        print("✅ Dataset loaded successfully.")
    return loaded

# Step 2: Remove duplicates based on specific columns (if any), or entire row
def remove_duplicates(df, subset_columns=None):
    """Drop duplicate rows and report how many were removed.

    Args:
        df: DataFrame to de-duplicate; it is not modified.
        subset_columns: Column label(s) used to decide whether two rows are
            duplicates. ``None`` (the default) compares entire rows.

    Returns:
        The DataFrame produced by ``df.drop_duplicates``.
    """
    rows_before = len(df)
    deduplicated = df.drop_duplicates(subset=subset_columns)
    removed = rows_before - len(deduplicated)
    print(f"🧹 Removed {removed} duplicate rows.")
    return deduplicated

# Step 3: Fix data types with error handling
def fix_data_types(df, conversions):
    """Convert columns to the requested dtypes on a best-effort basis.

    Each conversion is attempted independently: a failure (unknown column,
    unparsable values, unsupported dtype) is reported with a printed message
    and the remaining conversions still run.

    Args:
        df: Source DataFrame. It is left unmodified; conversions are applied
            to a copy.
        conversions: Mapping of column name to target dtype. The string
            ``'datetime'`` routes the column through ``pd.to_datetime``; any
            other value is handed to ``Series.astype``.

    Returns:
        A new DataFrame with every successful conversion applied.
    """
    # Convert a copy so the caller's frame is never mutated. This also keeps
    # column assignment off frames that are filtered results of another frame
    # (e.g. drop_duplicates output), which pandas' chained-assignment checks
    # may warn about.
    converted = df.copy()
    for col, dtype in conversions.items():
        try:
            # Only a string can be the 'datetime' marker; checking the type
            # first avoids comparing dtype objects against it.
            if isinstance(dtype, str) and dtype == 'datetime':
                converted[col] = pd.to_datetime(converted[col])
            else:
                converted[col] = converted[col].astype(dtype)
        except Exception as e:
            # Deliberately broad: one bad column must not abort the others.
            print(f"❌ Failed to convert column '{col}' to {dtype}. Error: {e}")
        else:
            print(f"✅ Converted column '{col}' to {dtype}.")
    return converted

# Step 4: Save cleaned data
def save_cleaned_data(df, output_path):
    """Write a DataFrame to CSV without the index column.

    Any exception raised while writing is caught and reported with a printed
    message rather than propagated.

    Args:
        df: DataFrame to write.
        output_path: Destination passed to ``DataFrame.to_csv``.
    """
    try:
        df.to_csv(output_path, index=False)
    except Exception as err:
        print(f"❌ Failed to save cleaned data. Error: {err}")
    else:
        print(f"💾 Cleaned data saved to '{output_path}'.")

# --- MAIN EXECUTION ---

# Input and output CSV locations
input_file = 'samp_data.csv'
output_file = 'cleaned_data_Q2.csv'

# Column name -> target dtype, consumed by fix_data_types.
# 'datetime' is parsed with pd.to_datetime; any other value goes to astype().
# Example: {'age': 'int', 'date_joined': 'datetime', 'customer_id': 'str'}
conversions = {}  # Fill in with the dataset's actual columns

# Columns that identify a duplicate row, e.g. ['id', 'email'].
# None means rows are duplicates only when every column matches.
subset_columns = None

# Pipeline: load -> preview -> de-duplicate -> convert dtypes -> save
df = load_data(input_file)

if df is not None:
    print("\n🔍 Preview of raw data:")
    print(df.head())

    # De-duplicate first, then convert the dtypes of the surviving rows.
    df = fix_data_types(remove_duplicates(df, subset_columns), conversions)

    print("\n✅ Final data types:")
    print(df.dtypes)

    save_cleaned_data(df, output_file)


✅ Dataset loaded successfully.

🔍 Preview of raw data:
     A         B    C    D
0  1.0       cat  1.1  NaN
1  2.0       dog  NaN  NaN
2  NaN    rabbit  3.5  3.2
3  4.0       NaN  4.0  4.1
4  5.0  elephant  NaN  5.0
🧹 Removed 0 duplicate rows.

✅ Final data types:
A    float64
B     object
C    float64
D    float64
dtype: object
💾 Cleaned data saved to 'cleaned_data_Q2.csv'.
