In [55]:
import pandas as pd
import os
import sys
from deep_translator import GoogleTranslator  # Using deep_translator for translating column names
from scipy.stats import zscore

def translate_column_names(df, garbage_bin):
    old_columns = df.columns.tolist()
    translated_columns = [GoogleTranslator(source='auto', target='en').translate(col) for col in old_columns]

    # Store old column names in the garbage bin
    old_names_df = pd.DataFrame({'Old Column Names': old_columns})
    garbage_bin = pd.concat([garbage_bin, old_names_df], ignore_index=True)
    df.columns = translated_columns

    return df, garbage_bin

def remove_outliers(df, garbage_bin, z_thresh=3):
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if numeric_cols.empty:
        print("No numeric columns to check for outliers.")
        return df, garbage_bin

    z_scores = df[numeric_cols].apply(zscore)
    outliers = (z_scores.abs() > z_thresh).any(axis=1)

    # Log outliers in the garbage bin
    garbage_bin = pd.concat([garbage_bin, df[outliers]], ignore_index=True)

    # Remove outliers from the DataFrame
    df = df[~outliers]

    return df, garbage_bin

def handle_invalid_values(df, garbage_bin):
    invalid_values = df.isnull().any(axis=1)

    # Log invalid values in the garbage bin
    garbage_bin = pd.concat([garbage_bin, df[invalid_values]], ignore_index=True)

    # Remove invalid values from the DataFrame
    df = df.dropna()

    return df, garbage_bin

def save_dataframe_to_csv(df, file_path):
    if df.empty:
        print("Warning: The DataFrame is empty. No file will be saved.")
        return
    try:
        df.to_csv(file_path, index=False)
        print(f"File saved successfully to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")

# Initialize an empty garbage bin DataFrame
garbage_bin = pd.DataFrame()

# Read the initial CSV file
try:
    df = pd.read_csv("China.csv", sep=',', low_memory=True)
    if df.empty:
        print("Warning: The input CSV file is empty.")
    else:
        print("CSV file loaded successfully.")
except FileNotFoundError:
    print("Error: The file 'China.csv' was not found.")
    sys.exit(1)
except pd.errors.EmptyDataError:
    print("Error: The file is empty.")
    sys.exit(1)
except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    sys.exit(1)

# Step 1: Translate column names to English
df, garbage_bin = translate_column_names(df, garbage_bin)

# Step 2: Remove outliers and log them in the garbage bin
df, garbage_bin = remove_outliers(df, garbage_bin)

# Step 3: Handle null and invalid values, log them in the garbage bin
df, garbage_bin = handle_invalid_values(df, garbage_bin)

# Columns to drop and rename
drop_columns = ['Unnamed: 21', 'educate', 'marriage', 'Monthly salary',
                'industry', 'post code', 'address',
                'Province', 'gender', 'cell phone', 'City', 'Model']

new_columns = {
    'Frame number': 'frame_number',
    'BRAND': 'brand',
    'Car': 'car',
    'Car Series': 'car_series',
    'Configuration': 'configuration',
    'Engine No.': 'engine_number',
    'ID card': 'id_card',
    'Mail': 'email_address',
    'Name': 'name',
    'Birthday': 'date_of_birth'
}

# Handle dropped columns
dropped_columns_data = df[drop_columns]  # Extract dropped columns data
df = df.drop(columns=drop_columns, errors='ignore')  # Drop specified columns

# Log dropped columns in the garbage bin
for col in drop_columns:
    if col in dropped_columns_data.columns:
        # Instead of logging all the data in the column, log that the column was dropped
        garbage_bin = pd.concat([
            garbage_bin,
            pd.DataFrame({
                'Old Column Names': [col],
                'Dropped Column Data': [f"Column '{col}' was dropped"]
            })
        ], ignore_index=True)

# Rename columns and log old column names into the garbage bin
for old_col, new_col in new_columns.items():
    if old_col in df.columns:
        garbage_bin = pd.concat([
            garbage_bin,
            pd.DataFrame({'Old Column Names': [old_col], 'Dropped Column Data': [None]})
        ], ignore_index=True)
        df = df.rename(columns={old_col: new_col})

# Save cleaned DataFrame to a new file
cleaned_file_path = r'C:\Users\omari\OneDrive\Documents\Vault\Test\China\Clean_Data.csv'
save_dataframe_to_csv(df, cleaned_file_path)

# Save garbage bin DataFrame to a separate file
garbage_bin_file_path = r'C:\Users\omari\OneDrive\Documents\Vault\Test\China\Garbage_Bin.csv'
save_dataframe_to_csv(garbage_bin, garbage_bin_file_path)


  df = pd.read_csv("China.csv", sep=',', low_memory=True)


CSV file loaded successfully.
File saved successfully to C:\Users\omari\OneDrive\Documents\Vault\Test\China\Clean_Data.csv
File saved successfully to C:\Users\omari\OneDrive\Documents\Vault\Test\China\Garbage_Bin.csv
