In [None]:
# Measuring Data Completeness: Missing Data Rates and Handling

import pandas as pd
import numpy as np

# Read the dataset into a DataFrame
# NOTE: 'your_dataset.csv' is a placeholder -- substitute the real file path
df = pd.read_csv('your_dataset.csv')

# Preview the leading rows as a quick sanity check
print("First few rows of the dataset:")
print(df.head(5))

# Report the dataset dimensions (rows first, then columns)
total_rows = len(df)
total_columns = len(df.columns)
print(f"\nTotal rows: {total_rows}")
print(f"Total columns: {total_columns}")

# Count the missing entries in every column
missing_values_count = df.isna().sum()

# Express each count as a share of the total row count, in percent
missing_values_percent = missing_values_count.div(total_rows) * 100

# Assemble both statistics into one table indexed by column name
missing_data = missing_values_count.to_frame(name='Missing Values')
missing_data['Percentage (%)'] = missing_values_percent

# Report only the columns that actually contain missing values
print("\nMissing data per column:")
print(missing_data.loc[missing_data['Missing Values'] > 0])

# Handling missing data

# Option 1: Discard columns whose missing-value percentage is strictly
# above the threshold (a column at exactly the threshold is kept)
threshold = 50.0
columns_to_drop = missing_data.index[
    (missing_data['Percentage (%)'] > threshold).to_numpy()
]
df_dropped_columns = df.drop(labels=columns_to_drop, axis='columns')
print(f"\nDropped columns with more than {threshold}% missing values: {list(columns_to_drop)}")

# Option 2: Keep only the rows that are complete, i.e. discard every row
# that has at least one missing value in any column
df_dropped_rows = df.dropna(axis=0, how='any')
print(f"\nNumber of rows after dropping rows with missing values: {len(df_dropped_rows)}")

# Option 3: Mean imputation, applied to the numeric columns only
df_filled_mean = df.copy()
# Columns with a NumPy numeric dtype; all other columns are left untouched
numeric_columns = df_filled_mean.select_dtypes(include=np.number).columns
# The Series of means is matched on column name, so each column is filled
# with its own mean
df_filled_mean[numeric_columns] = df_filled_mean[numeric_columns].fillna(
    value=df_filled_mean[numeric_columns].mean(axis=0)
)
print("\nFilled missing numeric values with column means.")

# Option 4: Fill missing values with mode (for categorical columns)
# Only object-dtype columns are treated as categorical here.
df_filled_mode = df.copy()
categorical_columns = df_filled_mode.select_dtypes(include=['object']).columns
for column in categorical_columns:
    # mode() skips missing values by default, so the result is empty only
    # when the column has no non-missing entries; such columns stay as-is.
    mode_value = df_filled_mode[column].mode()
    if mode_value.empty:
        continue
    # Assign the filled column back instead of calling
    # fillna(..., inplace=True) on df_filled_mode[column]: that chained
    # in-place call operates on a temporary, raises a FutureWarning on
    # recent pandas and does not update the frame under Copy-on-Write.
    # When several values tie for the mode, the first one returned is used.
    df_filled_mode[column] = df_filled_mode[column].fillna(mode_value.iloc[0])
print("Filled missing categorical values with column modes.")

# Option 5: Fill gaps in the numeric columns by linear interpolation
# (uses the `numeric_columns` selection computed earlier in the script)
df_interpolated = df.copy()
df_interpolated[numeric_columns] = (
    df_interpolated[numeric_columns]
    .interpolate(method='linear')
)
print("Interpolated missing numeric values using linear method.")

# Write each cleaned variant to its own CSV file, omitting the row index
for output_path, cleaned_frame in (
    ('dataset_dropped_columns.csv', df_dropped_columns),
    ('dataset_dropped_rows.csv', df_dropped_rows),
    ('dataset_filled_mean.csv', df_filled_mean),
    ('dataset_filled_mode.csv', df_filled_mode),
    ('dataset_interpolated.csv', df_interpolated),
):
    cleaned_frame.to_csv(output_path, index=False)

print("\nCleaned datasets have been saved to CSV files.")