In [2]:
import pandas as pd
import numpy as np

# Load the heart-failure clinical records CSV into a DataFrame.
d = pd.read_csv(filepath_or_buffer="heart_failure_clinical_records_dataset.csv")

# Candidate integer types, ordered from narrowest to widest.
_UNSIGNED_INT_TYPES = (np.uint8, np.uint16, np.uint32, np.uint64)
_SIGNED_INT_TYPES = (np.int8, np.int16, np.int32, np.int64)


def _smallest_int_dtype(c_min, c_max):
    """Return the narrowest NumPy integer type whose range contains [c_min, c_max]."""
    candidates = _UNSIGNED_INT_TYPES if c_min >= 0 else _SIGNED_INT_TYPES
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 255 still fits uint8.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    # Nothing narrower fits; fall back to the widest type of the family.
    return candidates[-1]


def optimize_memory(df, *, category_threshold=0.5, downcast_floats=True,
                    verbose=True):
    """
    Optimize memory usage of a pandas DataFrame by downcasting numeric data types.

    Integer columns are converted to the narrowest NumPy integer type that
    holds their observed min/max (unsigned when no value is negative).
    Float columns wider than 32 bits are converted to float32 when their
    min/max fit its range. Object/string columns are converted to
    ``category`` when the share of distinct values is below
    ``category_threshold``. Other dtypes are left as they are.

    Parameters:
    -----------
    df : pandas.DataFrame
        The frame to optimize. It is not modified.
    category_threshold : float, default 0.5
        Object/string columns whose ``nunique() / len(df)`` is strictly below
        this value are converted to ``category``.
    downcast_floats : bool, default True
        Whether to convert wide float columns to float32. float32 keeps only
        about 7 significant decimal digits, so values may change slightly
        (e.g. 263358.03 becomes 263358.03125); pass False to keep floats
        untouched.
    verbose : bool, default True
        Whether to print the before/after memory report.

    Returns:
    --------
    pandas.DataFrame
        A memory-optimized copy of the input DataFrame
    """
    # Work on a copy so the caller's frame is never modified.
    result = df.copy()
    n_rows = len(result)

    start_memory = result.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Memory usage before optimization: {start_memory:.2f} MB")

    float32_limits = np.finfo(np.float32)

    for col in result.columns:
        series = result[col]
        col_type = series.dtype

        if pd.api.types.is_integer_dtype(col_type):
            # An empty column has no min/max, and a nullable column holding
            # <NA> cannot be cast to a plain NumPy integer type; leave both
            # untouched instead of crashing.
            if n_rows == 0 or series.isna().any():
                continue
            target = _smallest_int_dtype(int(series.min()), int(series.max()))
            result[col] = series.astype(target)

        elif pd.api.types.is_float_dtype(col_type):
            # Only ever narrow: float16/float32 columns are already at or
            # below the target width, so converting them could only widen.
            if not downcast_floats or getattr(col_type, "itemsize", 8) <= 4:
                continue
            c_min = series.min()
            c_max = series.max()
            # min/max are NaN/<NA> for empty or all-missing columns.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            # Infinite or out-of-range extremes fail this check, so such
            # columns keep their current dtype.
            if c_min >= float32_limits.min and c_max <= float32_limits.max:
                result[col] = series.astype(np.float32)

        elif (pd.api.types.is_object_dtype(col_type)
              or isinstance(col_type, pd.StringDtype)):
            # StringDtype is included because recent pandas versions may load
            # text columns as a string dtype rather than object.
            # Guard n_rows so an empty frame does not divide by zero.
            if n_rows and series.nunique() / n_rows < category_threshold:
                result[col] = series.astype('category')

    end_memory = result.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Memory usage after optimization: {end_memory:.2f} MB")
        # Avoid dividing by zero when the frame occupies no memory at all.
        reduction = (
            100 * (start_memory - end_memory) / start_memory
            if start_memory else 0.0
        )
        print(f"Memory reduced by: {reduction:.2f}%")

    return result
# Run the optimizer on the loaded data. The returned DataFrame is not assigned,
# so `d` itself keeps its original dtypes.
optimize_memory(d)

Memory usage before optimization: 0.03 MB
Memory usage after optimization: 0.01 MB
Memory reduced by: 76.60%


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00000,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03125,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00000,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00000,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00000,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00000,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00000,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00000,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00000,1.4,140,1,1,280,0
