# In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw dataset.
data = pd.read_csv('new_clean_data.csv')

# Columns that are imputed and outlier-treated below.
selected_features = [
    'T (K)', 'P (MPa)', 'methane', 'ethane', 'propane',
    'ibutane', 'ipentane', 'hexane', 'heptane',
    'helium',
    'oxygen', 'H2S', 'CO2',
]

# Imputation: replace missing values in the selected columns with the mean
# of each column.
data[selected_features] = data[selected_features].fillna(
    data[selected_features].mean()
)

# Dropping rows: remove every row that still has a missing value in ANY
# column, including columns that are not part of selected_features.
data = data.dropna()

# NOTE: a linear-interpolation pass over selected_features used to follow
# here. It was dead code: after the mean imputation and dropna() above there
# are no missing values left for it to fill, so it has been removed.

# Handling outliers, one feature at a time. Because `data` is re-filtered on
# every iteration, the statistics of later features are computed on rows
# that survived the trimming of all earlier features.
for feature in selected_features:
    # Trimming: keep only rows within mean ± 3 standard deviations.
    mean = data[feature].mean()
    std = data[feature].std()
    lower_bound = mean - 3 * std
    upper_bound = mean + 3 * std
    in_range = (data[feature] >= lower_bound) & (data[feature] <= upper_bound)
    # Take an explicit copy so the column assignment below writes to an
    # independent frame instead of a slice of the previous one (avoids
    # pandas' SettingWithCopyWarning).
    data = data.loc[in_range].copy()

    # Capping: clip values above the 95th percentile (upper tail only; the
    # lower tail is left untouched).
    percentile_95 = data[feature].quantile(0.95)
    data[feature] = data[feature].clip(upper=percentile_95)

# Set the style of the plots (optional but adds visual appeal). This is
# global, loop-invariant configuration, so it is applied once, outside the
# outlier loop.
sns.set(style="whitegrid")

# Export the cleaned dataframe to a CSV file.
data.to_csv('new_dataframe_model.csv', index=False)

# The cleaned and processed dataframe is now saved in
# "new_dataframe_model.csv", with missing data filled using
# statistical measures and outliers handled through trimming and capping.
