In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw dataset from disk
data = pd.read_csv('dataframe.csv')

# Columns that take part in the cleaning steps below
selected_features = [
    'T (K)',
    'P (MPa)',
    'methane',
    'ethane',
    'propane',
    'ibutane',
    'ipentane',
    'hexane',
    'heptane',
    'helium',
    'oxygen',
    'H2S',
    'CO2',
    'z',
]

# Row-removal threshold: 20% of the number of selected features.
# Consumed by the row-removal step further down.
threshold_to_remove = 0.2 * len(selected_features)

# Define a function to fill missing values with the mean of the last n (or fewer) data points
def fill_missing_with_last_n_mean(df: pd.DataFrame, features, n: int = 20) -> None:
    """Fill missing values in place with the mean of the preceding rows.

    For every missing cell in each column of ``features``, the cell is
    replaced by the mean of the non-missing values found in the ``n`` rows
    directly before it (fewer when the cell is closer than ``n`` rows to the
    top of the frame). Cells with no usable preceding value are left missing.

    Rows are processed top to bottom and ``df`` is modified in place, so a
    value filled earlier in a column is part of the window of later cells.

    Args:
        df: DataFrame to modify in place.
        features: Iterable of column labels to process.
        n: Number of preceding rows to look back over.

    Returns:
        None. ``df`` is mutated.
    """
    for feature in features:
        # Work with positions throughout so the result does not depend on
        # the frame having a default 0..len-1 index.
        col_pos = df.columns.get_loc(feature)
        # Snapshot of the originally-missing cells, computed once per column
        # instead of once per row.
        missing_mask = df[feature].isnull().tolist()
        for pos, is_missing in enumerate(missing_mask):
            if not is_missing:
                continue
            # The n rows immediately before `pos` (end of slice is exclusive).
            window = df.iloc[max(pos - n, 0):pos, col_pos].dropna()
            if not window.empty:
                df.iloc[pos, col_pos] = window.mean()

# Fill every missing value in the selected features with the mean of the
# values that precede it in the same column (look-back of n=20).
# Note: this runs on all rows, not only on rows with many missing values.
fill_missing_with_last_n_mean(data, selected_features, n=20)

# Remove rows that still have too many missing values among the selected
# features. `dropna(thresh=...)` expects the minimum number of NON-missing
# values a row needs in order to be kept, so the allowed number of missing
# values (threshold_to_remove) is converted into that minimum. `subset`
# restricts the count to the selected features instead of every column.
min_non_missing = len(selected_features) - int(threshold_to_remove)
data = data.dropna(subset=selected_features, thresh=min_non_missing)

# Define a function to remove outliers using the IQR method
def remove_outliers(df, features, multiplier=1.5):
    """Return ``df`` without rows that fall outside the IQR fences.

    The columns in ``features`` are handled one after another. For each
    column the quartiles are taken from the rows that survived the previous
    columns, and only rows whose value lies within
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` (bounds inclusive)
    are kept. Rows with a missing value in the column fail the comparison
    and are dropped as well.

    Args:
        df: Input DataFrame; it is not modified.
        features: Iterable of column labels to filter on.
        multiplier: Width of the fences in units of the IQR.

    Returns:
        The filtered DataFrame.
    """
    remaining = df
    for column in features:
        series = remaining[column]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        fence_width = multiplier * (third_quartile - first_quartile)
        # `between` is inclusive on both ends by default.
        inside = series.between(first_quartile - fence_width,
                                third_quartile + fence_width)
        remaining = remaining[inside]
    return remaining

# Specify the multiplier to control the outlier removal (adjust as needed)
outlier_multiplier = 1.5

# Remove outliers from selected features. The configured multiplier is passed
# explicitly so that adjusting `outlier_multiplier` above actually takes effect.
data = remove_outliers(data, selected_features, multiplier=outlier_multiplier)

# Set the style of the plots (optional but adds visual appeal)
sns.set(style="whitegrid")

# Export the cleaned dataframe to a CSV file (row index is not written)
data.to_csv('cleaned_main_model_data.csv', index=False)

# The cleaned and processed dataframe is now saved in
# "cleaned_main_model_data.csv", with missing data filled using
# the mean of the last 20 or fewer available data points in
# the columns with missing values.
