In [1]:
import pandas as pd
from outlier_null_remover import preprocess_dataset

In [2]:
# Load the Titanic passenger data from a CSV file located in the notebook's working directory.
df = pd.read_csv("titanic.csv")
# Preview the first rows as a quick check of the parsed columns.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Count the missing (NaN) entries in each column of the raw frame.
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [4]:
def preprocess_dataset(dataframe, cols, handle_missing_values=True,
                       missing_value_cols=None, missing_value_method='mean',
                       handle_outliers=True, outlier_cols=None,
                       outlier_removal_method='iqr',
                       numerical_cols=None, categorical_cols=None):
    """Impute missing values and drop outlier rows, returning a new DataFrame.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to clean.
    cols : list
        Currently unused. Kept in the signature so existing calls keep working.
    handle_missing_values : bool, default True
        Whether to run the imputation step.
    missing_value_cols : list or None
        Columns to impute. ``None`` means no columns. Names that are not in
        ``dataframe`` are skipped.
    missing_value_method : {'mean', 'median'}, default 'mean'
        Statistic used for columns listed in ``numerical_cols``. Any other
        value leaves numerical columns untouched.
    handle_outliers : bool, default True
        Whether to run the outlier-removal step.
    outlier_cols : list or None
        Columns to screen for outliers. ``None`` means no columns. Names that
        are not in ``dataframe`` are skipped.
    outlier_removal_method : {'iqr', 'z_score'}, default 'iqr'
        ``'iqr'`` keeps rows within ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``;
        ``'z_score'`` keeps rows whose absolute z-score is at most 3.
        Any other value disables filtering.
    numerical_cols : list or None
        Columns imputed with ``missing_value_method``. ``None`` means none.
    categorical_cols : list or None
        Columns imputed with their most frequent value. ``None`` means none.

    Returns
    -------
    pandas.DataFrame
        A new frame with the requested columns imputed and outlier rows
        removed. The original index labels of the surviving rows are kept.

    Notes
    -----
    * A column in ``missing_value_cols`` that appears in neither
      ``numerical_cols`` nor ``categorical_cols`` is left as is.
    * Rows whose value in an outlier column is missing fail the range
      comparison and are therefore dropped together with the outliers.
    * A column whose standard deviation is zero or undefined is skipped by the
      ``'z_score'`` method, because every z-score would be NaN and the filter
      would otherwise discard all rows.
    * In this notebook the definition shadows the ``preprocess_dataset`` name
      imported from ``outlier_null_remover`` in the first cell.
    """
    # Treat every optional column list as empty when it is not supplied, so the
    # membership tests below never run against None.
    missing_value_cols = [] if missing_value_cols is None else missing_value_cols
    outlier_cols = [] if outlier_cols is None else outlier_cols
    numerical_cols = [] if numerical_cols is None else numerical_cols
    categorical_cols = [] if categorical_cols is None else categorical_cols

    # Work on a copy: the caller's frame stays intact and re-running the cell
    # gives the same result.
    result = dataframe.copy()

    # Handling missing values
    if handle_missing_values:
        for col in missing_value_cols:
            if col not in result.columns:
                continue

            if col in numerical_cols:
                if missing_value_method == 'mean':
                    fill_value = result[col].mean()
                elif missing_value_method == 'median':
                    fill_value = result[col].median()
                else:
                    # Unrecognised method: leave the column untouched.
                    continue
            elif col in categorical_cols:
                modes = result[col].mode()
                if modes.empty:
                    # Column holds only missing values; there is no mode to use.
                    continue
                fill_value = modes.iloc[0]
            else:
                continue

            # Assign the filled column back instead of calling
            # fillna(inplace=True) on a selection, which does not reliably
            # update the parent frame.
            result[col] = result[col].fillna(fill_value)

    # Handling outliers
    if handle_outliers:
        for col in outlier_cols:
            if col not in result.columns:
                continue

            values = result[col]
            if outlier_removal_method == 'iqr':
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                # between() is inclusive on both ends by default.
                keep = values.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
            elif outlier_removal_method == 'z_score':
                std = values.std()
                if not std > 0:
                    # Zero or NaN spread: z-scores are undefined, skip the column.
                    continue
                # Keep only rows within 3 standard deviations of the mean.
                keep = ((values - values.mean()) / std).abs() <= 3
            else:
                continue

            result = result[keep]

    return result


In [5]:
# Fill missing ages (numerical, default mean) and embarkation ports (categorical, mode),
# then drop rows whose fare falls outside the default IQR range.
df1 = preprocess_dataset(
    df,
    cols=["age", "fare", "embarked"],
    missing_value_cols=["age", "embarked"],
    outlier_cols=["fare"],
    numerical_cols=["age"],
    categorical_cols=["embarked"],
)

In [6]:
# Display the frame returned by preprocess_dataset.
df1

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
5,1,1,"Anderson, Mr. Harry",male,48.000000,0,0,19952,26.5500,E12,S,3,,"New York, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.000000,0,0,112050,0.0000,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.000000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.000000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
14,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.000000,0,0,27042,30.0000,A23,S,B,,"Hessle, Yorks"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.500000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,29.881138,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.500000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.000000,0,0,2670,7.2250,,C,,,
