In [1124]:
# LIBRARIES
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [1125]:
def read_data(file_name, header=0, sep=','):
    """
    Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file_name : path or file-like object, handed straight to ``pd.read_csv``.
    header : row number holding the column names (``None`` for no header row).
    sep : field delimiter.

    Returns
    -------
    pandas.DataFrame with the parsed contents.
    """
    read_options = {'sep': sep, 'header': header}
    return pd.read_csv(file_name, **read_options)

In [1126]:
def try_parse_float(value):
    """
    Report whether ``value`` can be converted with ``float()``.

    Parameters
    ----------
    value : any object (typically a string or number read from a DataFrame cell).

    Returns
    -------
    bool : True when ``float(value)`` succeeds, False when the conversion
    is rejected.

    Only genuine conversion failures are treated as "not a float". Other
    exceptions (e.g. ``KeyboardInterrupt``) propagate instead of being
    silently turned into False.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported object (e.g. None); ValueError: unparsable
        # text; OverflowError: integer too large to represent as a float.
        return False
    return True

In [1127]:
def detect_correct_datatype(df, column):
    """
    Convert ``df[column]`` to a numeric dtype, in place, when most of its
    entries look like numbers.

    The share of rows that are NOT usable floats (unparsable or missing) is
    computed; if it is at most 25 %, the column is replaced by
    ``pd.to_numeric(..., errors='coerce')``, which turns the remaining
    unparsable entries into NaN. Otherwise the column is left untouched.
    """
    values = df[column]
    total_rows = df.shape[0]

    # float() accepts NaN, so missing entries are counted as parseable by
    # try_parse_float and have to be subtracted again.
    parseable_count = values.apply(lambda cell: try_parse_float(cell)).sum() - values.isna().sum()

    non_float_pct = ((total_rows - parseable_count) / total_rows) * 100
    if non_float_pct <= 25:
        df[column] = pd.to_numeric(values, errors='coerce')

In [1128]:
def convert_type(val, type):
    """
    Convert a single value to one of the supported numpy scalar types.

    Parameters
    ----------
    val : value to convert.
    type : target type name; one of ``'int64'``, ``'float64'``, ``'uint8'``.

    Returns
    -------
    The converted numpy scalar, ``np.nan`` when the value cannot be converted
    to the requested type, or ``None`` when ``type`` is not one of the
    supported names.

    Only conversion failures are mapped to ``np.nan``; unrelated exceptions
    (e.g. ``KeyboardInterrupt``) are no longer swallowed.
    """
    try:
        if type == 'int64':
            return np.int64(val)
        elif type == 'float64':
            return np.float64(val)
        elif type == 'uint8':
            return np.uint8(val)
    except (TypeError, ValueError, OverflowError):
        # Unsupported object, unparsable text / NaN-to-int, or out-of-range value.
        return np.nan
    # Unknown type name: nothing to convert to.
    return None

In [1129]:
def is_numeric_or_categorical(df, column):
    """
    Classify a column as numeric (continuous) or categorical from how often
    its values repeat.

    Returns
    -------
    (is_numeric, repeated_pct) : ``repeated_pct`` is the percentage of rows
    that do not introduce a new distinct value; the column is reported as
    numeric (True) when that percentage is below 93.
    """
    row_count = df.shape[0]
    distinct_count = len(df[column].unique())
    repeated_pct = ((row_count - distinct_count) / row_count) * 100
    return repeated_pct < 93, repeated_pct

In [1130]:
def remove_columns(df, columns):
    """
    Drop the given column label(s) from ``df``.

    The frame is modified in place; nothing is returned.
    """
    df.drop(columns=columns, inplace=True)

In [1131]:
def remove_rows(df, rows):
    """
    Drop the rows with the given index label(s) from ``df``.

    The frame is modified in place; nothing is returned.
    """
    df.drop(index=rows, inplace=True)

In [1132]:
def drop_rows_condition(df, conition):
    """
    Return the rows of ``df`` for which the boolean mask ``conition`` is False.

    ``df`` itself is not modified; the caller must use the returned frame.
    """
    keep_mask = ~conition
    return df[keep_mask]

In [1133]:
def remove_duplicates(df):
    """
    Drop fully duplicated rows from ``df``, keeping the first occurrence.

    The frame is modified in place; nothing is returned.
    """
    df.drop_duplicates(subset=None, keep='first', inplace=True)

In [1134]:
def convert_to_datatype(df, column, datatype):
    """
    Cast ``df[column]`` to ``datatype`` in place.

    ``'object'`` is handled with a plain ``astype``; every other type name is
    converted cell by cell through ``convert_type``.
    """
    if datatype != 'object':
        df[column] = df[column].apply(lambda cell: convert_type(cell, datatype))
        return
    df[column] = df[column].astype(datatype)

In [1135]:
def remove_outlier_numeric(df, column, min=-np.inf, max=np.inf):
    """
    Return the rows of ``df`` whose ``column`` value lies within [min, max].

    Parameters
    ----------
    df : DataFrame to filter (not modified).
    column : name of the numeric column to test.
    min : inclusive lower bound; ``-np.inf`` (default) disables it.
    max : inclusive upper bound; ``np.inf`` (default) disables it.

    Returns
    -------
    A new DataFrame with the rows that satisfy the active bound(s). When
    neither bound is set, a copy of ``df`` is returned (previously this case
    fell through every branch and returned ``None``).
    """
    has_lower = min != -np.inf
    has_upper = max != np.inf
    values = df[column]

    if has_lower and has_upper:
        keep = (values >= min) & (values <= max)
    elif has_upper:
        keep = values <= max
    elif has_lower:
        keep = values >= min
    else:
        # No bound requested: nothing to filter, but still hand back a frame.
        return df.copy()
    return df[keep]

In [1136]:
def detect_outliers_std(df, column, std=3):
    """
    Flag values lying more than ``std`` standard deviations from the mean.

    Parameters
    ----------
    df : DataFrame holding the data.
    column : numeric column to inspect.
    std : number of standard deviations that defines the accepted band.

    Returns
    -------
    Boolean Series aligned with ``df``: True where the value falls outside
    ``mean ± std * standard deviation``.
    """
    values = df[column]

    # Centre and half-width of the accepted band.
    center = np.mean(values)
    half_width = np.std(values) * std

    low, high = center - half_width, center + half_width
    return (values < low) | (values > high)

In [1137]:
def detect_outliers_isolation_forest(df, column, contamination=0.1):
    """
    Flag outliers in a single column with an Isolation Forest.

    Parameters
    ----------
    df : DataFrame holding the data.
    column : column to inspect; its values are fed to the model as one feature.
    contamination : passed to ``IsolationForest`` unchanged.

    Returns
    -------
    Boolean numpy array, True where the model labelled the row ``-1``.
    """
    # The estimator expects a 2-D array: one row per sample, one feature.
    feature_matrix = df[column].to_numpy().reshape(-1, 1)

    forest = IsolationForest(random_state=0, contamination=contamination)
    labels = forest.fit_predict(feature_matrix)
    return labels == -1

In [1138]:
def correct_category_levenshtein(df, column, correct_categories, threshold=80):
    """
    Replace misspelt categories in ``df[column]`` with their closest match
    among ``correct_categories`` (fuzzywuzzy ``process.extractOne``).

    Parameters
    ----------
    df : DataFrame to correct; modified in place.
    column : name of the categorical column.
    correct_categories : iterable of accepted category values.
    threshold : a value is replaced only when the best match scores strictly
        above this number.

    Returns
    -------
    The same ``df`` object, for convenience.

    Missing values are skipped rather than passed to the fuzzy matcher, and
    an empty ``correct_categories`` leaves the column unchanged.
    """
    accepted = set(correct_categories)
    if len(accepted) == 0:
        # Nothing to match against; extractOne would return None.
        return df

    # Distinct non-missing values that are not already an accepted category.
    unknown_values = [value for value in df[column].dropna().unique() if value not in accepted]

    for unknown in unknown_values:
        best_match = process.extractOne(unknown, correct_categories)
        if best_match is not None and best_match[1] > threshold:
            df.loc[df[column] == unknown, column] = best_match[0]
    return df

In [1139]:
def convert_nominal_categories(df, columns):
    """
    One-hot encode the given nominal columns.

    Returns a new DataFrame produced by ``pd.get_dummies``; ``df`` itself is
    left unchanged.
    """
    encoded = pd.get_dummies(data=df, columns=columns)
    return encoded

In [1140]:
def convert_ordinal_category(df, column, order):
    """
    Replace each distinct value of ``df[column]`` with the corresponding
    entry of ``order``, in place.

    Parameters
    ----------
    df : DataFrame to modify.
    column : name of the column to recode.
    order : replacement values, one per distinct value of the column, listed
        in the order in which those values first appear (``Series.unique()``
        order). A missing value counts as a distinct value.

    Raises
    ------
    ValueError : if ``order`` does not have one entry per distinct value.

    The result is assigned back to ``df[column]`` instead of calling an
    in-place method on the ``df[column]`` selection, which does not update
    ``df`` when pandas Copy-on-Write is active.
    """
    levels = df[column].unique()
    if len(levels) != len(order):
        raise ValueError(
            "order must contain one value per distinct entry of the column "
            "(%d expected, %d given)" % (len(levels), len(order))
        )
    # A single lookup table maps all values at once, so replacement values
    # that coincide with existing values cannot cascade.
    recode = dict(zip(levels, order))
    df[column] = df[column].map(recode)

In [1141]:
def fill_missing_values(df, column, value):
    """
    Replace the missing entries of ``df[column]`` with ``value``, in place.

    The filled column is assigned back to the frame rather than calling
    ``fillna(..., inplace=True)`` on the ``df[column]`` selection, because
    that chained form does not update ``df`` when pandas Copy-on-Write is
    active.
    """
    df[column] = df[column].fillna(value)

In [1142]:
def drop_missing_values(df, column):
    """
    Remove every row of ``df`` whose ``column`` entry is missing.

    The frame is modified in place; nothing is returned.
    """
    required = [column]
    df.dropna(axis=0, subset=required, inplace=True)

In [1143]:
def fill_average_mode(df, column, is_numeric):
    """
    Fill the missing entries of ``df[column]`` in place with the column mean
    (numeric columns) or with its most frequent value (other columns).

    Parameters
    ----------
    df : DataFrame to modify.
    column : name of the column to fill.
    is_numeric : True to fill with the mean, False to fill with the mode.

    A non-numeric column with no observed values has no mode; it is left
    unchanged instead of raising. The filled column is assigned back to the
    frame because a chained ``df[column].fillna(..., inplace=True)`` does not
    update ``df`` when pandas Copy-on-Write is active.
    """
    if is_numeric:
        fill_value = df[column].mean()
    else:
        modes = df[column].mode()
        if modes.empty:
            # Every entry is missing: there is nothing to fill with.
            return
        fill_value = modes.iloc[0]
    df[column] = df[column].fillna(fill_value)

In [1144]:
def knn_impute(df, column, is_numeric):
    """
    Fill the missing entries of ``df[column]`` in place with k-nearest-
    neighbour predictions built from the other non-object columns.

    Parameters
    ----------
    df : DataFrame to modify.
    column : name of the column to impute.
    is_numeric : True to use ``KNeighborsRegressor``, False to use
        ``KNeighborsClassifier``.

    Behaviour notes
    ---------------
    * Features are every column other than ``column`` whose dtype is not
      ``object``.
    * Object columns are excluded *before* incomplete rows are discarded, so
      a gap in an unused text column no longer removes a training row.
    * Only rows whose feature values are all present are used for training
      and prediction; a row that is missing both ``column`` and one of its
      features is left unfilled.
    * Nothing happens when there is nothing to fill, no usable feature
      column, or no complete training row.
    """
    missing_target = df[column].isna()
    if not missing_target.any():
        return

    feature_mask = (df.columns != column) & (df.dtypes != 'object').to_numpy()
    feature_cols = df.columns[feature_mask]
    if len(feature_cols) == 0:
        return

    complete_features = df[feature_cols].notna().all(axis=1)
    train_rows = ~missing_target & complete_features
    predict_rows = missing_target & complete_features
    if not train_rows.any() or not predict_rows.any():
        return

    x_train = df.loc[train_rows, feature_cols]
    y_train = df.loc[train_rows, column]
    x_predict = df.loc[predict_rows, feature_cols]

    # Regression for numeric targets, classification otherwise. The estimator
    # default of 5 neighbours is capped by the number of training rows.
    estimator_cls = KNeighborsRegressor if is_numeric else KNeighborsClassifier
    model = estimator_cls(n_neighbors=min(5, len(x_train)))
    model.fit(x_train, y_train)
    df.loc[predict_rows, column] = model.predict(x_predict)

In [1145]:
def automatic_data_filler(df, column, output_column, is_numeric, no_corr=0.01, low_corr=0.5):
    """
    Choose and apply a missing-value strategy for ``column`` from its Pearson
    correlation with ``output_column``. ``df`` is modified in place.

    Parameters
    ----------
    df : DataFrame to modify.
    column : column whose missing values are handled.
    output_column : target column the correlation is measured against.
    is_numeric : forwarded to the mean/mode and KNN fillers.
    no_corr : |correlation| at or below this counts as "no correlation".
    low_corr : |correlation| above ``no_corr`` and at or below this counts as
        "low correlation"; anything higher is "high correlation".

    Returns
    -------
    str naming the action taken:
      * ``'column'``  - no correlation and at least 50 % of the values are
        missing: the column was removed;
      * ``'row'``     - no correlation and fewer than 50 % missing: the rows
        with a missing value were removed;
      * ``'average'`` - low correlation: filled with mean or mode;
      * ``'knn'``     - high correlation: filled by KNN imputation.

    A correlation that cannot be computed (NaN, e.g. a constant or entirely
    missing column) is treated as "no correlation" rather than falling
    through every branch.
    """
    # Correlation is measured on a scratch copy restricted to observed values;
    # object columns are recoded to integer codes first.
    df_temp = df.copy()
    drop_missing_values(df_temp, column)
    if df_temp[column].dtype == 'object':
        convert_ordinal_category(df_temp, column, list(range(len(df_temp[column].unique()))))

    p_score = df_temp[column].corr(df_temp[output_column], method='pearson')
    strength = abs(p_score)

    if pd.isna(p_score) or strength <= no_corr:
        # NO CORRELATION: decide by how much of the column is actually missing.
        missing_pct = df[column].isna().mean() * 100
        if missing_pct >= 50:
            # Too much is missing: the column is not worth keeping.
            remove_columns(df, [column])
            return 'column'
        # Only a few gaps: drop the affected rows.
        drop_missing_values(df, column)
        return 'row'

    if strength <= low_corr:
        # LOW CORRELATION
        fill_average_mode(df, column, is_numeric)
        return 'average'

    # HIGH CORRELATION
    knn_impute(df, column, is_numeric)
    return 'knn'

In [1146]:
def _column_summary(df, col, is_numeric):
    """Return the min/max/mean or unique-value entries of a column's record."""
    if is_numeric:
        return {
            'min': df[col].min(), 'max': df[col].max(), 'mean': df[col].mean(),
            'unique_count': np.nan, 'unique_values': np.nan,
        }
    uniques = df[col].unique()
    return {
        'min': np.nan, 'max': np.nan, 'mean': np.nan,
        'unique_count': len(uniques), 'unique_values': uniques.tolist(),
    }


def data_cleaning_suggestions(df, output_column):
    """
    Clean every feature column of ``df`` and describe what was found.

    For each column other than ``output_column`` the function, in order:
      1. converts the column to numeric when most entries parse as numbers;
      2. classifies it as numeric or categorical;
      3. handles outliers: rows flagged on numeric and non-object
         categorical columns are removed; outlying object categories are
         fuzzy-matched to the remaining categories, falling back to marking
         them missing if matching fails;
      4. fills or drops missing values via ``automatic_data_filler``.

    ``df`` is modified in place by these steps (rows and columns may be
    removed).

    Returns
    -------
    list of dict, one per processed column, with the keys ``column_name``,
    ``datatype`` (only for object/int64/float64/uint8), ``is_numeric``,
    ``min``, ``max``, ``mean``, ``unique_count``, ``unique_values``,
    ``dropped``, ``fill_method``, ``is_nominal`` and ``ordinal_order``.
    """
    df_jsons = []
    for col in df.columns[df.columns != output_column]:
        col_json = {"column_name": col}

        # Detect datatype
        detect_correct_datatype(df, col)
        dtype_name = str(df[col].dtype)
        if dtype_name in ('object', 'int64', 'float64', 'uint8'):
            col_json['datatype'] = dtype_name

        # Detect numeric or categorical
        is_numeric = False
        if df[col].dtype != 'object':
            is_numeric, _ = is_numeric_or_categorical(df, col)
        col_json['is_numeric'] = is_numeric

        # Detect outliers
        if df[col].dtype != 'object':
            if is_numeric:
                outliers = detect_outliers_std(df, col)
            else:
                outliers = detect_outliers_isolation_forest(df, col)
            # drop_rows_condition returns a new frame, so its result used to
            # be thrown away; remove the flagged rows from df itself.
            remove_rows(df, df.index[np.asarray(outliers)])
        else:
            df_temp = df.copy()
            convert_ordinal_category(df_temp, col, list(range(len(df_temp[col].unique()))))
            outliers = detect_outliers_isolation_forest(df_temp, col)
            try:
                # Categories seen on non-outlier rows are taken as correct.
                accepted = df.loc[~outliers, col].dropna().unique()
                correct_category_levenshtein(df, col, accepted)
            except Exception:
                # Best-effort fallback: mark the outlying entries as missing.
                df.loc[outliers, col] = np.nan

        col_json.update(_column_summary(df, col, is_numeric))

        # Missing data filler
        method = automatic_data_filler(df, col, output_column, is_numeric)
        col_json.update({"dropped": method == 'column'})
        col_json.update({'fill_method': "automatic"})

        # Refresh the statistics unless the column no longer exists.
        if method != 'column':
            col_json.update(_column_summary(df, col, is_numeric))

        # Completing json
        col_json.update({'is_nominal': np.nan})
        col_json.update({'ordinal_order': []})

        df_jsons.append(col_json)

    return df_jsons

In [None]:
def clean_data(df, output_column, operations):
    """
    Placeholder for applying a list of cleaning operations to ``df``.

    Not implemented yet: the arguments are ignored and ``None`` is returned.
    """
    return None

# TESTING FUNCTIONS

In [1147]:
# Load the raw Titanic CSV (first row is the header) and print the number of
# missing values in each column.
titanic_df = read_data('../data/raw/titanic/titanic.csv', header=0)
print(titanic_df.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [None]:
# Build the per-column cleaning records for the Titanic data, with 'Survived'
# as the target column.
# NOTE(review): the helpers called here modify titanic_df in place, so
# re-running this cell without reloading the CSV starts from altered data.
data_cleaning_suggestions(titanic_df, 'Survived')

In [None]:
# remove_columns(titanic_df, ['Cabin'])
# drop_missing_values(titanic_df, 'Embarked')
# knn_impute(titanic_df, 'Age', True)

In [None]:
# Inspect the frame after cleaning: column dtypes, remaining missing values
# per column, and the first rows.
print(titanic_df.dtypes)
print(titanic_df.isna().sum())
titanic_df.head()

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S


In [None]:
# Run the missing-value filler on each feature column against 'Survived'.
# The last argument is the is_numeric flag passed on to the mean/mode and
# KNN fillers.
# NOTE(review): automatic_data_filler has no active print calls, so the
# text recorded under this cell presumably comes from an earlier version.
automatic_data_filler(titanic_df, 'PassengerId', 'Survived', True)
automatic_data_filler(titanic_df, 'Pclass', 'Survived', False)
automatic_data_filler(titanic_df, 'Name', 'Survived', False)
automatic_data_filler(titanic_df, 'Sex', 'Survived', False)
automatic_data_filler(titanic_df, 'Age', 'Survived', True)
automatic_data_filler(titanic_df, 'SibSp', 'Survived', True)
automatic_data_filler(titanic_df, 'Parch', 'Survived', False)
automatic_data_filler(titanic_df, 'Ticket', 'Survived', False)
automatic_data_filler(titanic_df, 'Fare', 'Survived', True)
automatic_data_filler(titanic_df, 'Cabin', 'Survived', False)
automatic_data_filler(titanic_df, 'Embarked', 'Survived', False)

PassengerId No missing values
Pclass No missing values
Name No missing values
Sex No missing values
Age -0.07722109457217764 Low correlation
SibSp No missing values
Parch No missing values
Ticket No missing values
Fare No missing values
Cabin 0.04578929038654076 Low correlation
Embarked 0.10866867101787406 Low correlation


In [None]:
# Preview the first rows of the frame after the fillers above have run.
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",female,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S


In [None]:
# df_temp = titanic_df.copy()
# for col in titanic_df.columns[titanic_df.columns != 'Survived']:
#     drop_missing_values(df_temp, col)
#     if df_temp[col].dtype == 'object':
#         convert_ordinal_category(df_temp, col, [x for x in range(len(df_temp[col].unique()))])
#     print(col, df_temp[col].corr(df_temp['Survived'], method='pearson'))

PassengerId -0.005006660767066487
Pclass -0.3384810359610147
Name -0.005006660767066487
Sex 0.5433513806577551
Age -0.07722109457217764
SibSp -0.017358360479534228
Parch 0.09331700774224289
Ticket -0.003852775513776319
Fare 0.26818861687447865
Cabin 0.08604457970131162
Embarked -0.10891351318273423


In [None]:
# Flag outliers in 'Pclass' with the isolation-forest detector; the result is
# a boolean array that is True for rows labelled as outliers.
detect_outliers_isolation_forest(titanic_df, 'Pclass')

Series([], Name: Pclass, dtype: float64)


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().