In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression

# Silence every warning for the rest of the session. Note that this blanket
# filter also hides deprecation notices emitted by pandas / scikit-learn.
warnings.filterwarnings('ignore')

class PandasHandler:
    """
    Convenience wrapper around a pandas DataFrame for exploratory analysis:
    inspection, NaN repair, scaling / encoding helpers, outlier handling and plots.

    Most methods work on ``self.df`` and many of them modify it in place.
    Methods that accept an ``X`` argument work on that frame instead (also in
    place) and fall back to ``self.df`` when ``X`` is None.

    Attributes
        df                   the wrapped DataFrame (None when built without data)
        cols_containing_NaN  names of the columns of ``df`` that contain NaN
        features / target    names recorded by set_features / set_target
        X / y                feature frame and target series

    Loading / access
        load_csv(file_path, parse_dates=True)      (module-level factory)
        get_df(columns='')
        get_X() / get_y()
        set_features(features) / set_target(target)
        isolate_features(X=None, features=None, target=None)
        split_df(target_col, test_size=0.2)
        get_train_test_df(target=None, features=None, split_percentage=0.2)

    Investigation
        get_col_with_nan(X=None)
        display_col_with_nan()
        count_categories()
        categories_threshold(threshold=5, under_threshold=True)
        get_numeric_cols() / get_text_cols() / get_categoric_cols()
        get_percStdDev_data(colname, limit_StdDev=1)
        get_sigma_outliers(col, n_sigma=3)
        get_iqr_outliers(col, factor=1.5)
        get_density_outliers(selected_columns=None, epsilon=1.5, min_samples=5, metric='euclidean')
        get_df_variety(X=None, only_categorical=False)
        print_numbercols_notzero(X=None)

    Repair
        repair_nan(X=None, colname=None, mode='value', value=0)
        drop_row_with_nan_val(threshold=1, axis=0, inplace=True)
        drop_col(X=None, colnames=None)
        factorize_categories(columns=None)
        remove_outliers(col, method='sigma', epsilon=1.5, min_samples=5, metric='euclidean')
        reduce_cat(X=None, col=None, values=None)

    Utility columns
        standard_scaling(col1, col2) / normalize_scaling(col1, col2)
        add_max_scaled_col(colname, group_by='', prefix='max_scaled_')
        add_normalized_col(colname, prefix='normalized_')
        add_standard_col(colname, prefix='standard_')
        mean / max / min (colname, group_by='', addCol=False)
        label_encode(X=None, columns=None)
        onehot_encode(X=None, columns=None, del_original_col=True)

    Visualization
        plot_line(y='', x='')
        plot_distribution_line(col)
        distribution(cols, num_subplots_perrow=3)
        scatter(colx, coly, num_subplots_perrow=1, limit_StdDev=1)
        correlations(df=None, annotation=True, limit=0)
        boxplot_outliers(col)
        pairplot_relations(cols=None)
        features_importance(X=None, y=None, n_features_to_vis=25)
        mutual_info_scores(X=None, y=None)
        plot_mutual_info_scores(scores)
    """

    # Modes understood by repair_nan().
    _NAN_REPAIR_MODES = ('value', 'delrow', 'mostcommon', 'mean', 'min', 'max', 'random_number')

    def __init__(self, data=None):
        """Wrap ``data`` (anything accepted by pd.DataFrame) or start empty when None."""
        self.df = None
        # Always defined, so display_col_with_nan() works even without data.
        self.cols_containing_NaN = []
        if data is not None:
            self.df = pd.DataFrame(data)
            self.cols_containing_NaN = self._get_columns_with_nan()

        self.features = None
        self.target = None
        self.X = self.df
        self.y = None

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _get_columns_with_nan(self):
        """Return the names of the columns of self.df holding at least one NaN."""
        return self.df.columns[self.df.isna().any()].tolist()

    def _refresh_nan_columns(self):
        """Recompute cols_containing_NaN after self.df has been modified."""
        if self.df is not None:
            self.cols_containing_NaN = self._get_columns_with_nan()

    @staticmethod
    def _is_categorical_like(series):
        """True for object or pandas-categorical columns."""
        return pd.api.types.is_object_dtype(series) or isinstance(series.dtype, pd.CategoricalDtype)

    @staticmethod
    def _is_plain_numeric(series):
        """True for numeric, non-boolean columns."""
        return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)

    @staticmethod
    def _categorical_columns(frame, columns):
        """Normalise a ``columns`` argument: None -> all object/category columns, str -> [str]."""
        if columns is None:
            return frame.select_dtypes(include=['object', 'category']).columns.tolist()
        if isinstance(columns, str):
            return [columns]
        return columns

    def _aggregate(self, how, colname, group_by, addCol):
        """Shared implementation of mean/max/min.

        ``how`` is the pandas reducer name ('mean', 'max' or 'min'). With a
        ``group_by`` the result is a frame (group keys + '<how>_<colname>');
        without it the result is a scalar. With ``addCol`` the aggregate is
        broadcast into a new column of self.df.
        """
        if group_by:
            grouped = self.df.groupby(group_by)[colname]
            values = grouped.agg(how).reset_index()
            values.rename(columns={colname: f'{how}_{colname}'}, inplace=True)
            if addCol:
                # group_by may be a list of keys; build a readable suffix either way
                if isinstance(group_by, (list, tuple)):
                    label = '_'.join(str(key) for key in group_by)
                else:
                    label = str(group_by)
                self.df[f'{how}_{colname}_group_{label}'] = grouped.transform(how)
        else:
            values = getattr(self.df[colname], how)()
            if addCol:
                # Series.transform() rejects reducers, so broadcast the scalar directly
                self.df[f'{how}_{colname}'] = values
        return values

    # ------------------------------------------------------------------
    # Access / feature-target bookkeeping
    # ------------------------------------------------------------------
    def get_X(self):
        """Return the current feature frame."""
        return self.X

    def get_y(self):
        """Return the current target series (None until a target is set)."""
        return self.y

    def set_features(self, features):
        """Record the list of feature names."""
        self.features = features
        return

    def set_target(self, target):
        """Record ``target`` and move that column out of self.df into self.y.

        After the call self.X is self.df (without the target column) and the
        target is removed from the recorded feature list, if present.

        Raises:
            ValueError: if ``target`` is not a column of self.df.
        """
        if self.df is None or target not in self.df.columns:
            raise ValueError(f"Target '{target}' not found in DataFrame.")
        self.target = target
        if self.features is not None:
            self.features = [name for name in self.features if name != target]
        self.X = self.df
        self.y = self.X.pop(target)
        return

    def get_df_variety(self, X=None, only_categorical=False):
        """Print value counts per category for object/category columns and the
        non-null count for the other columns.

        Args:
            X: frame to inspect (defaults to self.df).
            only_categorical: when True, non-categorical columns are skipped.
        """
        if X is None:
            X = self.df

        row_count = X.shape[0]
        print(f'Total rows: {row_count}')

        for col in X.columns:
            if self._is_categorical_like(X[col]):
                counts = X.groupby(col)[col].count()
                total = int(counts.sum())
                # integer (floored) percentages; guard the all-NaN / empty case
                percentages = counts * 100 // total if total else counts * 0

                print(f'\nColonna: {col}')
                print(f'{"Nome Campo":<15} {"Conteggi":<10} {"Percentuale":<12}')
                for name, count, percent in zip(counts.index, counts, percentages):
                    # str() so that non-string category labels can be padded too
                    print(f'{str(name):<15} {count:<10} {percent:<12}')
            elif not only_categorical:
                print(f'{"Nome Campo":<15} {"Conteggi":<10} {"Percentuale":<12}')
                counts = X[col].count()  # non-null values only
                percent = counts * 100 / row_count if row_count else 0.0
                print(f'{str(col):<15} {counts:<10} {percent:<12.2f}')

        return

    def reduce_cat(self, X=None, col=None, values=None):
        """Replace, in place, every value of ``col`` that is not in ``values`` with 'Other'.

        Raises:
            ValueError: if ``col`` is given without ``values``.
        """
        if X is None:
            X = self.df
        if col is None:
            return
        if values is None:
            raise ValueError("reduce_cat needs the list of values to keep.")
        if isinstance(values, str):
            # a bare string would otherwise be matched by substring
            values = [values]

        other_val = 'Other'
        X[col] = X[col].apply(lambda item: item if item in values else other_val)
        return

    def print_numbercols_notzero(self, X=None):
        """Print, for every numeric column, how many values are > 0."""
        if X is None:
            X = self.df

        for col in X.select_dtypes(include=['number']).columns:
            print(f'Counts for {col}--> {X[X[col]>0][col].count()}')

    def isolate_features(self, X=None, features=None, target=None):
        """Reduce ``X`` (default self.df) in place to the requested features.

        Args:
            X: frame to reduce; it is modified in place and stored as self.X.
            features: column name or list of names to keep (default: all columns).
            target: optional column moved out of ``X`` into self.y.

        Raises:
            ValueError: if a feature or the target is not a column of ``X``.
        """
        if X is None:
            X = self.df

        if features is None:
            features = X.columns.tolist()
        elif isinstance(features, str):
            features = [features]
        else:
            features = list(features)

        missing_features = [feature for feature in features if feature not in X.columns]
        if missing_features:
            raise ValueError(f"Le seguenti colonne non esistono nel DataFrame: {missing_features}")

        if target is not None:
            if target not in X.columns:
                raise ValueError(f"La colonna target '{target}' non esiste nel DataFrame.")
            # Pop exactly once: popping through set_target() first would make
            # the column disappear before it can be extracted from X.
            self.y = X.pop(target)
            self.target = target
            features = [feature for feature in features if feature != target]

        self.set_features(features)

        unwanted = [column for column in X.columns if column not in features]
        if unwanted:
            X.drop(columns=unwanted, inplace=True)

        self.X = X

    def get_df(self, columns=''):
        """Return self.df, or only ``columns`` of it when given."""
        if columns is None or (hasattr(columns, '__len__') and len(columns) == 0):
            return self.df
        return self.df[columns]

    def split_df(self, target_col, test_size=0.2):
        """Split a copy of self.df into train/test parts.

        Returns:
            X_train, y_train, X_test, y_test (note the order).
        """
        X = self.df.copy()
        y = X.pop(target_col)
        # test_size must be passed by keyword: positionally it would be
        # treated as a third array to split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        return X_train, y_train, X_test, y_test

    def get_train_test_df(self, target=None, features=None, split_percentage=0.2):
        """Split self.df into train/test parts, optionally separating a target.

        Args:
            target: column to use as y; when None the y parts are empty lists.
            features: columns to keep in X (default: every column but the target).
            split_percentage: fraction of rows assigned to the test part.

        Returns:
            X_train, y_train, X_test, y_test (note the order).

        Raises:
            ValueError: if ``target`` is not a column of self.df.
        """
        if target is not None and target not in self.df.columns:
            raise ValueError(f"Target '{target}' not found in DataFrame.")

        train_df, test_df = train_test_split(
            self.df,
            test_size=split_percentage,
            random_state=42,
        )

        if target is not None:
            y_train = train_df[target]
            y_test = test_df[target]
            train_df = train_df.drop(columns=target)
            test_df = test_df.drop(columns=target)
        else:
            y_train = []
            y_test = []

        if features is None:
            features = train_df.columns

        X_train = train_df[features]
        X_test = test_df[features]

        return X_train, y_train, X_test, y_test

    # ------------------------------------------------------------------
    # NaN inspection / repair
    # ------------------------------------------------------------------
    def get_col_with_nan(self, X=None):
        """Return {column: '<n> count, <p>% of total'} for every column with NaN.

        When ``X`` is None, self.df is inspected and the list of its NaN
        columns is printed as well.
        """
        if X is None:
            X = self.get_df()
            print(self._get_columns_with_nan())

        nan_columns = {}
        total = X.shape[0]

        nan_counts = X.isnull().sum()
        for column, nan_count in nan_counts[nan_counts > 0].items():
            percentage = '{:.2f}'.format(nan_count * 100 / total)
            nan_columns[column] = f'{nan_count} count, {percentage}% of total'

        return nan_columns

    def repair_nan(self, X=None, colname=None, mode='value', value=0):
        """Repair NaN values in place.

        Args:
            X: frame to repair (defaults to self.df).
            colname: column name or list of names (default: every column).
            mode: one of
                value         fill with ``value``
                delrow        drop the rows where the column is NaN
                mostcommon    fill with the most frequent value
                mean/min/max  fill with that statistic of the column
                random_number fill each NaN with random.gauss(mean, 2 * std)
            value: filler used by mode='value'.
        """
        if X is None:
            X = self.get_df()

        if colname is None:
            colname = X.columns.tolist()
        elif isinstance(colname, str):
            colname = [colname]

        if mode not in self._NAN_REPAIR_MODES:
            print(f'{mode} not found')
            return

        for col in colname:
            if col not in X.columns:
                print(f"{col} not found DataFrame.")
                continue

            if mode == 'delrow':
                X.dropna(subset=[col], inplace=True)
                continue

            if mode in ('mean', 'random_number') and not pd.api.types.is_numeric_dtype(X[col]):
                print(f"{col} is not numeric; mode '{mode}' skipped.")
                continue

            if mode == 'random_number':
                mean_value = X[col].mean()
                std_dev = X[col].std()
                X[col] = X[col].apply(
                    lambda item: random.gauss(mean_value, 2 * std_dev) if pd.isna(item) else item
                )
                continue

            if mode == 'value':
                filler = value
            elif mode == 'mostcommon':
                modes = X[col].mode()
                if modes.empty:
                    # column is entirely NaN: there is no most common value
                    print(f"{col} has no non-NaN value; nothing to fill with.")
                    continue
                filler = modes.iloc[0]
            elif mode == 'mean':
                filler = X[col].mean()
            elif mode == 'min':
                filler = X[col].min()
            else:  # mode == 'max'
                filler = X[col].max()

            # Assign back instead of X[col].fillna(..., inplace=True): the
            # chained in-place form does not update X under Copy-on-Write.
            X[col] = X[col].fillna(filler)

        if X is self.df:
            self._refresh_nan_columns()
        return

    def get_numeric_cols(self):
        """Return the names of all numeric columns."""
        return self.df.select_dtypes(include=['number']).columns.tolist()

    def get_text_cols(self):
        """Return the names of all object (text) columns."""
        return self.df.select_dtypes(include=['object']).columns.tolist()

    def get_categoric_cols(self):
        """Return the names of all pandas-categorical columns."""
        return self.df.select_dtypes(include=['category']).columns.tolist()

    def display_col_with_nan(self):
        """Print the total NaN count and the per-column counts; return the NaN column names."""
        nan_count = self.df.isnull().sum()
        print('total NaN:' + str(nan_count.sum()))
        print(f'Fields containing NaN:    {nan_count[nan_count>0].sort_values(ascending=False)}')
        # recompute so the answer reflects repairs done after construction
        self._refresh_nan_columns()
        return self.cols_containing_NaN

    def drop_row_with_nan_val(self, threshold=1, axis=0, inplace=True):
        """Drop rows (axis=0) or columns (axis=1) with fewer than ``threshold`` non-NaN values.

        Example: threshold=1 keeps every row that has at least one non-NaN value.

        Returns:
            None when ``inplace`` is True, otherwise the filtered DataFrame.
        """
        # pandas spells the keyword ``thresh``
        result = self.df.dropna(thresh=threshold, axis=axis, inplace=inplace)
        if inplace:
            self._refresh_nan_columns()
            return
        return result

    def drop_col(self, X=None, colnames=None):
        """Drop the given column name(s) from ``X`` (default self.df) in place."""
        if X is None:
            X = self.df

        if colnames is None:
            return
        if isinstance(colnames, str):
            colnames = [colnames]

        for column in list(colnames):
            if column in X.columns:
                X.drop(columns=column, inplace=True)
            else:
                print(f"{column} not in DataFrame.")

        return

    # ------------------------------------------------------------------
    # Utility columns
    # ------------------------------------------------------------------
    def add_max_scaled_col(self, colname, group_by='', prefix='max_scaled_'):
        """Add ``prefix + colname`` = value / max(|value|), overall or per group.

        With ``group_by`` the per-group maximum is also stored in the column
        'max_abs_grouped_' + colname.
        """
        new_name = prefix + colname
        if len(group_by) == 0:
            self.df[new_name] = self.df[colname] / self.df[colname].abs().max()
        else:
            self.df['max_abs_grouped_' + colname] = self.df.groupby(group_by)[colname].transform(lambda x: x.abs().max())
            self.df[new_name] = self.df[colname] / self.df['max_abs_grouped_' + colname]
        return

    def add_normalized_col(self, colname, prefix='normalized_'):
        """Add ``prefix + colname`` = (value - min) / (max - min) over the whole column."""
        new_name = prefix + colname
        column = self.df[colname]
        self.df[new_name] = (column - column.min()) / (column.max() - column.min())
        return

    def add_standard_col(self, colname, prefix='standard_'):
        """Add ``prefix + colname`` = (value - mean) / std over the whole column."""
        new_name = prefix + colname
        column = self.df[colname]
        self.df[new_name] = (column - column.mean()) / column.std()
        return

    def standard_scaling(self, col1, col2):
        """Overwrite two columns with their StandardScaler transform."""
        scaler = StandardScaler()
        self.df[[col1, col2]] = scaler.fit_transform(self.df[[col1, col2]])

    def normalize_scaling(self, col1, col2):
        """Overwrite two columns with their MinMaxScaler transform."""
        scaler = MinMaxScaler()
        self.df[[col1, col2]] = scaler.fit_transform(self.df[[col1, col2]])

    def mean(self, colname, group_by='', addCol=False):
        """Mean of ``colname``: a scalar, or a per-group frame when ``group_by`` is given.

        With addCol=True the value is also broadcast into a new column
        ('mean_<colname>' or 'mean_<colname>_group_<group_by>').
        """
        return self._aggregate('mean', colname, group_by, addCol)

    def max(self, colname, group_by='', addCol=False):
        """Max of ``colname``: a scalar, or a per-group frame when ``group_by`` is given.

        With addCol=True the value is also broadcast into a new column
        ('max_<colname>' or 'max_<colname>_group_<group_by>').
        """
        return self._aggregate('max', colname, group_by, addCol)

    def min(self, colname, group_by='', addCol=False):
        """Min of ``colname``: a scalar, or a per-group frame when ``group_by`` is given.

        With addCol=True the value is also broadcast into a new column
        ('min_<colname>' or 'min_<colname>_group_<group_by>').
        """
        return self._aggregate('min', colname, group_by, addCol)

    # ------------------------------------------------------------------
    # Visualization
    # ------------------------------------------------------------------
    def plot_line(self, y='', x=''):
        """Line plot of the frame; ``x`` (optional) becomes the index, ``y`` selects column(s)."""
        plt.figure(figsize=(16, 6))
        data = self.df
        if x:
            data = data.set_index(x)

        if y:
            sns.lineplot(data=data[y])
        else:
            sns.lineplot(data=data)

        return

    def plot_distribution_line(self, col):
        """Plot the values of ``col`` sorted from min to max as a line."""
        plt.figure(figsize=(16, 6))
        # drop the old index so the x axis is simply the rank of each value
        data = self.df[col].sort_values().reset_index(drop=True)
        sns.lineplot(data=data)
        return

    def distribution(self, cols, num_subplots_perrow=3):
        """Histogram (with KDE) of each column in ``cols`` on a grid of subplots."""
        if isinstance(cols, str):
            cols = [cols]
        cols = list(cols)
        if not cols:
            return

        per_row = max(1, int(num_subplots_perrow))
        n_rows = int(np.ceil(len(cols) / per_row))
        # squeeze=False keeps ``axes`` an array even for a 1x1 grid
        fig, axes = plt.subplots(nrows=n_rows, ncols=per_row, figsize=(21, 7 * n_rows), squeeze=False)
        axes = axes.flatten()
        for ax, column in zip(axes, cols):
            sns.histplot(self.df[column], kde=True, ax=ax)
            ax.set_title(f'distrib {column}')
            ax.set_xlabel(column)
            ax.set_ylabel('Freq')

        for ax in axes[len(cols):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()
        return

    def _plot_pair(self, ax, column, coly, limit_StdDev):
        """Draw one x/y relation on ``ax``: scatter + lowess fit, boxplot, or a notice."""
        y_numeric = self._is_plain_numeric(self.df[coly])
        if y_numeric and self._is_plain_numeric(self.df[column]):
            data = self.get_percStdDev_data(column, limit_StdDev=limit_StdDev)
            sns.scatterplot(data=data, x=column, y=coly, ax=ax, color='b')
            sns.regplot(data=data, x=column, y=coly, ax=ax, scatter=False, color='r', lowess=True)
        elif y_numeric and self._is_categorical_like(self.df[column]):
            sns.boxplot(data=self.df, x=column, y=coly, ax=ax, color='b')
        else:
            ax.text(0.5, 0.5, 'unsupported data', ha='center', va='center', fontsize=12)

        ax.set_title(f'Distribuzione di {column}')
        ax.set_xlabel(column)
        ax.set_ylabel(coly)

    def scatter(self, colx, coly, num_subplots_perrow=1, limit_StdDev=1):
        """Plot ``coly`` against each column of ``colx``, one subplot per x column.

        Numeric x: scatter plus lowess regression line, restricted to the rows
        whose x lies within ``limit_StdDev`` standard deviations of its mean.
        Object/category x: boxplot. Anything else: an 'unsupported data' notice.

        Args:
            colx: column name or list of names for the x axes.
            coly: single column name for the y axis.
            num_subplots_perrow: number of subplots per grid row.
            limit_StdDev: width, in standard deviations, of the x range kept.
        """
        if isinstance(colx, str):
            colx = [colx]
        colx = list(colx)
        if not colx:
            return

        per_row = max(1, int(num_subplots_perrow))
        n_rows = int(np.ceil(len(colx) / per_row))
        fig, axes = plt.subplots(nrows=n_rows, ncols=per_row, figsize=(21, 7 * n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, column in zip(axes, colx):
            self._plot_pair(ax, column, coly, limit_StdDev)

        for ax in axes[len(colx):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

    def get_percStdDev_data(self, colname, limit_StdDev=1):
        """Return the rows whose ``colname`` lies within ``limit_StdDev`` std devs of its mean."""
        mean = self.df[colname].mean()
        std_dev = self.df[colname].std()

        inf = mean - (std_dev * limit_StdDev)
        sup = mean + (std_dev * limit_StdDev)

        df_filtered = self.df[(self.df[colname] >= inf) & (self.df[colname] <= sup)]
        return df_filtered

    # ------------------------------------------------------------------
    # Categories
    # ------------------------------------------------------------------
    def count_categories(self):
        """Return [(column, {'unique': n}), ...] for object/category columns, most varied first."""
        categories_count = {}
        for column in self.df.select_dtypes(include=['object', 'category']).columns:
            categories_count[column] = {'unique': self.df[column].nunique()}

        # sorted once, outside the loop; an empty dict simply yields []
        return sorted(categories_count.items(), key=lambda item: item[1]['unique'], reverse=True)

    def categories_threshold(self, threshold=5, under_threshold=True):
        """List object/category columns whose number of unique values is
        below ``threshold`` (under_threshold=True) or at/above it (False)."""
        underOver = 'under' if under_threshold else 'over or equal'
        selected = []
        for column in self.df.select_dtypes(include=['object', 'category']).columns:
            unique_count = self.df[column].nunique()
            is_under = unique_count < threshold
            if is_under == bool(under_threshold):
                selected.append(column)

        print(f'Categories with counts {underOver} threshold of {threshold}')
        return selected

    def factorize_categories(self, columns=None):
        """Replace the given columns (default: all object/category ones) with
        pd.factorize integer codes. Columns containing NaN are skipped."""
        for column in self._categorical_columns(self.df, columns):
            if column in self.df.columns:
                if self.df[column].isnull().any():
                    print(f"Cannot factorize '{column}' containing NaN.")
                else:
                    self.df[column] = pd.factorize(self.df[column])[0]
            else:
                print(f"Column '{column}' does not exist in the DataFrame.")

    def label_encode(self, X=None, columns=None):
        """Label-encode, in place, the given columns of ``X`` (default self.df;
        default columns: all object/category ones). Columns with NaN are skipped."""
        if X is None:
            X = self.df

        for column in self._categorical_columns(X, columns):
            if column in X.columns:
                if X[column].isnull().any():
                    print(f"Cannot factorize '{column}' containing NaN.")
                else:
                    le = LabelEncoder()
                    X[column] = le.fit_transform(X[column])
            else:
                print(f"Column '{column}' does not exist in the DataFrame.")

    def onehot_encode(self, X=None, columns=None, del_original_col=True):
        """One-hot encode, in place, the given columns of ``X`` (default self.df;
        default columns: all object/category ones). Columns with NaN are skipped.

        New columns are named '<column>_<category>' and hold 0.0 / 1.0.
        With ``del_original_col`` the source column is removed afterwards.
        As before, ``X`` becomes self.df when the method returns.
        """
        if X is None:
            X = self.df

        for column in self._categorical_columns(X, columns):
            if column in X.columns:
                if X[column].isnull().any():
                    print(f"Cannot factorize '{column}' containing NaN.")
                else:
                    # get_dummies keeps X's index, so the assignment below stays
                    # aligned even when the index is not 0..n-1, and it does not
                    # depend on the scikit-learn version in use.
                    dummies = pd.get_dummies(X[column], prefix=column, dtype=float)

                    for new_column in dummies.columns:
                        X[new_column] = dummies[new_column]

                    if del_original_col:
                        X.drop(column, axis=1, errors='ignore', inplace=True)
            else:
                print(f"Column '{column}' does not exist in the DataFrame.")

        self.df = X

    def correlations(self, df=None, annotation=True, limit=0):
        """Heatmap of the correlation matrix of the numeric columns.

        Cells whose absolute correlation is not greater than ``limit`` are blanked.
        """
        if df is None:
            df = self.df

        correlation_matrix = df.select_dtypes(include=['number']).corr()
        plt.figure(figsize=(10, 8))
        filtered_correlation = correlation_matrix.where(correlation_matrix.abs() > limit)

        sns.heatmap(filtered_correlation, annot=annotation, fmt=".2f", cmap='coolwarm', square=True, cbar=True)
        plt.title('Correlation', fontsize=16)
        plt.show()
        return

    # ------------------------------------------------------------------
    # Outliers
    # ------------------------------------------------------------------
    def get_sigma_outliers(self, col, n_sigma=3):
        """Return the rows whose ``col`` lies more than ``n_sigma`` std devs from the mean."""
        mean = self.df[col].mean()
        std_dev = self.df[col].std()

        lower_limit = mean - n_sigma * std_dev
        upper_limit = mean + n_sigma * std_dev

        return self.df[(self.df[col] < lower_limit) | (self.df[col] > upper_limit)]

    def get_iqr_outliers(self, col, factor=1.5):
        """Return the rows whose ``col`` lies outside [Q1 - factor*IQR, Q3 + factor*IQR]."""
        q1 = self.df[col].quantile(0.25)
        q3 = self.df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        return self.df[(self.df[col] < lower_bound) | (self.df[col] > upper_bound)]

    def get_density_outliers(self, selected_columns=None, epsilon=1.5, min_samples=5, metric='euclidean'):
        """Return the rows DBSCAN labels as noise (-1) on the standardised numeric columns.

        Args:
            selected_columns: column name or list of names (default: all columns);
                non-numeric ones are ignored.
            epsilon, min_samples: DBSCAN ``eps`` and ``min_samples``.
            metric: only 'euclidean' is supported; anything else prints a
                message and returns None.

        Returns:
            A DataFrame restricted to the numeric columns used, or None.
        """
        if selected_columns is None:
            selected_columns = self.df.columns
        elif isinstance(selected_columns, str):
            # a bare name would otherwise be iterated character by character
            selected_columns = [selected_columns]

        if metric != 'euclidean':
            print('metric not recognized by get_density_outliers')
            return

        numeric_cols = [col for col in selected_columns if pd.api.types.is_numeric_dtype(self.df[col])]
        df = self.df[numeric_cols]
        if not numeric_cols or df.empty:
            # nothing to cluster
            return df.iloc[0:0]

        scaled_df = StandardScaler().fit_transform(df)
        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric)
        labels = dbscan.fit_predict(scaled_df)
        return df[labels == -1]

    def remove_outliers(self, col, method='sigma', epsilon=1.5, min_samples=5, metric='euclidean'):
        """Drop from self.df the rows that are outliers for ``col`` and reset the index.

        Args:
            col: column to analyse.
            method: 'sigma', 'IQR' or 'density'.
            epsilon, min_samples, metric: forwarded to get_density_outliers.

        Raises:
            ValueError: if ``method`` is not one of the supported names.
        """
        if method == 'sigma':
            outliers = self.get_sigma_outliers(col)
        elif method == 'IQR':
            outliers = self.get_iqr_outliers(col)
        elif method == 'density':
            outliers = self.get_density_outliers(col, epsilon=epsilon, min_samples=min_samples, metric=metric)
        else:
            raise ValueError(f"Unknown outlier method '{method}'. Use 'sigma', 'IQR' or 'density'.")

        if outliers is None:
            # get_density_outliers rejected the metric and already reported it
            return

        shares_X = self.X is self.df
        old_index = self.df.index
        self.df = self.df.drop(outliers.index)
        self.df.reset_index(drop=True, inplace=True)
        if shares_X:
            # keep X (and an index-aligned y) in step with the filtered frame
            self.X = self.df
            if isinstance(self.y, pd.Series) and self.y.index.equals(old_index):
                self.y = self.y.drop(outliers.index).reset_index(drop=True)
        return

    def boxplot_outliers(self, col):
        """Boxplot of ``col``; points outside the whiskers are the outliers."""
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=self.df[col])
        plt.title(f'Boxplot di {col}')
        plt.xlabel(col)
        plt.show()

    def pairplot_relations(self, cols=None):
        """Pairplot of the frame, restricted to ``cols`` when given."""
        data = self.df if cols is None else self.df[cols]
        sns.pairplot(data)
        plt.show()
        return

    # ------------------------------------------------------------------
    # Feature relevance
    # ------------------------------------------------------------------
    def features_importance(self, X=None, y=None, n_features_to_vis=25):
        """Fit a RandomForestClassifier and plot the top feature importances.

        Args:
            X: feature frame (defaults to self.df).
            y: target (defaults to self.y).
            n_features_to_vis: number of bars in the plot.

        Returns:
            DataFrame with columns 'Feature' and 'Importance' in the column
            order of ``X``, or None when X or y is missing. The plot is also
            saved as 'feature_importances.png'.
        """
        if X is None:
            X = self.df
        if y is None:
            y = self.y

        if X is None or y is None:
            print('Specify X and y')
            return None

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X, y)

        importances_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': model.feature_importances_
        })

        # sort names and values together so each bar carries the right label
        top = importances_df.sort_values('Importance', ascending=False).head(n_features_to_vis)

        plt.figure(figsize=(10, 6))
        plt.title('Feature Importance - Random Forest')
        plt.barh(top['Feature'].astype(str), top['Importance'])
        plt.gca().invert_yaxis()  # most important feature at the top
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig('feature_importances.png', bbox_inches='tight')
        print(importances_df)
        return importances_df

    def mutual_info_scores(self, X=None, y=None):
        """Mutual information between each feature and the target, highest first.

        Object/category columns are factorized on a copy of ``X``; columns with
        an integer dtype are then declared discrete to mutual_info_regression.

        Raises:
            ValueError: if no X or y is available.
        """
        if X is None:
            X = self.get_X()
        if y is None:
            y = self.get_y()
        if X is None or y is None:
            raise ValueError('Specify X and y')

        # work on a copy so the caller's frame is not re-encoded as a side effect
        X = X.copy()
        for colname in X.select_dtypes(["object", "category"]):
            X[colname], _ = X[colname].factorize()

        discrete_features = np.array([pd.api.types.is_integer_dtype(t) for t in X.dtypes], dtype=bool)
        mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=42)
        mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
        return mi_scores.sort_values(ascending=False)

    def plot_mutual_info_scores(self, scores):
        """Horizontal bar chart of the scores returned by mutual_info_scores()."""
        scores = scores.sort_values(ascending=True)
        width = np.arange(len(scores))
        ticks = list(scores.index)
        plt.barh(width, scores)
        plt.yticks(width, ticks)
        plt.yticks(rotation=45)
        plt.title("Mutual Information Scores")



def load_csv(file_path, parse_dates=True):
    """Read a CSV file and wrap it in a PandasHandler.

    ``parse_dates`` is forwarded unchanged to ``pd.read_csv``. Every column of
    the loaded file is registered as a feature on the returned handler.
    """
    frame = pd.read_csv(file_path, parse_dates=parse_dates)
    column_names = frame.columns.tolist()
    wrapped = PandasHandler(frame)
    wrapped.set_features(column_names)
    return wrapped


