In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random


# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation notices
# (e.g. pandas FutureWarning) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

class PandasHandler:
    """
    Convenience wrapper around a pandas DataFrame for quick exploratory work:
    NaN inspection and repair, simple feature scaling, group statistics and plots.

    The wrapped frame is available as ``self.df``. Repair and "add column"
    methods modify ``self.df``; investigation methods only read it.

    Loading
        load_csv(file_path, parse_dates=True)      (module-level helper)
        get_df(columns='')

    Investigation
        get_col_with_nan()
        get_numeric_cols()
        get_text_cols()
        get_categoric_cols()
        display_col_with_nan()
        get_percStdDev_data(colname, limit_StdDev=1)
        count_categories()
        categories_threshold(threshold=5, under_threshold=True)

    Repair
        repair_nan(colname=None, mode='value', value=0)
        drop_row_with_nan_val(threshold=1, axis=0, inplace=True)
        drop_col(colnames)
        factorize_categories(columns=None)

    Utility columns
        add_max_scaled_col(colname, group_by='', prefix='max_scaled_')
        add_normalized_col(colname, prefix='normalized_')
        add_standard_col(colname, prefix='standard_')
        mean(colname, group_by='', addCol=False)
        max(colname, group_by='', addCol=False)
        min(colname, group_by='', addCol=False)

    Visualization (requires matplotlib as ``plt`` and seaborn as ``sns``)
        plot_line(y='', x='')
        plot_distribution_line(col)
        distribution(cols, num_subplots_perrow=3)
        scatter(colx, coly, num_subplots_perrow=3, limit_StdDev=1)
    """

    def __init__(self, data):
        """Wrap ``data`` (anything accepted by ``pd.DataFrame``) in a handler.

        NOTE(review): ``pd.DataFrame(data)`` is not an explicit copy; when
        ``data`` is already a DataFrame, in-place repairs may be visible to the
        caller depending on the pandas version — pass ``data.copy()`` if unsure.
        """
        self.df = pd.DataFrame(data)
        # Snapshot of the NaN-containing columns; refreshed by the repair methods.
        self.cols_containing_NaN = self._get_columns_with_nan()

    # ------------------------------------------------------------------ helpers
    def _get_columns_with_nan(self):
        """Return the names of the columns that currently contain at least one NaN."""
        return self.df.columns[self.df.isna().any()].tolist()

    def _aggregate(self, stat, colname, group_by, addCol):
        """Shared implementation of ``mean`` / ``max`` / ``min``.

        ``stat`` is the pandas aggregation name ('mean', 'max' or 'min').
        Returns a scalar when ``group_by`` is empty, otherwise a DataFrame with
        the grouping column(s) and a ``<stat>_<colname>`` column.
        """
        if group_by:
            grouped = self.df.groupby(group_by)[colname]
            values = getattr(grouped, stat)().reset_index()
            values = values.rename(columns={colname: f'{stat}_{colname}'})
            if addCol:
                # group_by may be a single name or a list of names.
                if isinstance(group_by, str):
                    suffix = group_by
                else:
                    suffix = '_'.join(str(name) for name in group_by)
                self.df[f'{stat}_{colname}_group_{suffix}'] = grouped.transform(stat)
        else:
            values = getattr(self.df[colname], stat)()
            if addCol:
                # Broadcast the scalar; Series.transform('mean') would raise
                # because the result is not the same length as the input.
                self.df[f'{stat}_{colname}'] = values
        return values

    def _subplot_grid(self, n_plots, num_subplots_perrow):
        """Create a grid with ``num_subplots_perrow`` columns holding ``n_plots`` axes.

        ``squeeze=False`` guarantees a 2-D array of axes even for a 1x1 grid, so
        ``flatten()`` is always valid.
        """
        nrows = int(np.ceil(n_plots / num_subplots_perrow))
        fig, axes = plt.subplots(
            nrows=nrows,
            ncols=num_subplots_perrow,
            figsize=(21, 7),
            squeeze=False,
        )
        return fig, axes.flatten()

    # ------------------------------------------------------------ investigation
    def get_df(self, columns=''):
        """Return the wrapped DataFrame, or only ``columns`` when given.

        ``columns`` may be a single column name or a list-like of names; an
        empty value ('' / None / empty list) returns the whole frame.
        """
        # Avoid ``not columns``: it is ambiguous for pandas Index / numpy arrays.
        if columns is None or (hasattr(columns, '__len__') and len(columns) == 0):
            return self.df
        return self.df[columns]

    def get_col_with_nan(self):
        """Print the NaN-containing column names and return a per-column summary.

        Returns a dict ``{column: '<n> count, <p>% of total'}`` for every column
        with at least one NaN.
        """
        print(self._get_columns_with_nan())
        total = self.df.shape[0]
        nan_counts = self.df.isna().sum()

        nan_columns = {}
        for column, nan_count in nan_counts[nan_counts > 0].items():
            percentage = '{:.2f}'.format(nan_count * 100 / total)
            nan_columns[column] = f'{nan_count} count, {percentage}% of total'
        return nan_columns

    def get_numeric_cols(self):
        """Return the names of all numeric columns as a list."""
        return self.df.select_dtypes(include=['number']).columns.tolist()

    def get_text_cols(self):
        """Return the names of all ``object`` dtype (text) columns as a list."""
        return self.df.select_dtypes(include=['object']).columns.tolist()

    def get_categoric_cols(self):
        """Return the names of all ``category`` dtype columns as a list."""
        return self.df.select_dtypes(include=['category']).columns.tolist()

    def display_col_with_nan(self):
        """Print the total NaN count and the per-column counts.

        Returns the list of columns that currently contain NaN (recomputed on
        every call so it is never stale after a repair).
        """
        nan_count = self.df.isnull().sum()
        print('total NaN:' + str(nan_count.sum()))
        print(f'Fields containing NaN:    {nan_count[nan_count>0].sort_values(ascending=False)}')
        self.cols_containing_NaN = self._get_columns_with_nan()
        return self.cols_containing_NaN

    def get_percStdDev_data(self, colname, limit_StdDev=1):
        """Return the rows whose ``colname`` value lies within
        ``limit_StdDev`` standard deviations of the column mean (bounds included)."""
        center = self.df[colname].mean()
        std_dev = self.df[colname].std()

        lower = center - (std_dev * limit_StdDev)
        upper = center + (std_dev * limit_StdDev)

        return self.df[(self.df[colname] >= lower) & (self.df[colname] <= upper)]

    def count_categories(self):
        """Count the unique values of every object/category column.

        Returns a list of ``(column, {'unique': n})`` tuples sorted by ``n``
        descending; an empty list when there are no such columns.
        """
        categories_count = {
            column: {'unique': self.df[column].nunique()}
            for column in self.df.select_dtypes(include=['object', 'category']).columns
        }
        # Sort once, after the loop, so the result is defined even when empty.
        return sorted(categories_count.items(), key=lambda item: item[1]['unique'], reverse=True)

    def categories_threshold(self, threshold=5, under_threshold=True):
        """List the object/category columns by number of unique values.

        With ``under_threshold=True`` returns the columns with fewer than
        ``threshold`` unique values, otherwise those with ``threshold`` or more.
        """
        # Decide the label up front so it is right even with no matching columns.
        underOver = 'under' if under_threshold else 'over or equal'

        selected = []
        for column in self.df.select_dtypes(include=['object', 'category']).columns:
            unique_count = self.df[column].nunique()
            if under_threshold:
                if unique_count < threshold:
                    selected.append(column)
            elif unique_count >= threshold:
                selected.append(column)

        print(f'Categories with counts {underOver} threshold of {threshold}')
        return selected

    # ------------------------------------------------------------------- repair
    def repair_nan(self, colname=None, mode='value', value=0):
        """Repair NaN values in one column, a list of columns, or all columns.

        Parameters
        ----------
        colname : str, list of str or None
            Column(s) to repair; ``None`` means every column.
        mode : str
            'value'          fill with ``value``
            'delrow'         drop the rows where the column is NaN
            'mostcommon'     fill with the most frequent value
            'mean'/'min'/'max'  fill with that statistic of the column
            'random_number'  fill each NaN with ``random.gauss(mean, 2 * std)``
        value : scalar
            Replacement used by ``mode='value'``.

        Unknown columns, unknown modes and columns a mode cannot handle are
        reported with ``print`` and skipped.
        """
        known_modes = ('value', 'delrow', 'mostcommon', 'mean', 'min', 'max', 'random_number')
        if mode not in known_modes:
            print(f'{mode} not found')
            return

        if colname is None:
            colname = self.df.columns.tolist()
        elif isinstance(colname, str):
            colname = [colname]

        for col in colname:
            if col not in self.df.columns:
                print(f"{col} not found DataFrame.")
                continue

            series = self.df[col]

            if mode == 'delrow':
                self.df = self.df.dropna(subset=[col])
                continue

            # mean / std are only defined for numeric data; skip text columns
            # instead of crashing half-way through a multi-column repair.
            if mode in ('mean', 'random_number') and not pd.api.types.is_numeric_dtype(series):
                print(f"{col} is not numeric, cannot apply mode '{mode}'.")
                continue

            if mode == 'random_number':
                mean_value = series.mean()
                std_dev = series.std()
                self.df[col] = series.apply(
                    lambda x: random.gauss(mean_value, 2 * std_dev) if pd.isna(x) else x
                )
                continue

            if mode == 'value':
                fill = value
            elif mode == 'mostcommon':
                modes = series.mode()
                if modes.empty:  # column is entirely NaN: nothing to fill with
                    print(f"{col} has no non-NaN values, cannot apply mode '{mode}'.")
                    continue
                fill = modes.iloc[0]
                print(f'mostcommon {fill}')
            elif mode == 'mean':
                fill = series.mean()
            elif mode == 'min':
                fill = series.min()
            else:  # mode == 'max'
                fill = series.max()

            # Assign back instead of chained ``fillna(..., inplace=True)``, which
            # may operate on a temporary copy and leave self.df untouched.
            self.df[col] = series.fillna(fill)

        self.cols_containing_NaN = self._get_columns_with_nan()
        return

    def drop_row_with_nan_val(self, threshold=1, axis=0, inplace=True):
        """Drop rows (``axis=0``) or columns (``axis=1``) that are too empty.

        Thin wrapper around ``DataFrame.dropna(thresh=threshold)``: an entry is
        kept only if it has at least ``threshold`` non-NaN values.

        Returns ``None`` when ``inplace=True`` (``self.df`` is modified),
        otherwise the filtered DataFrame (``self.df`` is left untouched).
        """
        # The pandas keyword is ``thresh``; ``threshold=`` raises TypeError.
        result = self.df.dropna(thresh=threshold, axis=axis, inplace=inplace)
        if inplace:
            self.cols_containing_NaN = self._get_columns_with_nan()
        return result

    def drop_col(self, colnames):
        """Drop one column name or a list of column names from the frame."""
        self.df = self.df.drop(columns=colnames)
        self.cols_containing_NaN = self._get_columns_with_nan()
        return

    def factorize_categories(self, columns=None):
        """Replace category/text columns with integer codes (``pd.factorize``).

        ``columns`` may be a name, a list of names, or ``None`` for every
        object/category column. Columns containing NaN or missing from the frame
        are reported with ``print`` and left unchanged.
        """
        if columns is None:
            columns = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        elif isinstance(columns, str):
            columns = [columns]

        for column in columns:
            if column not in self.df.columns:
                print(f"Column '{column}' does not exist in the DataFrame.")
            elif self.df[column].isnull().any():
                print(f"Cannot factorize '{column}' containing NaN.")
            else:
                self.df[column] = pd.factorize(self.df[column])[0]

    # ---------------------------------------------------------- utility columns
    def add_max_scaled_col(self, colname, group_by='', prefix='max_scaled_'):
        """Add ``prefix + colname``: the column divided by its maximum absolute value.

        With ``group_by`` the maximum is taken per group, and the helper column
        ``'max_abs_grouped_' + colname`` is also added to the frame.
        """
        new_name = prefix + colname
        if not group_by:
            self.df[new_name] = self.df[colname] / self.df[colname].abs().max()
        else:
            helper = 'max_abs_grouped_' + colname
            self.df[helper] = self.df.groupby(group_by)[colname].transform(lambda x: x.abs().max())
            self.df[new_name] = self.df[colname] / self.df[helper]
        return

    def add_normalized_col(self, colname, prefix='normalized_'):
        """Add ``prefix + colname``: min-max scaling ``(x - min) / (max - min)``
        computed over the whole column."""
        column = self.df[colname]
        low = column.min()
        high = column.max()
        self.df[prefix + colname] = (column - low) / (high - low)
        return

    def add_standard_col(self, colname, prefix='standard_'):
        """Add ``prefix + colname``: z-score ``(x - mean) / std`` computed over
        the whole column."""
        column = self.df[colname]
        self.df[prefix + colname] = (column - column.mean()) / column.std()
        return

    def mean(self, colname, group_by='', addCol=False):
        """Mean of ``colname``, overall or per ``group_by`` group.

        Returns a scalar without ``group_by``; otherwise a DataFrame with the
        group key(s) and a ``mean_<colname>`` column. With ``addCol=True`` the
        value is also written to ``self.df`` as ``mean_<colname>`` or
        ``mean_<colname>_group_<group_by>``.
        """
        return self._aggregate('mean', colname, group_by, addCol)

    def max(self, colname, group_by='', addCol=False):
        """Maximum of ``colname``, overall or per ``group_by`` group.

        Same return / ``addCol`` conventions as ``mean`` with a ``max_`` prefix.
        """
        return self._aggregate('max', colname, group_by, addCol)

    def min(self, colname, group_by='', addCol=False):
        """Minimum of ``colname``, overall or per ``group_by`` group.

        Same return / ``addCol`` conventions as ``mean`` with a ``min_`` prefix.
        """
        return self._aggregate('min', colname, group_by, addCol)

    # ------------------------------------------------------------ visualization
    def plot_line(self, y='', x=''):
        """Line plot of the frame.

        ``x``: optional column used as the x axis (set as index).
        ``y``: optional column (or list of columns) to plot; when empty every
        column of the frame is passed to seaborn.
        """
        plt.figure(figsize=(16, 6))
        data = self.df
        if x:
            data = data.set_index(x)

        if y:
            sns.lineplot(data=data[y])
        else:
            sns.lineplot(data=data)
        return

    def plot_distribution_line(self, col):
        """Line plot of the values of ``col`` sorted from min to max.

        ``col`` may be one column name or a list of names (sorted by those
        columns in order).
        """
        plt.figure(figsize=(16, 6))
        if isinstance(col, str):
            # Series.sort_values takes no ``by`` argument.
            data = self.df[col].sort_values()
        else:
            data = self.df[col].sort_values(by=col)
        # Drop the old index so x is simply the rank of each value.
        sns.lineplot(data=data.reset_index(drop=True))
        return

    def distribution(self, cols, num_subplots_perrow=3):
        """Histogram (with KDE) of each column in ``cols``.

        ``cols`` may be one column name or a list; ``num_subplots_perrow`` sets
        the number of plots per grid row.
        """
        if isinstance(cols, str):
            cols = [cols]
        cols = list(cols)
        if not cols:  # nothing to draw; plt.subplots would fail with 0 rows
            return

        fig, axes = self._subplot_grid(len(cols), num_subplots_perrow)
        for ax, column in zip(axes, cols):
            sns.histplot(self.df[column], kde=True, ax=ax)
            ax.set_title(f'distrib {column}')
            ax.set_xlabel(column)
            ax.set_ylabel('Freq')

        # Hide the unused cells of the grid.
        for ax in axes[len(cols):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()
        return

    def scatter(self, colx, coly, num_subplots_perrow=3, limit_StdDev=1):
        """Plot ``coly`` against each column in ``colx``, one subplot per x column.

        Numeric x: scatter plot plus a LOWESS trend line, restricted to the rows
        within ``limit_StdDev`` standard deviations of the x mean.
        Object/category x: box plot of ``coly`` per category.
        Any other combination is marked as 'unsupported data'.
        """
        if isinstance(colx, str):
            colx = [colx]
        colx = list(colx)
        if not colx:  # nothing to draw; plt.subplots would fail with 0 rows
            return

        fig, axes = self._subplot_grid(len(colx), num_subplots_perrow)
        y_is_numeric = pd.api.types.is_numeric_dtype(self.df[coly])

        for ax, column in zip(axes, colx):
            x_dtype = self.df[column].dtype
            if y_is_numeric and pd.api.types.is_numeric_dtype(x_dtype):
                # Filter once and reuse for both layers.
                subset = self.get_percStdDev_data(column, limit_StdDev=limit_StdDev)
                sns.scatterplot(data=subset, x=column, y=coly, ax=ax, color='b')
                sns.regplot(data=subset, x=column, y=coly, ax=ax, scatter=False, color='r', lowess=True)
            elif y_is_numeric and x_dtype in ['category', 'object']:
                sns.boxplot(data=self.df, x=column, y=coly, ax=ax, color='b')
            else:
                ax.text(0.5, 0.5, 'unsupported data', ha='center', va='center', fontsize=12)

            ax.set_title(f'Distribuzione di {column}')
            ax.set_xlabel(column)
            ax.set_ylabel(coly)

        # Hide the unused cells of the grid.
        for ax in axes[len(colx):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()
                


def load_csv(file_path, parse_dates=True):
    """Read a CSV file with pandas and return it wrapped in a PandasHandler.

    ``file_path`` and ``parse_dates`` are forwarded unchanged to ``pd.read_csv``.
    """
    return PandasHandler(pd.read_csv(file_path, parse_dates=parse_dates))


