In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found,
# so the available input files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/), which is preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Install a blanket 'ignore' filter: every warning issued through the
# `warnings` module from this point on is suppressed for the whole session.
warnings.filterwarnings('ignore')

class PandasHandler:
    """Convenience wrapper around a ``pandas.DataFrame``.

    The wrapped frame is stored in ``self.df``. ``self.cols_containing_NaN``
    holds the names of the columns that currently contain at least one
    missing value; it is refreshed by every method of this class that drops
    rows/columns or fills missing values.
    """

    def __init__(self, data):
        """Wrap ``data`` (anything accepted by ``pd.DataFrame``)."""
        self.df = pd.DataFrame(data)
        self.cols_containing_NaN = self._get_columns_with_nan()

    @staticmethod
    def _is_unset(value):
        """Return True when an optional column / group-by selector is empty.

        Sized values (str, list, tuple, Index, ndarray, ...) are empty when
        their length is 0, which avoids the "truth value is ambiguous" error
        that ``not value`` raises for pandas/numpy containers. Values without
        a length fall back to plain truthiness.
        """
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            return not value

    def _get_columns_with_nan(self):
        """Return the names of the columns holding at least one missing value."""
        return self.df.columns[self.df.isna().any()].tolist()

    def _refresh_nan_columns(self):
        """Recompute ``cols_containing_NaN`` after the frame has been modified."""
        self.cols_containing_NaN = self._get_columns_with_nan()

    def get_df(self, columns=''):
        """Return the whole frame, or ``self.df[columns]`` when ``columns`` is given."""
        if self._is_unset(columns):
            return self.df
        return self.df[columns]

    def get_col_with_nan(self):
        """Return a freshly computed list of columns containing missing values."""
        return self._get_columns_with_nan()

    def get_numeric_cols(self):
        """Return the names of the columns selected by dtype ``'number'``."""
        return self.df.select_dtypes(include=['number']).columns.tolist()

    def get_text_cols(self):
        """Return the names of the columns selected by dtype ``'object'``."""
        return self.df.select_dtypes(include=['object']).columns.tolist()

    def get_categoric_cols(self):
        """Return the names of the columns selected by dtype ``'category'``."""
        return self.df.select_dtypes(include=['category']).columns.tolist()

    def display_col_with_nan(self):
        """Print missing-value counts and return the columns that contain any.

        Prints the total number of missing cells and the per-column counts
        (only columns with at least one, sorted descending). The returned
        list is recomputed on each call so it reflects the current frame.
        """
        nan_count = self.df.isnull().sum()
        print('total NaN:' + str(nan_count.sum()))
        print(f'Fields containing NaN:    {nan_count[nan_count>0].sort_values(ascending=False)}')
        self._refresh_nan_columns()
        return self.cols_containing_NaN

    def drop_row_with_nan_val(self, threshold=1, axis=0, inplace=True):
        """Drop rows (or columns, per ``axis``) with too few non-missing values.

        Parameters
        ----------
        threshold : int
            Forwarded to ``DataFrame.dropna`` as ``thresh``: the minimum
            number of non-missing values an entry needs in order to be kept.
        axis : {0, 1}
            Forwarded to ``DataFrame.dropna``.
        inplace : bool
            When True (default) ``self.df`` is modified in place and ``None``
            is returned. When False ``self.df`` is left untouched and the
            filtered frame is returned.
        """
        if inplace:
            # The pandas keyword is `thresh`; `threshold` is rejected with a TypeError.
            self.df.dropna(thresh=threshold, axis=axis, inplace=True)
            self._refresh_nan_columns()
            return None
        return self.df.dropna(thresh=threshold, axis=axis)

    def drop_col(self, colnames):
        """Drop the given column(s) from the wrapped frame (rebinds ``self.df``)."""
        self.df = self.df.drop(columns=colnames)
        self._refresh_nan_columns()

    def drop_all_cols_with_nan(self, perc_treshold=50):
        """Drop every column whose share of missing values exceeds a percentage.

        Parameters
        ----------
        perc_treshold : float
            Percentage in the range 0-100. A column is dropped when its
            percentage of missing values is strictly greater than this value,
            so ``perc_treshold=0`` drops every column containing any missing
            value.
        """
        nan_percentage = self.df.isna().mean() * 100
        to_drop = nan_percentage[nan_percentage > perc_treshold].index.tolist()
        if to_drop:
            self.drop_col(to_drop)

    def add_max_scaled_col(self, colname, group_by='', prefix='max_scaled_'):
        """Add ``prefix + colname`` = value divided by the maximum absolute value.

        Without ``group_by`` the maximum absolute value is taken over the
        whole column. With ``group_by`` it is taken per group, and the
        per-group maximum is also stored in the extra column
        ``'max_abs_grouped_' + colname``.
        """
        new_name = prefix + colname
        if self._is_unset(group_by):
            self.df[new_name] = self.df[colname] / self.df[colname].abs().max()
        else:
            grouped_name = 'max_abs_grouped_' + colname
            self.df[grouped_name] = self.df.groupby(group_by)[colname].transform(lambda x: x.abs().max())
            self.df[new_name] = self.df[colname] / self.df[grouped_name]

    def add_normalized_col(self, colname, prefix='normalized_'):
        """Add ``prefix + colname`` = min-max scaling: (x - min) / (max - min)."""
        new_name = prefix + colname
        col_min = self.df[colname].min()
        col_max = self.df[colname].max()
        self.df[new_name] = (self.df[colname] - col_min) / (col_max - col_min)

    def add_standard_col(self, colname, prefix='standard_'):
        """Add ``prefix + colname`` = (x - mean) / std of the column."""
        new_name = prefix + colname
        self.df[new_name] = (self.df[colname] - self.df[colname].mean()) / self.df[colname].std()

    def repair_na_cols_with_val(self, colname, val):
        """Replace the missing values of ``colname`` with ``val``."""
        self.df[colname] = self.df[colname].fillna(val)
        self._refresh_nan_columns()

    def get_avg_col(self, colname, group_by=''):
        """Return the column mean, or the per-group mean aligned to each row."""
        if self._is_unset(group_by):
            return self.df[colname].mean()
        return self.df.groupby(group_by)[colname].transform('mean')

    def get_max_col(self, colname, group_by=''):
        """Return the column maximum, or the per-group maximum aligned to each row."""
        if self._is_unset(group_by):
            return self.df[colname].max()
        return self.df.groupby(group_by)[colname].transform('max')

    def get_min_col(self, colname, group_by=''):
        """Return the column minimum, or the per-group minimum aligned to each row."""
        if self._is_unset(group_by):
            return self.df[colname].min()
        return self.df.groupby(group_by)[colname].transform('min')

    def plot_line(self, col='', index=''):
        """Draw a seaborn line plot of ``col`` (or of the whole frame).

        When ``index`` is given, that column is used as the index of the
        plotted data; ``self.df`` itself is not modified.
        """
        plt.figure(figsize=(16,6))
        data = self.df
        if not self._is_unset(index):
            data = data.set_index(index)

        if not self._is_unset(col):
            sns.lineplot(data=data[col])
        else:
            # `data` is already the DataFrame to plot; it has no `.df` attribute.
            sns.lineplot(data=data)

    def plot_distribution_line(self, col):
        """Draw the values of ``col`` sorted in ascending order as a line plot.

        ``col`` may be a single column label or a list of labels. The data is
        always selected as a DataFrame so ``sort_values(by=...)`` is valid
        (``Series.sort_values`` has no ``by`` argument). The original row
        labels are discarded so only the selected columns are drawn.
        """
        plt.figure(figsize=(16,6))
        cols = list(col) if pd.api.types.is_list_like(col) else [col]
        data = self.df[cols].sort_values(by=cols).reset_index(drop=True)
        sns.lineplot(data=data)

    def plot_distribution(self, cols):
        """Draw a histogram with KDE for each column in ``cols``, three per row.

        A single column name given as a string is treated as a one-element
        list. Nothing is drawn when ``cols`` is empty. Unused subplot slots in
        the last row are hidden.
        """
        if isinstance(cols, str):
            cols = [cols]
        n_plots = len(cols)
        if n_plots == 0:
            return

        n_rows = int(np.ceil(n_plots / 3))
        # Scale the height with the number of rows so multi-row grids stay readable.
        fig, axes = plt.subplots(nrows=n_rows, ncols=3, figsize=(15, 5 * n_rows))
        axes = axes.flatten()
        for ax, column in zip(axes, cols):
            sns.histplot(self.df[column], kde=True, ax=ax)
            ax.set_title(f'distrib {column}')
            ax.set_xlabel(column)
            ax.set_ylabel('Freq')

        for ax in axes[n_plots:]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

def load_csv(file_path, parse_dates=True):
    """Read a CSV file and return it wrapped in a ``PandasHandler``.

    ``file_path`` and ``parse_dates`` are forwarded unchanged to
    ``pd.read_csv``.
    """
    return PandasHandler(pd.read_csv(file_path, parse_dates=parse_dates))


