DATA UNDERSTANDING
Bank Dataset Analyzer
This class, BankDatasetAnalyzer, is designed to analyze a dataset related to banking. It provides various methods to gather insights about the dataset, identify patterns, and address data quality issues. This class is part of a broader data analysis and preprocessing process.

How It Works
Initialization
Import the necessary libraries: pandas, matplotlib.pyplot, seaborn, and numpy.

Define a class named BankDatasetAnalyzer that takes the path to the dataset as a parameter during initialization (dataset_path).

Inside the __init__ method, the dataset is loaded using pd.read_csv() with a semicolon (;) as the field delimiter and stored as an attribute (self.dataset).

Data Exploration and Analysis Methods
The class provides several methods to explore and analyze the dataset:

preview_dataset(num_rows): Prints the first and the last num_rows rows of the dataset (num_rows defaults to 5).

check_columns_list(): Returns a list of columns in the dataset.

explore_column_value_counts(): Prints the value counts for each column in the dataset.

get_dataset_shape(): Returns the number of rows and columns in the dataset.

get_dataset_info(): Displays information about the dataset, including data types and memory usage.

get_dataset_description(): Provides a statistical description of the dataset, including measures like mean, min, max, and quartiles.

analyze_duplicated_rows(): Counts and visualizes the percentage of duplicated rows in the dataset.

analyze_missing_values(): Counts and visualizes the percentage of missing values for each column in the dataset.

count_data_types(): Returns how many columns in the dataset have each data type.

count_unique_values_by_dtype(): Counts and prints the number of unique values for integer, float, and object columns.

Each method is designed to provide insights into different aspects of the dataset, helping to identify data quality issues, patterns, and trends.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

class BankDatasetAnalyzer:
    """Exploration helper for a bank dataset stored as a delimited text file.

    The file is read once at construction time into ``self.dataset`` (a
    pandas DataFrame); every method then inspects that DataFrame. Methods
    either return a value, print a report, or print a report and show a
    matplotlib figure.
    """

    def __init__(self, dataset_path, delimiter=";"):
        """Load the dataset.

        Args:
            dataset_path: Path (or file-like object) handed to ``pd.read_csv``.
            delimiter: Field separator used in the file. Defaults to ``";"``,
                the separator this class has always used.
        """
        self.dataset_path = dataset_path
        self.dataset = pd.read_csv(dataset_path, delimiter=delimiter)

    def preview_dataset(self, num_rows=5):
        """Print the first and the last ``num_rows`` rows of the dataset."""
        print(f"Preview of the dataset (first {num_rows} rows):")
        print(self.dataset.head(num_rows))
        print(f"Preview of the dataset (last {num_rows} rows):")
        print(self.dataset.tail(num_rows))

    def check_columns_list(self):
        """Return the column names of the dataset as a list."""
        return self.dataset.columns.tolist()

    def explore_column_value_counts(self):
        """Print ``value_counts()`` for every column of the dataset."""
        for column in self.dataset.columns:
            values = self.dataset[column].value_counts()
            print(f"Value Counts for Column '{column}':\n{values}\n")

    def get_dataset_shape(self):
        """Return the dataset shape as a ``(rows, columns)`` tuple."""
        return self.dataset.shape

    def get_dataset_info(self):
        """Print the ``DataFrame.info()`` report for the dataset.

        Returns:
            Whatever ``DataFrame.info()`` returns. ``info()`` writes its
            report to stdout and returns ``None``, so callers should not
            print the return value.
        """
        return self.dataset.info()

    def get_dataset_description(self):
        """Return ``DataFrame.describe()`` for the dataset."""
        return self.dataset.describe()

    def analyze_duplicated_rows(self):
        """Print the number of duplicated rows and plot their percentage.

        For an empty dataset only the count is printed: the percentage
        would be 0/0, so no chart is drawn.
        """
        duplicate_count = self.dataset.duplicated().sum()
        print("Duplicate Rows Count:", duplicate_count)

        total_rows = len(self.dataset)
        if total_rows == 0:
            # Avoid a 0/0 division that would yield NaN bars.
            print("Dataset is empty; skipping the duplicate rows chart.")
            return

        duplicate_percentage = (duplicate_count / total_rows) * 100

        # Bar chart: share of duplicated vs. non-duplicated rows.
        plt.figure(figsize=(6, 4))
        sns.barplot(x=["Duplicate Rows", "Non-Duplicate Rows"],
                    y=[duplicate_percentage, 100 - duplicate_percentage],
                    color='skyblue', edgecolor='black')
        plt.ylabel("Percentage (%)")
        plt.title("Percentage of Duplicated Rows")
        plt.tight_layout()
        plt.show()

    def analyze_missing_values(self):
        """Print per-column missing-value counts and plot their percentage.

        For an empty dataset only the counts are printed: the percentages
        would be 0/0, so no chart is drawn.
        """
        missing_values_count = self.dataset.isnull().sum()

        print("Missing Values Counts:")
        print(missing_values_count)

        total_rows = len(self.dataset)
        if total_rows == 0:
            # Avoid a 0/0 division that would yield NaN bars.
            print("Dataset is empty; skipping the missing values chart.")
            return

        # Reuse the counts computed above instead of scanning the data again.
        missing_percentage = (missing_values_count / total_rows) * 100

        # Bar chart: percentage of missing values per column.
        plt.figure(figsize=(10, 6))
        sns.barplot(x=missing_percentage.index, y=missing_percentage.values,
                    color='skyblue', edgecolor='black')
        plt.xlabel('Columns')
        plt.ylabel('Percentage of Missing Values')
        plt.title('Percentage of Missing Values in Each Column')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    def count_data_types(self):
        """Return how many columns have each dtype (``dtypes.value_counts()``)."""
        return self.dataset.dtypes.value_counts()

    def count_unique_values_by_dtype(self):
        """Print the number of unique values per column, grouped by dtype.

        Only columns whose dtype is exactly ``int64``, ``float64`` or
        ``object`` are reported; columns of any other dtype are not listed.
        """
        data_types = self.dataset.dtypes

        print("Unique Values Counts by Data Type:")

        # (heading, dtype name) pairs, printed in this order.
        sections = (
            ("Integer Columns:", 'int64'),
            ("\nFloat Columns:", 'float64'),
            ("\nObject Columns:", 'object'),
        )
        for heading, dtype_name in sections:
            columns = data_types[data_types == dtype_name].index
            print(heading)
            print(self.dataset[columns].nunique())

In [None]:
# Create an instance of BankDatasetAnalyzer; the constructor reads the CSV
# file at this path into analyzer.dataset via pd.read_csv.
analyzer = BankDatasetAnalyzer("data/bank/bank-full.csv")

In [None]:
# Preview the dataset: prints the first and last rows
# (num_rows is left at its default of 5)
analyzer.preview_dataset()

In [None]:
# Get the column names of the dataset as a plain Python list and print them
columns_list = analyzer.check_columns_list()
print("Columns in the dataset:", columns_list)

In [None]:
# Explore value counts: prints value_counts() for every column of the dataset
analyzer.explore_column_value_counts()

In [None]:
# Get the shape of the dataset as a (rows, columns) tuple and print it
dataset_shape = analyzer.get_dataset_shape()
print("Shape of the dataset:", dataset_shape)

In [None]:
# Get information about the dataset.
# get_dataset_info() forwards to DataFrame.info(), which prints its report
# itself and returns None. The header is therefore printed first, and the
# return value is kept only for reference (printing it would show "None").
print("Information about the dataset:")
dataset_info = analyzer.get_dataset_info()

In [None]:
# Get a statistical description of the dataset (the result of
# DataFrame.describe()) and print it
dataset_description = analyzer.get_dataset_description()
print("Statistical description of the dataset:")
print(dataset_description)

In [None]:
# Analyze duplicate rows: prints the duplicate count and shows a bar chart
# of the duplicate vs. non-duplicate percentage
analyzer.analyze_duplicated_rows()

In [None]:
# Analyze missing values: prints the per-column null counts and shows a bar
# chart of the percentage of missing values in each column
analyzer.analyze_missing_values()

In [None]:
# Count how many columns of the dataset have each data type
# (dtypes.value_counts()) and print the result
data_type_counts = analyzer.count_data_types()
print("Data types counts:")
print(data_type_counts)

In [None]:
# Count unique values by data type: prints nunique() for the int64,
# float64 and object columns
analyzer.count_unique_values_by_dtype()

EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

class UnivariateVisualizer:
    """Plot the distribution of each column of a DataFrame, one subplot per column.

    Columns are split at construction time: ``object`` columns are treated
    as categorical and ``int64``/``float64`` columns as continuous.
    """

    def __init__(self, data):
        """Store ``data`` and classify its columns by dtype.

        Args:
            data: DataFrame to visualize.

        Side effect: sets the global seaborn palette to ``"pastel"``.
        """
        self.data = data
        self.categorical_vars = self.data.select_dtypes(include=['object']).columns.tolist()
        self.continuous_vars = self.data.select_dtypes(include=['int64', 'float64']).columns.tolist()
        sns.set_palette("pastel")

    def plot_categorical_distribution(self):
        """Show a count plot for every categorical column, two per row.

        Prints a short notice and draws nothing when there are no
        categorical columns.
        """
        self._plot_grid(self.categorical_vars, 2, self._draw_count, "categorical")

    def plot_continuous_distribution(self):
        """Show a 20-bin histogram with KDE for every continuous column, three per row.

        Prints a short notice and draws nothing when there are no
        continuous columns.
        """
        self._plot_grid(self.continuous_vars, 3, self._draw_histogram, "continuous")

    def _draw_count(self, variable, ax):
        """Draw a count plot of ``variable`` on ``ax``."""
        sns.countplot(data=self.data, x=variable, ax=ax)

    def _draw_histogram(self, variable, ax):
        """Draw a 20-bin histogram with a KDE curve of ``variable`` on ``ax``."""
        sns.histplot(data=self.data, x=variable, bins=20, kde=True, ax=ax)

    def _plot_grid(self, variables, num_cols, draw, kind):
        """Lay out one subplot per variable in a ``num_cols``-wide grid and show it.

        Args:
            variables: Column names to plot, in order.
            num_cols: Number of subplot columns in the grid.
            draw: Callable ``draw(variable, ax)`` that renders one subplot.
            kind: Word used in the notice printed when ``variables`` is empty.
        """
        if not variables:
            # plt.subplots cannot build a grid with zero rows.
            print(f"No {kind} variables to plot.")
            return

        num_rows = (len(variables) + num_cols - 1) // num_cols  # ceiling division

        # squeeze=False keeps axes a 2-D array whatever the grid size is.
        _, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                               figsize=(18, 6 * num_rows), squeeze=False)
        flat_axes = axes.flatten()

        for ax, variable in zip(flat_axes, variables):
            draw(variable, ax)
            ax.set_title(f'Distribution of {variable}\n')
            ax.set_ylabel('Count')
            ax.set_xlabel(variable)
            ax.tick_params(axis='x', rotation=45)

        # Hide grid cells left over when the last row is not full.
        for ax in flat_axes[len(variables):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()
# Create an instance of the UnivariateVisualizer class from the DataFrame
# held by the analyzer (analyzer.dataset)
visualizer = UnivariateVisualizer(analyzer.dataset)
# Plot categorical distribution: one count plot per object-dtype column
visualizer.plot_categorical_distribution()