<a href="https://colab.research.google.com/github/Jnitz04/Nitz/blob/main/DataPrep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

# Show every column when a DataFrame is printed or displayed.
pd.set_option('display.max_columns', None)


class DataPrep:
    '''
    Helpers for preparing a pandas DataFrame for analysis.

    The imports this class relies on (pandas as pd, numpy as np,
    plotly.express as px, dash) live at module level, above the class.
    They must not sit between the class docstring and the methods:
    a statement at column 0 ends the class body, which leaves the
    indented methods below it as an IndentationError.

    __init__(self, data): Stores the input data on the instance
    cat_impact(data,target,cat_var): Compares the mean of a numeric target within each category to the overall mean
    hist(data,x_var,num_bins=30): Makes Histograms
    snake_col(df): Converts all column names of a dataframe to 'snake case'.  snake_case, my_data, retail_price, yoy_sales
    drop_useless(df): Removes columns that do not have interesting data because they are empty or have the same data for every category
    outliers2median(df, column, threshold=3): Replace outliers with the median
    reformat_data(self): Reformat the data.
    explore_data(self): Explore the data.
    clean_dirty(self): Clean up missing or extreme values.
    transform_data(self): Perform data transformations and aggregations.
    plots(self): Display plots summarizing insights in the data.

    '''

    def __init__(self, data):
        """
        Initialize a DataPrep instance with the input data.

        Parameters:
            data (pd.DataFrame): The input data as a Pandas DataFrame.
                It is stored as-is on ``self.data``; no copy is made.
        """
        self.data = data

    def cat_impact(data,target,cat_var):
        '''
        Prediction of numeric target variables often involves
        the use of categorical variables.  Important categorical
        variables are "encoded" - each value of the categorical
        variable is transformed into its own variable containing
        0s and 1s to indicate when that value is "on" or when
        it is "off".  Typically, we don't want to include variables
        that don't show any evidence of having an actual impact
        on the thing we're trying to predict.  This function
        helps us quickly determine where to focus our attention
        when it comes to categorical variables and their impacts
        on another variable of interest.
        '''
        # Compute mean of target variable over ALL records
        # we'll use this for comparison purposes
        global_mean = data[target].mean()

        df_group = data.groupby(by=cat_var)[target].agg(['mean']).round(2)
        df_group['diff'] = (df_group['mean'] - global_mean).round(2)
        df_group['diff_pct'] = (df_group['diff']/global_mean).round(3)
        df_group['Message'] = df_group['diff_pct'].apply(
                lambda x: "Effects Target" if abs(x) > 0.1 else "No Impact")

        display(df_group)

    @staticmethod
    def hist(data,x_var,num_bins=30,y_title='Percentage of Beers'):
        '''
        Draw a percent-normalized histogram of one column with plotly express.

        Parameters:
            data (pd.DataFrame): Data containing the column to plot.
            x_var (str): Name of the column to plot; also used for the
                plot title and the X-axis title.
            num_bins (int): Value passed to plotly as ``nbins``.
            y_title (str): Y-axis title. The default keeps the label the
                function has always used; pass another label for other
                datasets.

        Returns:
            None. The figure is shown with ``fig.show()``.

        Declared as a staticmethod because it takes no ``self``.
        '''
        fig = px.histogram(
            data,
            x = x_var,
            nbins = num_bins,  # Adjust the number of bins
            histnorm='percent',  # Set histogram normalization
            title=f'Distribution of {x_var}',  # Set the plot title
            template='plotly_dark',  # Use a dark theme
            color_discrete_sequence=['royalblue'],  # Set bar color
            height=300,
            width=700
            )

        # Customize the layout
        fig.update_xaxes(title_text=f'{x_var}')  # Set X-axis title
        fig.update_yaxes(title_text=y_title)  # Set Y-axis title
        fig.update_traces(marker=dict(line=dict(color='black', width=1)))  # Add black borders to bars

        # Show the plot
        fig.show()

    def snake_col(df):
    """
    Converts all column names of a dataframe to 'snake case'.  snake_case, my_data, retail_price, yoy_sales
    If changes are made to the original column names,
    returns a dataframe with two columns: 'original_name' and 'new_name'.

    Parameters:
    - df (DataFrame): The dataframe with columns to be renamed.

    Returns:
    - changes_df (DataFrame) or None: A dataframe with 'original_name' and 'new_name' columns if changes were made, else None.
    """
    original_names = df.columns.tolist()
    new_names = []
    for col in original_names:
        col = col.replace('#', 'number')  # Replace '#' with 'number'
        col = col.lower().replace(' ', '_')
        # Remove any other non-alphanumeric characters (except underscores)
        col = ''.join(ch for ch in col if ch.isalnum() or ch == '_')
        new_names.append(col)
    df.columns = new_names

    # Check if any changes were made
    changes = [(orig, new) for orig, new in zip(original_names, new_names) if orig != new]
    if changes:
        changes_df = pd.DataFrame(changes, columns=['original_name', 'new_name'])
        print(changes_df)
        return df
    else:
        return df

    def drop_useless(df):
        '''
        Removes columns that do not have interesting data because they are empty or have the same data for every category
        '''
        unique_counts = {}
        for col in df.columns:
            num_unique = df[col].nunique() # How many unique numbersin each Col.
            if num_unique < 2:
                unique_counts[col] = num_unique
        order = 0
        for key, value in unique_counts.items(): # Prints values that were dropped
            print(f"{order} {key}: {value}")
            order += 1
            df.head(3)
        keep_col = input("Delete all files listed? y or n")
        cont = 0
        while cont < 3:
            if cont >= 2:
                df.drop(columns=list(unique_counts.keys()), inplace=True)
                cont = 5
                return df
            else:
                if keep_col == 'y':
                    df.drop(columns=list(unique_counts.keys()), inplace=True)
                    cont = 5
                    return df
                elif keep_col == 'n':
                    print("Deletion Aborted")
                    cont = 3
                else:
                    warning = ['Oops! your entry was in the wrong format. Please type y for yes OR n for no', 'Last chance! Type n to cancel or anything else to continues']
                    keep_col = input(warning[cont])
                    cont += 1

    def outliers2median(df, column, threshold=3):
        median = df[column].median()
        std_dev = df[column].std()
        row_filter = np.abs(df[column] - median) > threshold * std_dev
        num_outliers = row_filter.sum()  # Count the number of outliers
        df.loc[row_filter, column] = median

        # Print the number of records replaced
        print(f"Number of records in '{column}' replaced with median: {num_outliers}")
        return df

    def reformat_data(self):
        """
        Announce step 1 (reformatting the data).

        Stub: only the step message is printed; no reformatting is
        implemented yet.
        """
        # Add code here to reformat data
        step_message = "Step 1: Reformatting data"
        print(step_message)

    def explore_data(self):
        """
        Announce step 2 (exploring the data).

        Stub: only the step message is printed; no exploration is
        implemented yet.
        """
        # Add code here for data exploration
        step_message = "Step 2: Exploring data"
        print(step_message)

    def clean_dirty(self):
        """
        Announce step 3 (cleaning up missing or extreme values).

        Stub: only the step message is printed; no cleaning is
        implemented yet.
        """
        # Add code here for data cleaning
        step_message = "Step 3: Cleaning up missing or extreme values"
        print(step_message)

    def transform_data(self):
        """
        Announce step 4 (data transformations and aggregations).

        Stub: only the step message is printed; no transformation or
        aggregation is implemented yet.
        """
        # Add code here for data transformations and aggregations
        step_message = "Step 4: Performing data transformations and aggregations"
        print(step_message)


    def plots(self):
        """
        Announce step 5 (plots summarizing insights in the data).

        Stub: only the step message is printed; no plots are produced yet.
        """
        # Add code here for displaying plots
        step_message = "Step 5: Displaying plots summarizing insights in the data"
        print(step_message)



