In [7]:
## System ##
import os

## Math ##
import pandas as pd

## Graphs ##
import matplotlib.pyplot as plt
import seaborn as sns

## Display ##
from IPython.display import display, HTML
import io
import base64
from io import BytesIO
import ipywidgets as widgets

def setup_working_directory():
    """
    Detects the sub-folders of the current working directory and publishes them.

    The global dictionary ``folder_paths`` is (re)built as
    ``{folder_name: absolute_path}`` and, for convenience, every folder name is
    also bound as a module-level global holding its path.

    Hidden folders (names starting with '.', such as ``.ipynb_checkpoints`` or
    ``.git``) are ignored. A folder whose name matches an existing module or
    function in the global namespace (for example a folder called ``os``), or the
    name ``folder_paths`` itself, is still listed in ``folder_paths`` but is NOT
    bound as a global, so it cannot overwrite that object.

    Returns:
    None
    """
    current_dir = os.getcwd()
    global folder_paths

    # Sorted so that the printed summary is the same on every platform.
    folder_paths = {}
    for entry in sorted(os.listdir(current_dir)):
        entry_path = os.path.join(current_dir, entry)
        if entry.startswith('.') or not os.path.isdir(entry_path):
            continue
        folder_paths[entry] = entry_path

    namespace = globals()
    module_type = type(os)
    for folder_name, folder_path in folder_paths.items():
        existing = namespace.get(folder_name)
        # Never overwrite imported modules, functions/classes, or the registry itself.
        if folder_name == 'folder_paths' or callable(existing) or isinstance(existing, module_type):
            print(f"Skipping global variable for folder '{folder_name}': the name is already in use.")
            continue
        namespace[folder_name] = folder_path

    # Print the folder paths
    print("Folder paths:", folder_paths)

def gather_dataframes():
    """
    Loads every CSV / XLSX file found in the folders listed in ``folder_paths``.

    Each file is read into a pandas DataFrame that is bound as a module-level
    global named after the file (without its extension). The folders are scanned
    completely before anything is loaded, so an error leaves the namespace
    untouched.

    Skipped silently: hidden folders, sub-directories, hidden files (names
    starting with '.') and Office lock files (names starting with '~$').
    File extensions are matched case-insensitively.

    Returns:
    None

    Raises:
    NameError: If ``folder_paths`` does not exist yet (run
        ``setup_working_directory()`` first).
    ValueError: If a visible regular file has an unsupported format, or if a
        file name would overwrite an existing module or function.
    """
    namespace = globals()
    if 'folder_paths' not in namespace:
        raise NameError("name 'folder_paths' is not defined; run setup_working_directory() first")

    readers = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}
    module_type = type(os)

    # Pass 1: decide what to load and validate everything up front.
    pending = []
    for folder_name, folder_path in folder_paths.items():
        if str(folder_name).startswith('.'):
            continue
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            # Checkpoint folders, .DS_Store, "~$book.xlsx" lock files, etc. are not data.
            if file.startswith(('.', '~$')) or not os.path.isfile(file_path):
                continue
            df_name, extension = os.path.splitext(file)
            reader = readers.get(extension.lower())
            if reader is None:
                raise ValueError(f"Unsupported file format for {file_path}.")
            existing = namespace.get(df_name)
            if callable(existing) or isinstance(existing, module_type):
                raise ValueError(
                    f"Cannot load {file_path}: the name '{df_name}' is already used by a function or module."
                )
            pending.append((df_name, reader, file_path))

    # Pass 2: read the files and publish them as globals.
    gathered_dataframes = []
    for df_name, reader, file_path in pending:
        namespace[df_name] = reader(file_path)
        gathered_dataframes.append(df_name)

    # Print the gathered dataframes
    print("Dataframes gathered:", gathered_dataframes)


def _png_img_html(savefig):
    """
    Renders a figure to an in-memory PNG and wraps it in an inline <img> tag.

    Parameters:
    savefig (callable): A ``savefig(buffer, format=...)`` callable, e.g.
        ``plt.savefig`` for the current figure or the ``savefig`` method of the
        grid returned by ``sns.pairplot``.

    Returns:
    str: An HTML <img> element whose source is the base64-encoded PNG.
    """
    buf = BytesIO()
    savefig(buf, format='png')
    encoded = base64.b64encode(buf.getvalue()).decode()
    return f'<img src="data:image/png;base64,{encoded}" />'


def display_preliminary_info(df, target_variable=None, sample_size=None, show_preview=True, show_basic_info=True, show_missing_values=True,
                             show_statistics=True, show_features=True, show_distributions=True,
                             show_bar_charts=True, show_correlation=True, show_pairplot=True,
                             show_value_counts=True):
    """
    Displays preliminary information for a given dataframe in a tabbed widget.

    Every enabled section is rendered into its own tab. Plots are embedded as
    inline PNG images and their figures are always closed afterwards, even when a
    plotting call fails. Text taken from the data (``df.info()`` output, column
    names) is HTML-escaped before being embedded in markup.

    Parameters:
    df (pd.DataFrame): The dataframe to analyze.
    target_variable (str): The target variable for additional analysis. The target
        is never compared with itself; if it is not a column of ``df`` a notice is
        printed and the target tab is omitted.
    sample_size (int or None): The maximum number of rows to use for analysis. If None, use the entire dataframe.
    show_preview (bool): Whether to display the preview of the dataframe.
    show_basic_info (bool): Whether to display the basic information of the dataframe.
    show_missing_values (bool): Whether to display the missing values in the dataframe.
    show_statistics (bool): Whether to display the basic statistics of the dataframe.
    show_features (bool): Whether to display the features of the dataframe.
    show_distributions (bool): Whether to display the distributions of numerical features.
    show_bar_charts (bool): Whether to display the bar charts for categorical features.
    show_correlation (bool): Whether to display the correlation heatmap for numerical features.
    show_pairplot (bool): Whether to display the pair plot for numerical features.
    show_value_counts (bool): Whether to display the value counts for categorical features.

    Returns:
    None
    """
    # Local import: keeps this function self-contained with respect to the notebook's import cell.
    from html import escape

    max_pairplot_rows = 500  # Pair plots are limited to this many sampled rows for performance

    if sample_size is not None and len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42)

    # Define features early so every section below can rely on them
    categorical_features = df.select_dtypes(include=['object']).columns
    numerical_features = df.select_dtypes(include=['number']).columns

    tabs = widgets.Tab()

    # One Output widget and one title per enabled section
    children = []
    titles = []

    # Preview of the dataframe
    if show_preview:
        preview_output = widgets.Output()
        with preview_output:
            display(HTML("<h2>Preview of the Dataframe</h2>"))
            display(HTML(df.head().to_html()))
        children.append(preview_output)
        titles.append("Preview")

    # Basic information
    if show_basic_info:
        basic_info_output = widgets.Output()
        with basic_info_output:
            display(HTML("<h2>Basic Information</h2>"))
            buffer = io.StringIO()
            df.info(buf=buffer)
            # df.info() starts with "<class '...DataFrame'>"; unescaped, a browser
            # treats that as a tag and drops the line.
            display(HTML(f"<pre>{escape(buffer.getvalue())}</pre>"))
        children.append(basic_info_output)
        titles.append("Basic Info")

    # Missing values
    if show_missing_values:
        missing_values_output = widgets.Output()
        with missing_values_output:
            display(HTML("<h2>Missing Values</h2>"))
            missing_values = df.isnull().sum()
            missing_values = missing_values[missing_values > 0]
            if missing_values.empty:
                display(HTML("<p>No missing values found.</p>"))
            else:
                display(HTML(missing_values.to_frame().to_html()))
        children.append(missing_values_output)
        titles.append("Missing Values")

    # Basic statistics
    if show_statistics:
        basic_stats_output = widgets.Output()
        with basic_stats_output:
            display(HTML("<h2>Basic Statistics</h2>"))
            display(HTML(df.describe().T.to_html()))
        children.append(basic_stats_output)
        titles.append("Statistics")

    # Categorical and numerical features
    if show_features:
        features_output = widgets.Output()
        with features_output:
            display(HTML("<h2>Features</h2>"))
            display(HTML(f"<h3>Categorical Features: {escape(str(categorical_features.tolist()))}</h3>"))
            display(HTML(f"<h3>Numerical Features: {escape(str(numerical_features.tolist()))}</h3>"))
        children.append(features_output)
        titles.append("Features")

    # Distributions for numerical features
    if show_distributions:
        distributions_output = widgets.Output()
        with distributions_output:
            display(HTML("<h2>Numerical Features</h2>"))
            for feature in numerical_features:
                plt.figure(figsize=(10, 5))
                try:
                    sns.histplot(df[feature], kde=True)
                    plt.title(f'Distribution of {feature}')
                    display(HTML(_png_img_html(plt.savefig)))
                finally:
                    plt.close()
        children.append(distributions_output)
        titles.append("Numerical Features")

    # Bar charts for categorical features
    if show_bar_charts:
        bar_charts_output = widgets.Output()
        with bar_charts_output:
            display(HTML("<h2>Categorical Features</h2>"))
            for feature in categorical_features:
                plt.figure(figsize=(10, 5))
                try:
                    sns.countplot(y=df[feature])
                    plt.title(f'Count of {feature}')
                    display(HTML(_png_img_html(plt.savefig)))
                finally:
                    plt.close()
        children.append(bar_charts_output)
        titles.append("Categorical Features")

    # Correlation heatmap for numerical features
    if show_correlation and len(numerical_features) > 1:
        correlation_output = widgets.Output()
        with correlation_output:
            display(HTML("<h2>Heatmap</h2>"))
            plt.figure(figsize=(12, 8))
            try:
                correlation_matrix = df[numerical_features].corr()
                sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
                plt.title('Correlation Heatmap')
                display(HTML(_png_img_html(plt.savefig)))
            finally:
                plt.close()
        children.append(correlation_output)
        titles.append("Heatmap")

    # Pairplot for numerical features
    if show_pairplot and len(numerical_features) > 1:
        pairplot_output = widgets.Output()
        with pairplot_output:
            display(HTML("<h2>Pairplot for Numerical Features</h2>"))
            sample_df = df[numerical_features].sample(n=min(max_pairplot_rows, len(df)), random_state=42)
            pairplot = sns.pairplot(sample_df)
            try:
                # Saved through the grid's own savefig, as the grid owns the figure
                display(HTML(_png_img_html(pairplot.savefig)))
            finally:
                plt.close()
        children.append(pairplot_output)
        titles.append("Pairplot")

    # Value counts for categorical features
    if show_value_counts:
        value_counts_output = widgets.Output()
        with value_counts_output:
            display(HTML("<h2>Value Counts for Categorical Features</h2>"))
            for feature in categorical_features:
                value_counts_html = f"<h3>Value Counts for {escape(str(feature))}</h3>"
                value_counts = df[feature].value_counts()
                value_counts_html += value_counts.to_frame().to_html()
                display(HTML(value_counts_html))
        children.append(value_counts_output)
        titles.append("Value Counts")

    # Target variable analysis
    if target_variable and target_variable in df.columns:
        target_label = escape(str(target_variable))
        # Any numeric dtype counts (int32, float32, nullable ints, ...), not only int64/float64
        target_is_numeric = target_variable in numerical_features
        # The target must not be plotted or correlated against itself, and must not
        # appear twice in the pairplot column selection.
        numerical_predictors = [col for col in numerical_features if col != target_variable]
        categorical_predictors = [col for col in categorical_features if col != target_variable]

        target_analysis_output = widgets.Output()
        with target_analysis_output:
            display(HTML(f"<h2>Target Variable Analysis: {target_label}</h2>"))

            # Distribution of the target variable
            plt.figure(figsize=(10, 5))
            try:
                sns.histplot(df[target_variable], kde=True)
                plt.title(f'Distribution of {target_variable}')
                display(HTML(_png_img_html(plt.savefig)))
            finally:
                plt.close()

            # Correlation with target variable if it's numeric
            if target_is_numeric and numerical_predictors:
                display(HTML("<h3>Correlation with Target Variable</h3>"))
                corr_with_target = (
                    df[numerical_predictors]
                    .corrwith(df[target_variable])
                    .sort_values(ascending=False)
                )
                display(HTML(corr_with_target.to_frame().to_html()))

            # Relationship between numerical features and target variable
            display(HTML("<h3>Relationship between Numerical Features and Target Variable</h3>"))
            for feature in numerical_predictors:
                plt.figure(figsize=(10, 5))
                try:
                    sns.boxplot(x=target_variable, y=feature, data=df)
                    plt.title(f'{feature} vs {target_variable}')
                    display(HTML(_png_img_html(plt.savefig)))
                finally:
                    plt.close()

            # Relationship between categorical features and target variable
            display(HTML("<h3>Relationship between Categorical Features and Target Variable</h3>"))
            for feature in categorical_predictors:
                plt.figure(figsize=(10, 5))
                try:
                    sns.countplot(x=target_variable, hue=feature, data=df)
                    plt.title(f'{feature} vs {target_variable}')
                    display(HTML(_png_img_html(plt.savefig)))
                finally:
                    plt.close()

            # Pairplot for target variable analysis
            if show_pairplot and numerical_predictors:
                display(HTML("<h3>Pairplot for Target Variable Analysis</h3>"))
                pairplot_columns = [target_variable] + numerical_predictors
                sample_df = df[pairplot_columns].sample(n=min(max_pairplot_rows, len(df)), random_state=42)
                pairplot = sns.pairplot(sample_df, hue=target_variable)
                try:
                    display(HTML(_png_img_html(pairplot.savefig)))
                finally:
                    plt.close()

        children.append(target_analysis_output)
        titles.append("Target Analysis")
    elif target_variable:
        # Make a misspelled / missing target visible instead of silently dropping the tab
        print(f"Target variable '{target_variable}' not found in the dataframe columns; skipping target analysis.")

    # Setting up tabs
    tabs.children = children

    for i, title in enumerate(titles):
        tabs.set_title(i, title)

    display(tabs)

In [2]:
# Discover the sub-folders of the working directory, then load every CSV/XLSX file
# they contain as a global DataFrame. Order matters: gather_dataframes() reads the
# `folder_paths` global that setup_working_directory() creates.
setup_working_directory()
gather_dataframes()

Folder paths: {'Data': 'c:\\Users\\carle\\Desktop\\5_projects\\projects_machinelearning\\ML_TelecomChurn\\Data'}
Dataframes gathered: ['telecom_churn_PREDICT', 'telecom_churn_TRAINTEST', 'variable_dictionary']


In [3]:
# Short aliases for the DataFrames that gather_dataframes() bound as globals
# (named after their source files, see the "Dataframes gathered" output above).
df_traintest = telecom_churn_TRAINTEST
df_predict = telecom_churn_PREDICT

In [9]:
# Profile the train/test frame with 'Churn' as the target, using at most 5000
# sampled rows and with the pair plots switched off.
display_preliminary_info(df_traintest, target_variable='Churn', sample_size=5000, show_pairplot=False)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output()), selec…