In [22]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML, display_html
import ipywidgets as widgets

In [30]:
def setup_working_directory():
    """
    Sets up the working directory by detecting the current working directory
    and listing the folders in it. Creates global variables for each folder.

    Returns:
    None
    """
    current_dir = os.getcwd()
    global folder_paths
    folder_paths = {folder: os.path.join(current_dir, folder) for folder in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, folder))}
    
    for folder_name, folder_path in folder_paths.items():
        globals()[folder_name] = folder_path
    
    # Print the folder paths
    print("Folder paths:", folder_paths)

def gather_dataframes():
    """
    Gathers dataframes from all files in the specified folders.
    Creates global variables for each dataframe.

    Returns:
    None
    """
    gathered_dataframes = []
    for folder_name, folder_path in folder_paths.items():
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if file.endswith('.csv'):
                df_name = os.path.splitext(file)[0]
                globals()[df_name] = pd.read_csv(file_path)
                gathered_dataframes.append(df_name)
            elif file.endswith('.xlsx'):
                df_name = os.path.splitext(file)[0]
                globals()[df_name] = pd.read_excel(file_path)
                gathered_dataframes.append(df_name)
            else:
                raise ValueError(f"Unsupported file format for {file_path}.")
    
    # Print the gathered dataframes
    print("Dataframes gathered:", gathered_dataframes)



def preliminary_data_dashboard(df):
    """
    Display a preliminary analysis of a DataFrame including:
    - First few rows of the DataFrame
    - Information about the DataFrame
    - Total count of null values in each column
    - Check for duplicate rows
    - Basic descriptive statistics
    
    Parameters:
    df (pd.DataFrame): The DataFrame to be analyzed.
    """
    # Create tabs
    tab = widgets.Tab()
    tab_contents = ['Basic Info', 'Head', 'Missing Values', 'Duplicates', 'Descriptive Stats']
    children = [widgets.Output() for _ in tab_contents]
    tab.children = children
    for i, title in enumerate(tab_contents):
        tab.set_title(i, title)
    
    # Basic Info
    with children[0]:
        print("Basic Information")
        buffer = []
        df.info(buf=buffer)
        info_str = ''.join(buffer)
        display(HTML(f"<pre>{info_str}</pre>"))
    
    # Head
    with children[1]:
        print("Head of the DataFrame:")
        display(df.head())
    
    # Missing Values
    with children[2]:
        print("Total null values in each column:")
        nulls = df.isnull().sum().to_frame('Total Nulls').T
        display(nulls)
        
        # Visualization for null values
        plt.figure(figsize=(10, 6))
        df.isnull().sum().plot(kind='bar')
        plt.title('Total Null Values in Each Column')
        plt.xlabel('Columns')
        plt.ylabel('Total Null Values')
        plt.xticks(rotation=45)
        plt.show()
    
    # Duplicates
    with children[3]:
        print("Checking for duplicates...")
        duplicate_rows = df[df.duplicated()]
        if duplicate_rows.empty:
            display(HTML("<p>No duplicate rows found.</p>"))
        else:
            display(HTML(f"<p>Found {len(duplicate_rows)} duplicate rows:</p>"))
            display(duplicate_rows)
    
    # Descriptive Statistics
    with children[4]:
        print("Basic descriptive statistics:")
        display(df.describe())
    
    display(tab)

In [26]:
setup_working_directory()
gather_dataframes()

Folder paths: {'Data': 'c:\\Users\\carle\\Desktop\\5_projects\\projects_machinelearning\\ML_KaggleBenchmark\\Data'}
Dataframes gathered: ['ts_kaggle_train']


In [27]:
df = ts_kaggle_train

In [31]:
preliminary_data(df)

Head of the DataFrame:
   Unnamed: 0        date unique_id  city_id  shop_id  item_category_id  \
0           0  2013-01-31  12_11365       16       12                 9   
1           1  2013-01-31  12_11369       16       12                 9   
2           2  2013-01-31  12_11370       16       12                 9   
3           3  2013-01-31  12_11373       16       12                 9   
4           4  2013-01-31  12_12231       16       12                49   

   item_id  monthly_average_price  monthly_sales  
0    11365            1511.512626           30.0  
1    11369             572.715278           30.0  
2    11370             699.776786           30.0  
3    11373             548.900000            6.0  
4    12231             350.000000            3.0   

Info of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14484 entries, 0 to 14483
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 -------