## Load the Code

### Imports and Paths

In [13]:
import chardet
import os
import pandas as pd
import numpy as np
import sys
# sys.path.append(r'C:\Users\Windows\Dropbox\James\Python\01_Media Archive Scripts')
from dataframe_handler import DataFrameHandler
read_file = r'C:\Users\Windows\Downloads\archive (7)\netflix1.csv'
from IPython.display import display

### Set Pandas Display Options

In [14]:
# Widen pandas' rendering limits so the summary tables below are never truncated.
_display_settings = {
    'display.max_columns': None,   # show every column
    'display.max_rows': None,      # show every row
    'display.max_colwidth': None,  # never shorten cell contents
    'display.width': 1000,         # characters available per output line
}
for _option_name, _option_value in _display_settings.items():
    pd.set_option(_option_name, _option_value)

### Load Functions

#### Load to dataframe

In [15]:
def detect_encoding(file_path, sample_size=10000):
    """
    Guess the text encoding of a file from its leading bytes using chardet.

    Args:
        file_path (str): Path of the file to inspect.
        sample_size (int): Number of bytes read from the start of the file and
            handed to chardet. Defaults to 10000.

    Returns:
        The value of the 'encoding' entry of chardet's result
        (NOTE(review): presumably None when chardet cannot decide - confirm
        against the chardet documentation before relying on it).

    Raises:
        FileNotFoundError: If file_path is not an existing file.
        IOError: If the file cannot be read; the original error is chained.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file at {file_path} does not exist.")

    # Only the read itself can raise IOError, so keep the try block minimal.
    try:
        with open(file_path, 'rb') as handle:
            raw_sample = handle.read(sample_size)
    except IOError as e:
        raise IOError(f"An error occurred while reading the file: {e}") from e

    return chardet.detect(raw_sample)['encoding']

def detect_delimiter(file_path, sample_size=1000):
    """
    Guess the field delimiter of a text file from a sample of its first characters.

    Candidates are ',', tab, ';', '|' and ' '. Each candidate is scored by how
    many sampled lines contain it the same (non-zero) number of times, because a
    real delimiter repeats consistently on every row. Raw character counts are
    only a tie-breaker: free-text fields contain many spaces, so counting alone
    would pick ' ' for most CSV files that hold sentences.

    Args:
        file_path (str): Path of the file to inspect.
        sample_size (int): Number of characters to read from the start of the file.

    Returns:
        str: The best-scoring candidate. ',' is returned when no candidate
        appears in the sample at all (this includes an empty file).
    """
    from collections import Counter

    candidates = [',', '\t', ';', '|', ' ']

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        sample = handle.read(sample_size)

    lines = sample.splitlines()
    # A full sample most likely cut the last line short; its counts would be misleading.
    if len(sample) >= sample_size and len(lines) > 1:
        lines = lines[:-1]
    lines = [line for line in lines if line.strip()]

    def score(delim):
        per_line = [line.count(delim) for line in lines]
        present = [count for count in per_line if count]
        if not present:
            return (0, delim != ' ', 0)
        # Number of lines that share the most common non-zero count.
        consistent_lines = Counter(present).most_common(1)[0][1]
        # Space loses ties against every other candidate; total count breaks the rest.
        return (consistent_lines, delim != ' ', sum(per_line))

    # max() keeps the first best-scoring candidate, so ',' wins when nothing is found.
    return max(candidates, key=score)

def _read_text_file(file_path, encoding, force_plain_text, retrying=False):
    """Load a delimited or plain-text file with the given encoding (helper for read_file)."""
    if force_plain_text:
        if retrying:
            print(f"Reattempting with detected encoding '{encoding}' as plain text")
        else:
            print(f"Loading {file_path} as plain text")
        with open(file_path, 'r', encoding=encoding, errors='ignore') as handle:
            content = handle.read()
        return pd.DataFrame({'content': [content]})

    delimiter = detect_delimiter(file_path)
    if delimiter:
        if retrying:
            print(f"Reattempting with detected encoding '{encoding}' and delimiter '{delimiter}'")
        else:
            print(f"Detected delimiter for {file_path}: '{delimiter}'")
        return pd.read_csv(file_path, delimiter=delimiter, encoding=encoding, on_bad_lines='skip')

    if retrying:
        print(f"Reattempting with detected encoding '{encoding}' and single column")
    else:
        print(f"No clear delimiter detected for {file_path}, reading as single column")
    # pandas rejects '\n' as a separator, so build the one-column frame by hand.
    # Decoding stays strict so a wrong encoding still triggers read_file's retry.
    with open(file_path, 'r', encoding=encoding) as handle:
        rows = [line for line in handle.read().splitlines() if line]
    return pd.DataFrame({'content': rows})


def read_file(file_path, encoding='utf-8', sheet_name=None, force_plain_text=False):
    """
    Load a single file into a DataFrame, choosing the reader from the file extension.

    Supported extensions: csv, tsv, txt, log (delimited or plain text), xls, xlsx,
    json, parquet, hdf / h5 / hdf5, and feather. When a text or JSON file fails
    with a decoding or parsing error, the encoding is re-detected with
    detect_encoding and the load is attempted once more.

    Args:
        file_path (str): Path of the file to load.
        encoding (str): Encoding tried first for text and JSON files.
        sheet_name: Excel sheet to load. None loads the first sheet, so the
            result is always a single DataFrame rather than a dict of sheets.
        force_plain_text (bool): If True, a text file is returned as a one-row
            DataFrame whose 'content' column holds the whole file.

    Returns:
        pd.DataFrame: The loaded data.

    Raises:
        ValueError: If the extension is not supported.
        RuntimeError: If the retry with the detected encoding also fails, or the
            file type cannot be retried.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()[1:]
    text_extensions = ('csv', 'tsv', 'txt', 'log')

    try:
        if file_extension in text_extensions:
            return _read_text_file(file_path, encoding, force_plain_text)
        if file_extension in ('xls', 'xlsx'):
            # openpyxl cannot open legacy .xls files; let pandas choose the engine there.
            engine = 'openpyxl' if file_extension == 'xlsx' else None
            # sheet_name=None would make pandas return a dict of every sheet.
            sheet = 0 if sheet_name is None else sheet_name
            return pd.read_excel(file_path, sheet_name=sheet, engine=engine)
        if file_extension == 'json':
            return pd.read_json(file_path, encoding=encoding)
        if file_extension == 'parquet':
            return pd.read_parquet(file_path)
        if file_extension in ('hdf', 'h5', 'hdf5'):
            return pd.read_hdf(file_path)
        if file_extension == 'feather':
            return pd.read_feather(file_path)
        raise ValueError("Unsupported file type.")
    except (UnicodeDecodeError, pd.errors.ParserError):
        detected_encoding = detect_encoding(file_path)
        try:
            if file_extension in text_extensions:
                return _read_text_file(file_path, detected_encoding, force_plain_text, retrying=True)
            if file_extension == 'json':
                # JSON must be retried with the JSON reader, not parsed as CSV.
                print(f"Reattempting with detected encoding '{detected_encoding}'")
                return pd.read_json(file_path, encoding=detected_encoding)
            raise RuntimeError("Unsupported file type for encoding detection.")
        except Exception as e:
            raise RuntimeError(f"An error occurred while loading the file: {e}") from e

def load_to_dataframe(file_or_folder_path, file_type=None, sheet_name=None, force_plain_text=False):
    """
    Load one file, or every matching file under a folder, into a single DataFrame.

    For a folder, the tree is walked recursively and every file whose lowercase
    name ends with file_type is loaded through read_file. A file that cannot be
    loaded that way is read as raw text into a one-row frame with a 'content'
    column; if that also fails the file is skipped with a printed message. All
    frames are then aligned to the union of their columns (in first-seen order,
    missing cells filled with pd.NA) and concatenated.

    Args:
        file_or_folder_path (str): Path of a file or a folder.
        file_type (str): Filename suffix to match, e.g. "csv". Required for
            folders; not used for a single file.
        sheet_name: Passed through to read_file.
        force_plain_text (bool): Passed through to read_file.

    Returns:
        pd.DataFrame: The loaded (and, for folders, combined) data.

    Raises:
        ValueError: If a folder is given without file_type, if no file in the
            folder could be loaded, or if the path is neither a file nor a folder.
    """
    if os.path.isdir(file_or_folder_path):
        if file_type is None:
            raise ValueError("File type must be specified when a folder is provided.")

        wanted_suffix = file_type.lower()
        frames = []
        # dict keys act as an insertion-ordered set, which keeps column order reproducible.
        column_order = {}

        for root, _, files in os.walk(file_or_folder_path):
            for file_name in files:
                if not file_name.lower().endswith(wanted_suffix):
                    continue

                file_path = os.path.join(root, file_name)
                print(f"Processing file: {file_path}")
                try:
                    frame = read_file(file_path, sheet_name=sheet_name, force_plain_text=force_plain_text)
                    frame_columns = list(frame.columns)
                except Exception as e:
                    print(f"Warning: Could not process file {file_path}: {e}")
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
                            content = handle.read()
                    except Exception as fallback_e:
                        print(f"Fallback failed for file {file_path}: {fallback_e}")
                        continue
                    frame = pd.DataFrame({'content': [content]})
                    frame_columns = ['content']

                # Register fallback columns too; otherwise 'content' is dropped below.
                column_order.update(dict.fromkeys(frame_columns))
                frames.append(frame)

        if not frames:
            raise ValueError("No valid files found in the specified folder.")

        # Standardize columns across all DataFrames
        ordered_columns = list(column_order)
        standardized_frames = []
        for frame in frames:
            for col in ordered_columns:
                if col not in frame.columns:
                    frame[col] = pd.NA
            standardized_frames.append(frame[ordered_columns])

        return pd.concat(standardized_frames, ignore_index=True)

    if os.path.isfile(file_or_folder_path):
        return read_file(file_or_folder_path, sheet_name=sheet_name, force_plain_text=force_plain_text)

    raise ValueError(f"The path {file_or_folder_path} is neither a file nor a folder.")

#### Open in Excel

In [16]:
def open_csv_in_excel(file_path):
    """
    Open a file in Microsoft Excel.

    The two standard Office16 install locations are checked first. If neither
    exists, `find_excel_exe` walks "C:\\Program Files", "C:\\Program Files (x86)"
    and then the whole C: drive looking for EXCEL.EXE, which can take a long
    time. Excel is started with `subprocess.Popen` and is not waited for.

    Args:
        file_path (str): The path handed to Excel as its only argument. It is
            not validated here.

    Returns:
        None

    Notes:
        - If Excel cannot be found, a message is printed and nothing is raised.
        - Errors from starting the process (e.g. OSError from Popen) propagate
          to the caller.
    """
    # Imported here because it is not among the file's top-level imports.
    import subprocess

    excel_paths = [r"C:\Program Files (x86)\Microsoft Office\root\Office16\EXCEL.EXE", r"C:\Program Files\Microsoft Office\root\Office16\EXCEL.EXE"]

    excel_program_path = None

    # No early exit: when both installs exist, the later entry (64-bit path) is used.
    for excel_path in excel_paths:
        if os.path.exists(excel_path):
            excel_program_path = excel_path

    if not excel_program_path:
        # Directories to search for EXCEL.EXE
        directories_to_search = [
            r"C:\Program Files",
            r"C:\Program Files (x86)",
            r"C:\\",
            # Add more directories to search if needed
        ]

        excel_program_path = find_excel_exe(directories_to_search)

    if excel_program_path:
        # Argument list (no shell) so paths with spaces need no quoting.
        subprocess.Popen([excel_program_path, file_path])
    else:
        print("Sorry. Cannot open file directly. Excel cannot be found")

def find_excel_exe(directories):
    """
    Search directory trees for an Excel executable.

    Args:
        directories: Iterable of root directories, searched in the given order.

    Returns:
        The full path of the first file whose name equals 'excel.exe'
        (case-insensitive), or None if no such file is found.
    """
    for search_root in directories:
        for folder, _subfolders, filenames in os.walk(search_root):
            hit = next((name for name in filenames if name.lower() == 'excel.exe'), None)
            if hit is not None:
                return os.path.join(folder, hit)
    return None

def open_in_excel(filename):
    """
    Report where a file was written and offer to open it in Excel.

    Prints an error message when `filename` does not exist. Otherwise prints the
    path, prompts on stdin, and calls `open_csv_in_excel` when the answer is
    "y" (upper or lower case).

    Args:
        filename (str): Path of the report file.

    Returns:
        None
    """
    if not os.path.exists(filename):
        print("Error writing report.")
        return

    print(f"Report written to: {filename}")
    answer = input("Open file in Excel? Y/N")
    if answer.lower() == "y":
        open_csv_in_excel(filename)

#### Dataframe Summaries

##### Overall Summary

In [17]:
def overall_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize a DataFrame as a two-column table of whole-frame metrics.

    Parameters:
        df (pd.DataFrame): The DataFrame to summarize.

    Returns:
        pd.DataFrame: One row per metric with columns 'Metric' and 'Value'.
        Every value is a string: counts use thousands separators, memory is in
        KB with one decimal, and the dtype breakdown is a newline-separated
        list of "dtype: count" entries.

    Raises:
        ValueError: If df is None.
    """
    if df is None:
        raise ValueError("DataFrame is not set.")

    row_count, column_count = df.shape
    memory_kb = (df.memory_usage(deep=True).sum() / 1024).round(1)
    missing_cells = df.isna().sum().sum()
    duplicate_rows = df.duplicated().sum()
    distinct_values = df.nunique().sum()   # per-column distinct counts, summed
    filled_cells = df.notna().sum().sum()

    dtype_lines = [f"{dtype}: {count:,}" for dtype, count in df.dtypes.value_counts().items()]

    # (label, formatted value) pairs in report order
    report = [
        ('Total Rows', f"{row_count:,}"),
        ('Total Columns', f"{column_count:,}"),
        ('Total Memory Usage (KB)', f"{memory_kb:,.1f}"),
        ('Total Missing Values', f"{missing_cells:,}"),
        ('Total Duplicates', f"{duplicate_rows:,}"),
        ('Total Unique Values', f"{distinct_values:,}"),
        ('Total Non-Null Values', f"{filled_cells:,}"),
        ('Data Types Count', "\n".join(dtype_lines)),
    ]

    return pd.DataFrame({
        'Metric': [label for label, _ in report],
        'Value': [value for _, value in report],
    })

##### Column Summaries

In [18]:
def data_types_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-column profile of a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to profile.

    Returns:
        pd.DataFrame: One row per column of df (indexed by column name) with the
        dtype, null / non-null counts, total row count, percentage of non-null
        values, number of distinct values, memory usage in KB, most frequent
        value, number of values that occur exactly once, and the number of
        empty or whitespace-only strings (0 for non-string columns).

    Raises:
        ValueError: If df is None.
    """
    if df is None:
        raise ValueError("DataFrame is not set.")

    total_rows = df.shape[0]
    non_null_counts = df.notna().sum()  # computed once, used for two output columns

    def get_most_frequent(column: pd.Series):
        # Best effort: a column whose mode cannot be computed reports None.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        try:
            mode = column.mode()
            return mode.iloc[0] if not mode.empty else None
        except Exception:
            return None

    def unique_count(column: pd.Series) -> int:
        # Number of values that appear exactly once.
        return (column.value_counts() == 1).sum()

    def empty_string_count(column: pd.Series) -> int:
        # Only string columns can hold empty / whitespace-only strings.
        if pd.api.types.is_string_dtype(column):
            return column.str.strip().eq('').sum()
        return 0

    summary = pd.DataFrame({
        'Column Name': df.columns,
        'Data Type': df.dtypes,
        'Non-Null Count': non_null_counts,
        'Null Count': df.isna().sum(),
        'Total Count': [total_rows] * len(df.columns),  # Ensure same length
        'Percentage Non-Null': (non_null_counts / total_rows * 100).round(2),
        'Distinct Count': df.nunique(),
        'Memory Usage (KB)': (df.memory_usage(deep=True, index=False) / 1024).round(1),
        'Most Frequent Value': df.apply(get_most_frequent),
        'Unique Count': df.apply(unique_count),
        'Empty String Count': df.apply(empty_string_count),
    })

    # Internal sanity check: exactly one summary row per input column.
    assert len(summary) == len(df.columns), "Mismatch in DataFrame column lengths."

    return summary

##### Column Details

In [19]:
def numeric_metrics(series: pd.Series) -> pd.Series:
    """
    Compute descriptive statistics for a numeric series.

    Args:
        series (pd.Series): The numeric series to analyze.

    Returns:
        pd.Series: Metrics keyed by name. All statistics except 'Number of
        Outliers', 'Mode', 'Missing Values' and 'Unique Values' are strings
        formatted with thousands separators and two decimals ('NaN' when
        undefined). Outliers are values outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR.
    """
    def fmt(value):
        # Two decimals with thousands separators; missing values become 'NaN'.
        return 'NaN' if pd.isna(value) else f"{value:,.2f}"

    average = series.mean()
    smallest, largest = series.min(), series.max()
    spread = series.std()

    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    quartile_span = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * quartile_span
    high_fence = third_quartile + 1.5 * quartile_span

    # A zero mean makes the coefficient of variation undefined; report infinity.
    variation = float('inf') if average == 0 else spread / average

    modes = series.mode()
    beyond_fences = (series < low_fence) | (series > high_fence)

    # Statistics that are rendered as text, in report order.
    before_outlier_count = [
        # Basic statistics
        ('Mean', average),
        ('Median', series.median()),
        ('Min', smallest),
        ('Max', largest),
        ('Range', largest - smallest),
        ('Sum', series.sum()),
        # Dispersion
        ('Std Dev', spread),
        ('Variance', series.var()),
        ('Absolute Mean Deviation', (series - average).abs().mean()),
        ('Coefficient of Variation (CV)', variation),
        # Distribution
        ('Skewness', series.skew()),
        ('Kurtosis', series.kurtosis()),
        # Quartiles and IQR
        ('1st Quartile (Q1)', first_quartile),
        ('3rd Quartile (Q3)', third_quartile),
        ('Interquartile Range (IQR)', quartile_span),
        # Outlier fences
        ('Lower Bound for Outliers', low_fence),
        ('Upper Bound for Outliers', high_fence),
    ]

    report = {label: fmt(value) for label, value in before_outlier_count}

    # Counts and the mode are reported as raw values.
    report['Number of Outliers'] = beyond_fences.sum()
    report['Mode'] = np.nan if modes.empty else modes.iloc[0]
    report['Missing Values'] = series.isna().sum()
    report['Unique Values'] = series.nunique()

    return pd.Series(report)


import pandas as pd

def datetime_metrics(series: pd.Series) -> pd.Series:
    """
    Compute descriptive statistics for a datetime series.

    Values are first passed through pd.to_datetime with errors='coerce', so
    anything that cannot be parsed is counted as missing.

    Args:
        series (pd.Series): The datetime series to analyze.

    Returns:
        pd.Series: Min, max, mean, median and most common date as 'YYYY-MM-DD'
        strings (None when undefined), the span between min and max in days
        (reported under both 'Date Range (Days)' and 'Total Days'), and the
        counts of missing and unique values.
    """
    def as_day(moment):
        # Render as YYYY-MM-DD; missing values are reported as None.
        return pd.Timestamp(moment).strftime('%Y-%m-%d') if pd.notna(moment) else None

    parsed = pd.to_datetime(series, errors='coerce')

    earliest = parsed.min()
    latest = parsed.max()

    if pd.notna(latest) and pd.notna(earliest):
        span_days = (latest - earliest).days
    else:
        span_days = None

    modes = parsed.mode()
    top_value = None if modes.empty else modes.values[0]

    return pd.Series({
        'Min Date': as_day(earliest),
        'Max Date': as_day(latest),
        'Mean Date': as_day(parsed.mean()),
        'Median Date': as_day(parsed.median()),
        'Date Range (Days)': span_days,
        'Total Days': span_days,  # same value as the date range
        'Missing Values': parsed.isna().sum(),
        'Unique Values': parsed.nunique(),
        'Most Common Date': as_day(top_value),
    })

import pandas as pd

def string_metrics(series: pd.Series) -> pd.Series:
    """
    Compute descriptive statistics for a string series.

    Missing values are counted first and then excluded, so they no longer
    inflate the empty-string count, the mode or the length statistics.
    Non-string objects are converted with str() before measuring.

    Args:
        series (pd.Series): The string series to analyze.

    Returns:
        pd.Series: Most frequent value (NaN if there is none), number of unique
        values, number of missing values, number of empty strings,
        longest / shortest / average string length (NaN when there are no
        non-missing values), and a dict of the ten most frequent
        whitespace-separated words.
    """
    # Count missing values BEFORE any cleaning; afterwards none would be left.
    missing_values = series.isna().sum()

    # Work on the values that are present, as text, so len/split cannot fail
    # on numbers or other objects stored in an object column.
    text = series.dropna().astype(str)

    modes = text.mode()
    most_frequent = modes.iloc[0] if not modes.empty else np.nan
    unique_values = text.nunique()
    empty_strings = (text == '').sum()

    # String length statistics
    lengths = text.str.len()
    longest_string = lengths.max()
    shortest_string = lengths.min()
    average_length = lengths.mean()

    # Word frequency (top 10). An empty string splits into an empty list, which
    # explode() turns into NaN, hence the dropna().
    words = text.str.split().explode().dropna()
    word_freq = words.value_counts().head(10)

    return pd.Series({
        'Most Frequent': most_frequent,
        'Unique Values': unique_values,
        'Missing Values': missing_values,
        'Empty Strings': empty_strings,
        'Longest String Length': longest_string,
        'Shortest String Length': shortest_string,
        'Average String Length': average_length,
        'Top 10 Words': word_freq.to_dict()
    })

import pandas as pd

def boolean_metrics(series: pd.Series) -> pd.Series:
    """
    Count the True, False and missing entries of a boolean series.

    Args:
        series (pd.Series): The boolean series to analyze.

    Returns:
        pd.Series: 'True Count', 'False Count' and 'Missing Values'.
    """
    tallies = {
        'True Count': series.eq(True).sum(),
        'False Count': series.eq(False).sum(),
        'Missing Values': series.isna().sum(),
    }
    return pd.Series(tallies)

import pandas as pd

def data_type_summaries(df: pd.DataFrame) -> dict:
    """
    Compute per-column metrics grouped by data type, print them, and return them.

    Columns are selected with select_dtypes for numeric, 'datetime64[ns]',
    'object' and 'bool' dtypes and profiled with numeric_metrics,
    datetime_metrics, string_metrics and boolean_metrics respectively. Each
    resulting table is shown with IPython's display() when running inside an
    IPython session, and printed as plain text otherwise.

    Args:
        df (pd.DataFrame): The DataFrame for which to generate the summaries.

    Returns:
        dict: Maps str(dtype selector) - "<class 'numpy.number'>",
        'datetime64[ns]', 'object', 'bool' - to a DataFrame with one row per
        column of that type, sorted by 'Column Name'. Types without any column
        are omitted.

    Raises:
        ValueError: If df is None.
    """
    if df is None:
        raise ValueError("DataFrame is not set.")

    # Metrics calculation functions for each data type
    metrics_funcs = {
        np.number: numeric_metrics,
        'datetime64[ns]': datetime_metrics,
        'object': string_metrics,
        'bool': boolean_metrics
    }

    # Calculate and store metrics for each data type
    summaries = {}
    for dtype, metrics_func in metrics_funcs.items():
        type_columns = df.select_dtypes(include=[dtype])
        summary_list = []

        for column in type_columns:
            metrics = metrics_func(df[column])
            metrics.name = column
            # One-row frame: metric names become columns, the column name the index.
            summary_list.append(pd.DataFrame(metrics).T)

        if summary_list:
            summary_df = pd.concat(summary_list)
            summary_df.reset_index(inplace=True)
            summary_df.rename(columns={'index': 'Column Name'}, inplace=True)
            summary_df = summary_df.sort_values(by='Column Name').reset_index(drop=True)
            summaries[str(dtype)] = summary_df

    # get_ipython only exists as a builtin inside IPython; calling it bare in a
    # plain interpreter raises NameError. Importing it works in both cases.
    try:
        from IPython import get_ipython
        from IPython.display import display
    except ImportError:
        rich_display = None
    else:
        rich_display = display if get_ipython() is not None else None

    for dtype_key, summary_df in summaries.items():
        print(f"\nData Type: {dtype_key}")
        if rich_display is not None:
            rich_display(summary_df)
        else:
            print(summary_df.to_string(index=False))  # Print DataFrame as string

    return summaries



#### Dataframe Correlation

In [20]:
def correlation_function(df: pd.DataFrame, dummy_variables: bool = False, dummy_threshold: int = 10, report_threshold: float = 0.3) -> pd.DataFrame:
    """
    Report pairs of numeric columns whose correlation is at least a threshold.

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It is not modified.
    - dummy_variables (bool): If True, object/category columns with at most
      dummy_threshold distinct values are replaced by 0/1 dummy columns
      (first level dropped) so they take part in the correlation.
    - dummy_threshold (int): Maximum number of unique values in a column to create dummy variables.
    - report_threshold (float): Minimum absolute correlation for a pair to be reported.

    Returns:
    - pd.DataFrame: Columns 'Variable1', 'Variable2', 'Correlation'. Each
      unordered pair appears once, sorted by absolute correlation, strongest
      first. Pairs whose correlation is NaN are never reported.

    Raises:
    - ValueError: If no numeric columns are available.
    """
    # Step 1: Convert string/categorical columns to dummy variables if needed
    df_copy = df.copy()

    if dummy_variables:
        non_numeric_cols = df_copy.select_dtypes(include=['object', 'category']).columns
        encode_cols = [col for col in non_numeric_cols if df_copy[col].nunique() <= dummy_threshold]
        if encode_cols:
            # dtype=int: recent pandas creates bool dummies by default, which the
            # numeric filter below would silently discard.
            df_copy = pd.get_dummies(df_copy, columns=encode_cols, drop_first=True, dtype=int)

    # Step 2: Keep numeric columns only
    numeric_df = df_copy.select_dtypes(include=[np.number])

    if numeric_df.empty:
        raise ValueError("No numeric columns found in the DataFrame for correlation analysis.")

    # Step 3: Calculate correlation matrix
    corr_matrix = numeric_df.corr()

    # Step 4: The matrix is symmetric, so the strict upper triangle holds every
    # pair exactly once and excludes self-correlations - no label sorting needed.
    labels = np.asarray(corr_matrix.columns, dtype=object)
    first_idx, second_idx = np.triu_indices(len(labels), k=1)
    corr_pairs = pd.DataFrame({
        'Variable1': labels[first_idx],
        'Variable2': labels[second_idx],
        'Correlation': corr_matrix.to_numpy()[first_idx, second_idx],
    })

    # Step 5: Filter on absolute correlation (NaN never passes) and sort by strength
    strength = corr_pairs['Correlation'].abs()
    strong_enough = strength[strength >= report_threshold]
    ranking = strong_enough.sort_values(ascending=False, kind='stable').index

    return corr_pairs.loc[ranking].reset_index(drop=True)

#### Misc Support Functions

In [21]:
# Load a 2-column csv file as a pandas dataframe and return a dictionary 

def load_csv_to_dict(file_path, key_column, value_column):
    """
    Read a CSV file and map one of its columns onto another.

    Parameters:
    file_path (str): The path to the CSV file.
    key_column (str): Name of the column that supplies the dictionary keys.
    value_column (str): Name of the column that supplies the dictionary values.

    Returns:
    dict: Keys from key_column, values from value_column. Every value is a
    string; missing values become ''.
    """
    def as_text(cell):
        # Missing cells become '', everything else its str() form.
        if pd.isna(cell):
            return ''
        return str(cell)

    table = pd.read_csv(file_path)
    table[value_column] = table[value_column].map(as_text)

    lookup = table.set_index(key_column)[value_column]
    return lookup.to_dict()

def ensure_string(text, string_length_threshold):
    """
    Convert the input to a stripped string, discarding values that are missing or too short.

    Parameters:
    text: The value to convert. Strings are used as-is; anything else goes through str().
    string_length_threshold: Minimum length of the stripped string. Shorter results are replaced by "".

    Returns:
    str: The stripped string, or "" if the input is a missing scalar
    (None, NaN, NaT, pd.NA), the result is shorter than the threshold, or
    str() conversion fails.
    """
    # pd.isnull returns an array for list-like input, and `if <array>` raises
    # ValueError, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(text) and pd.isnull(text):
        return ""

    if isinstance(text, str):
        candidate = text.strip()
    else:
        try:
            candidate = str(text).strip()
        except (ValueError, TypeError):
            return ""

    if len(candidate) < string_length_threshold:
        return ""
    return candidate
    
def clean_whitespace(text):
    """
    Normalize the whitespace of a string.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.

    Parameters:
    text (str): The string to clean.

    Returns:
    str: The cleaned string.
    """
    # split() without arguments drops leading/trailing whitespace and splits on
    # any whitespace run, so no separate strip() is needed.
    return ' '.join(text.split())


def save_dataframe_with_incremented_filename(file_path):
    """
    Return a path that does not exist yet, derived from file_path.

    If file_path is free it is returned unchanged. Otherwise "_1", "_2", ... is
    inserted before the extension until an unused name is found. Nothing is
    written to disk; the caller does the saving.

    Parameters:
    file_path (str): The preferred file path.

    Returns:
    str: file_path itself, or the first free "<base>_<n><extension>" variant.
    """
    stem, suffix = os.path.splitext(file_path)

    candidate = file_path
    attempt = 0
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"

    return candidate

## Read Data

In [None]:
# Path of the file (or folder) to load into a DataFrame. For a folder,
# load_to_dataframe additionally requires file_type (e.g. file_type="csv").
# Alternative sample dataset:
# read_file_path = r'C:\Users\Windows\Downloads\archive (7)\netflix1.csv'
read_file_path = r'C:\Users\Windows\Downloads\happyscore_income.csv'
# The result is kept in `df`, which the summary and correlation cells read.
df = load_to_dataframe(read_file_path)

### Read a Single File

Force a file to load as plain text with `force_plain_text=True`.<br>
When loading an Excel file, use `sheet_name="Sheet1"` to specify which sheet to load.<br>

 - df = load_to_dataframe("path/to/file.csv", file_type="csv")
 - df = load_to_dataframe("path/to/file.txt", file_type="txt", force_plain_text=True)
 - df = load_to_dataframe("path/to/file.xlsx", file_type="xlsx", sheet_name="Sheet1")
 - df = load_to_dataframe("path/to/file.json", file_type="json")
 - df = load_to_dataframe("path/to/file.parquet", file_type="parquet")
 - df = load_to_dataframe("path/to/file.hdf", file_type="hdf")
 - df = load_to_dataframe("path/to/file.feather", file_type="feather")

### Read Multiple Files
 - df = load_to_dataframe("path/to/folder", file_type="csv")
 - df = load_to_dataframe("path/to/folder", file_type="log", force_plain_text=True)

## Summarize Data

### Dataframe Summary

In [None]:
# Build the whole-DataFrame overview (one row per metric) and print it.
summary_df = overall_summary(df)
print(summary_df)

### Column Summaries

In [None]:
# Build the per-column summary (one row per column of df) and print it.
# Note: this rebinds summary_df, replacing the overall summary from the previous cell.
summary_df = data_types_summary(df)
print(summary_df)

### Column Details

In [None]:
# Compute the per-dtype metric tables. data_type_summaries prints/displays each
# table itself, so no explicit print is needed; the returned dict is kept in
# `summaries` for later use.
summaries = data_type_summaries(df)

# Correlation

In [None]:
# Correlate all numeric columns, dummy-encoding object/category columns that
# have at most 100 distinct values. report_threshold=0 keeps every pair whose
# correlation is not NaN.
result = correlation_function(df, dummy_variables=True, dummy_threshold=100, report_threshold=0)
print(result)