# Data Loading and Setup
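Install the required packages, import the essential libraries (pandas, NumPy, matplotlib), and set the basic display, plotting, and random-seed configuration used by the rest of the notebook.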

In [6]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases.
%pip install pandas matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Downloading matplotlib-3.10.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.6/8.6 MB 79.9 MB/s eta 0:00:00
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Downloading contourpy-

In [7]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set up basic configurations
# Raise pandas' display limits so frames of up to 100 rows / 50 columns are
# shown in full instead of being truncated
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

# Render matplotlib figures inline, directly beneath the cell that draws them
%matplotlib inline

# Seed NumPy's legacy global random generator so draws made through
# np.random.* are repeatable from run to run
np.random.seed(42)

# Data Cleaning Operations
Define functions for common data cleaning tasks like handling missing values and data type conversions.

In [8]:
# Define a function to handle missing values
def handle_missing_values(df, strategy='mean', fill_value=None):
    """
    Handles missing values in a DataFrame.

    A new DataFrame is returned; the input is not modified.

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        strategy (str): The strategy to handle missing values. Options are 'mean', 'median', 'mode', or 'constant'.
            'mean' and 'median' fill numeric columns only; non-numeric columns are left untouched.
            'mode' fills every column with its most frequent value (the first one when tied).
        fill_value: The value to use if strategy is 'constant'.

    Returns:
        pd.DataFrame: The DataFrame with missing values handled.

    Raises:
        ValueError: If strategy is not recognised, or if strategy is 'constant' and fill_value is None.
    """
    if strategy == 'mean':
        # numeric_only=True keeps text/object columns out of the reduction;
        # without it pandas 2.x raises TypeError on mixed-type frames.
        return df.fillna(df.mean(numeric_only=True))
    elif strategy == 'median':
        return df.fillna(df.median(numeric_only=True))
    elif strategy == 'mode':
        modes = df.mode()
        if modes.empty:
            # No mode exists (no rows, no columns, or only missing values), so
            # there is nothing to fill with; indexing row 0 would raise IndexError.
            return df.copy()
        return df.fillna(modes.iloc[0])
    elif strategy == 'constant':
        if fill_value is None:
            raise ValueError("fill_value must be provided when strategy is 'constant'")
        return df.fillna(fill_value)
    else:
        raise ValueError("Invalid strategy. Choose from 'mean', 'median', 'mode', or 'constant'.")

# Define a function to convert data types
def convert_data_types(df, column_types):
    """
    Converts the data types of specified columns in a DataFrame.

    The conversion is applied to a new DataFrame; the input is left unchanged,
    so a failed conversion can no longer leave the caller's frame half-converted.

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        column_types (dict): A dictionary where keys are column names and values are target data types.

    Returns:
        pd.DataFrame: A new DataFrame with updated data types.

    Raises:
        KeyError: If a key of column_types is not a column of df.
        ValueError / TypeError: If a column's values cannot be cast to the requested type.
    """
    # DataFrame.astype with a column -> dtype mapping casts all listed columns
    # in one step and returns a new frame rather than mutating `df`.
    return df.astype(dict(column_types))

# Example usage (commented out for now)
# df = pd.DataFrame({
#     'A': [1, 2, np.nan, 4],
#     'B': ['1', '2', '3', '4']
# })
# df = handle_missing_values(df, strategy='mean')
# df = convert_data_types(df, {'B': 'int'})

# Data Analysis Functions
Create utility functions for basic statistical analysis and data manipulation.

In [9]:
# Define a function to calculate basic statistics
def calculate_statistics(df, columns=None):
    """
    Summarises columns of a DataFrame with their mean, median and standard deviation.

    Parameters:
        df (pd.DataFrame): The DataFrame to analyze.
        columns (list): Names of the columns to summarise. When None, every numeric column is used.

    Returns:
        pd.DataFrame: One row per column, with the columns 'mean', 'median' and 'std_dev'.
    """
    # Fall back to the numeric columns when the caller does not pick any
    selected = df.select_dtypes(include=[np.number]).columns if columns is None else columns
    subset = df[selected]

    return pd.DataFrame({
        'mean': subset.mean(),
        'median': subset.median(),
        'std_dev': subset.std(),
    })

# Define a function to normalize data
def normalize_data(df, columns=None):
    """
    Normalizes specified columns in a DataFrame to have values between 0 and 1 (min-max scaling).

    The input DataFrame is not modified; a copy with the scaled columns is returned.

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        columns (list): List of column names to normalize. If None, all numeric columns are normalized.

    Returns:
        pd.DataFrame: The DataFrame with normalized columns. A column whose values are all
        equal has no range to scale by and is mapped to 0.0; missing values stay missing.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    df_normalized = df.copy()
    for column in columns:
        values = df[column]
        min_val = values.min()
        value_range = values.max() - min_val
        if pd.notna(value_range) and value_range == 0:
            # Constant column: dividing by the zero range would turn every value
            # into NaN (0/0), so map the column to 0.0 instead.
            df_normalized[column] = (values - min_val).astype(float)
        else:
            df_normalized[column] = (values - min_val) / value_range
    return df_normalized

# Define a function to filter data based on conditions
def filter_data(df, conditions):
    """
    Keeps only the rows of a DataFrame that satisfy every given condition.

    Conditions are applied one after another, in the mapping's iteration order;
    each one is evaluated on the rows that survived the previous conditions.

    Parameters:
        df (pd.DataFrame): The DataFrame to filter.
        conditions (dict): Maps a column name to a callable that receives that column
            and returns a boolean mask (e.g., a lambda function).

    Returns:
        pd.DataFrame: The filtered DataFrame (a copy; the input is not modified).
    """
    remaining = df.copy()
    for name, predicate in conditions.items():
        keep = predicate(remaining[name])
        remaining = remaining[keep]
    return remaining

# Example usage (commented out for now)
# df = pd.DataFrame({
#     'A': [1, 2, 3, 4, 5],
#     'B': [10, 20, 30, 40, 50]
# })
# stats = calculate_statistics(df)
# df_normalized = normalize_data(df)
# df_filtered = filter_data(df, {'A': lambda x: x > 2})

# Visualization Examples
Demonstrate basic plotting with matplotlib and create sample visualizations.

In [None]:
# Visualization Examples
# Each figure below is drawn through an explicit Figure/Axes pair.

# Sample categorical data for the bar chart
data = {
    'Category': ['A', 'B', 'C', 'D'],
    'Values': [23, 45, 56, 78]
}
df = pd.DataFrame(data)

# Bar plot: one bar per category
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(df['Category'], df['Values'], color='skyblue')
ax.set_title('Bar Plot Example')
ax.set_xlabel('Category')
ax.set_ylabel('Values')
plt.show()

# Line plot: sine curve sampled at 100 evenly spaced points on [0, 10]
x = np.linspace(0, 10, 100)
y = np.sin(x)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x, y, label='Sine Wave', color='green')
ax.set_title('Line Plot Example')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.legend()
plt.show()

# Scatter plot: 50 uniform random points with random marker sizes.
# Re-seeding here makes the points identical every time the cell runs.
np.random.seed(42)
x = np.random.rand(50)
y = np.random.rand(50)
sizes = np.random.rand(50) * 100

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(x, y, s=sizes, alpha=0.5, color='purple')
ax.set_title('Scatter Plot Example')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
plt.show()

# Histogram: 1000 standard-normal draws in 30 bins
data = np.random.randn(1000)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(data, bins=30, color='orange', edgecolor='black')
ax.set_title('Histogram Example')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()