# Assignment 1:
### The `automated_stat_analyzer` Function
- Scenario: A retail company needs a utility to quickly summarize sales data. Students must create a function that identifies the 
"Central Tendency" and "Dispersion" of any numerical column.
### Requirements:

* Accept a Pandas DataFrame and a column name.

* Calculate the Mean, Median, and Standard Deviation.

* Identify if the data is "Skewed" by comparing the Mean and Median.


* Bonus: If the column is categorical, return the Mode instead.

### Your Data

In [None]:
import pandas as pd
import numpy as np

def automated_stat_analyzer(df, column_name):
    """
    Summarize a single column of a DataFrame.

    - Numeric columns: Mean, Median, Standard Deviation (pandas default,
      ddof=1) and a skew label obtained by comparing mean and median.
    - Object, string and categorical columns: the Mode(s).

    Parameters:
    - df: pandas DataFrame holding the data.
    - column_name: label of the column to analyze.

    Returns:
    - dict with keys "Mean", "Median", "Standard Deviation", "Skewness"
      for numeric columns;
    - dict with key "Mode" (a list; several entries when values tie)
      for object/string/categorical columns;
    - an explanatory string when the column does not exist or its dtype
      is not supported.
    """
    # Check if the column exists
    if column_name not in df.columns:
        return f"Column '{column_name}' does not exist in the DataFrame."

    col = df[column_name]

    # If the column is numeric
    if pd.api.types.is_numeric_dtype(col):
        # Missing values are ignored for every statistic
        col_clean = col.dropna()
        mean_val = col_clean.mean()
        median_val = col_clean.median()
        std_val = col_clean.std()

        if col_clean.empty:
            # With no data the mean and median are NaN; every comparison
            # against NaN is False, so do not report a skew direction.
            skewness = "Undefined (no non-null values)"
        elif np.isclose(mean_val, median_val):
            # Tolerant comparison: floating-point rounding in the mean must
            # not turn symmetric data into "skewed" data.
            skewness = "Approximately Symmetric"
        elif mean_val > median_val:
            skewness = "Positively Skewed"
        else:
            skewness = "Negatively Skewed"

        return {
            "Mean": mean_val,
            "Median": median_val,
            "Standard Deviation": std_val,
            "Skewness": skewness
        }

    # If the column is categorical.
    # isinstance(..., CategoricalDtype) replaces the deprecated
    # is_categorical_dtype; is_string_dtype also admits pandas' dedicated
    # string dtype, which is_object_dtype alone rejects.
    is_categorical = isinstance(col.dtype, pd.CategoricalDtype)
    if (
        is_categorical
        or pd.api.types.is_object_dtype(col)
        or pd.api.types.is_string_dtype(col)
    ):
        # mode() can return multiple values if there is a tie
        return {"Mode": col.mode().tolist()}

    return f"Column '{column_name}' has unsupported data type for analysis."

# Demo: build a small retail dataset and run the analyzer over it
data = {
    'Transaction_ID': range(1, 11),
    'Product_Category': [
        'Electronics', 'Home', 'Electronics', 'Sports', 'Home',
        'Electronics', 'Home', 'Sports', 'Electronics', 'Electronics',
    ],
    # The 1000 entry is a deliberate outlier
    'Sales_Amount': [150, 200, 155, 300, 210, 180, 205, 1000, 190, 160],
    # Two entries are missing (NaN)
    'Customer_Age': [25, 34, np.nan, 45, 23, 31, 29, np.nan, 38, 40],
    'Rating': [5, 4, 3, 5, 2, 4, 5, 2, 4, 3],
}

df_test = pd.DataFrame(data)

# In order: a numeric column, a numeric column with NaN, a categorical column
for demo_column in ("Sales_Amount", "Customer_Age", "Product_Category"):
    print(automated_stat_analyzer(df_test, demo_column))


Test dataset created successfully!


In [2]:
# Preview the first rows of the test DataFrame
df_test.head()

Unnamed: 0,Transaction_ID,Product_Category,Sales_Amount,Customer_Age,Rating
0,1,Electronics,150,25.0,5
1,2,Home,200,34.0,4
2,3,Electronics,155,,3
3,4,Sports,300,45.0,5
4,5,Home,210,23.0,2


In [None]:
import pandas as pd
import numpy as np

def automated_stat_analyzer(df, column_name):
    """
    Company Task: Provide a summary report of a specific data variable.

    Behaviour:
    1. Check whether the column is numerical or categorical.
    2. For numerical: calculate Mean, Median and Standard Deviation
       (pandas default, ddof=1) after dropping NaNs, and label the skew by
       comparing mean and median.
    3. For categorical (object, string or category dtype): calculate the Mode.

    Parameters:
    - df: pandas DataFrame holding the data.
    - column_name: label of the column to summarize.

    Returns:
    - dict with "Mean", "Median", "Standard Deviation", "Skewness" for a
      numerical column;
    - dict with "Mode" (a list, because ties produce several modes) for a
      categorical column;
    - a message string if the column is missing or its dtype is unsupported.
    """
    if column_name not in df.columns:
        return f"Column '{column_name}' does not exist in the DataFrame."

    col = df[column_name]
    dtype_checks = pd.api.types

    # Numerical column
    if dtype_checks.is_numeric_dtype(col):
        col_clean = col.dropna()  # Ignore NaNs
        mean_val = col_clean.mean()
        median_val = col_clean.median()
        std_val = col_clean.std()

        # Determine skewness
        if col_clean.empty:
            # Mean and median are NaN here, so no direction can be reported.
            skewness = "Undefined (no non-null values)"
        elif np.isclose(mean_val, median_val):
            # Tolerance absorbs floating-point rounding in the mean.
            skewness = "Approximately Symmetric"
        elif mean_val > median_val:
            skewness = "Positively Skewed"
        else:
            skewness = "Negatively Skewed"

        return {
            "Mean": mean_val,
            "Median": median_val,
            "Standard Deviation": std_val,
            "Skewness": skewness
        }

    # Categorical column: the isinstance check replaces the deprecated
    # is_categorical_dtype, and is_string_dtype covers pandas' string dtype.
    if (
        isinstance(col.dtype, pd.CategoricalDtype)
        or dtype_checks.is_object_dtype(col)
        or dtype_checks.is_string_dtype(col)
    ):
        mode_val = col.mode().tolist()  # mode() can return multiple values
        return {"Mode": mode_val}

    return f"Column '{column_name}' has unsupported data type for analysis."


# ---------------------------
# Example Test
# ---------------------------
data = {
    'Transaction_ID': range(1, 11),
    'Product_Category': [
        'Electronics', 'Home', 'Electronics', 'Sports', 'Home',
        'Electronics', 'Home', 'Sports', 'Electronics', 'Electronics',
    ],
    'Sales_Amount': [150, 200, 155, 300, 210, 180, 205, 1000, 190, 160],
    'Customer_Age': [25, 34, np.nan, 45, 23, 31, 29, np.nan, 38, 40],
    'Rating': [5, 4, 3, 5, 2, 4, 5, 2, 4, 3],
}

df_test = pd.DataFrame(data)

# Run the analyzer on a numeric column, a numeric column with NaNs,
# and a categorical column, printing each report in that order.
for demo_column in ("Sales_Amount", "Customer_Age", "Product_Category"):
    print(automated_stat_analyzer(df_test, demo_column))


## Assignment 2: 
  ### The null_handling_strategy Function


#### Scenario: Incoming user data often has missing values. Students must implement a flexible strategy to handle these "Null Values" to prepare data for Machine Learning.
### Requirements:

* Check for null values in the DataFrame.

* Apply a strategy based on parameters: "drop_rows", "fill_mean", or "fill_median".

* Ensure the function only fills numerical columns when using mean or median.

In [None]:
import pandas as pd
import numpy as np

def null_handling_strategy(df, strategy="fill_mean"):
    """
    Company Task: Clean a dataset by resolving missing (NaN) values.

    Parameters:
    - df: pandas DataFrame (never modified; a copy is cleaned and returned)
    - strategy: one of "drop_rows", "fill_mean", "fill_median"
        * "drop_rows"   - drop every row containing at least one NaN
        * "fill_mean"   - fill NaNs in numerical columns with the column mean
        * "fill_median" - fill NaNs in numerical columns with the column median
      Non-numerical columns are left untouched by the fill strategies.

    Returns:
    - A DataFrame with NaNs handled according to the chosen strategy.

    Raises:
    - ValueError: if strategy is not one of the three supported names.
    """
    # Validate up front so a misspelled strategy is reported even when the
    # data happens to contain no nulls.
    if strategy not in ("drop_rows", "fill_mean", "fill_median"):
        raise ValueError("Strategy must be 'drop_rows', 'fill_mean', or 'fill_median'")

    # Make a copy to avoid modifying original DataFrame
    df_clean = df.copy()

    # Nothing to do when the frame has no nulls at all
    if not df_clean.isnull().values.any():
        print("No missing values found.")
        return df_clean

    # Strategy 1: Drop rows with any NaNs
    if strategy == "drop_rows":
        return df_clean.dropna()

    # Strategies 2 and 3: fill numerical columns with their mean / median
    for col in df_clean.select_dtypes(include=np.number).columns:
        series = df_clean[col]
        if not series.isna().any():
            continue
        fill_val = series.mean() if strategy == "fill_mean" else series.median()
        # Assign the result back instead of fillna(inplace=True) on the
        # selected column: under pandas Copy-on-Write that chained in-place
        # call updates a temporary object and leaves df_clean unchanged.
        df_clean[col] = series.fillna(fill_val)

    return df_clean


# -------------------------
# Example Test
# -------------------------
data = {
    'Transaction_ID': [1, 2, 3, 4, 5],
    'Product_Category': ['Electronics', 'Home', 'Electronics', 'Sports', 'Home'],
    'Sales_Amount': [150, 200, 155, 300, 210],
    'Customer_Age': [25, 34, np.nan, 45, 23],
    'Rating': [5, 4, 3, 5, 2],
}

df_test = pd.DataFrame(data)

# Print a heading followed by the cleaned frame for each strategy in turn
for heading, chosen_strategy in (
    ("Drop Rows Strategy:", "drop_rows"),
    ("\nFill Mean Strategy:", "fill_mean"),
    ("\nFill Median Strategy:", "fill_median"),
):
    print(heading)
    print(null_handling_strategy(df_test, chosen_strategy))
