In [None]:
# Load the sales data and inspect it to identify the numerical columns.
import pandas as pd

# Read the CSV (path is relative to the notebook's working directory).
csv_path = 'sales_data_with_discounts.csv'
data = pd.read_csv(csv_path)

# Cell output: a (first rows, per-column dtypes) pair used to spot numeric columns.
(data.head(), data.dtypes)

In [None]:
# Mean, median, mode and standard deviation of the numerical columns.
numerical_columns = [
    'Volume', 'Avg Price', 'Total Sales Value',
    'Discount Rate (%)', 'Discount Amount', 'Net Sales Value',
]

# Restrict the frame once; every statistic below is computed per column.
numeric_frame = data[numerical_columns]

mean_values = numeric_frame.mean()
median_values = numeric_frame.median()
# mode() yields one row per tied mode; only the first row is kept.
mode_values = numeric_frame.mode().iloc[0]
std_values = numeric_frame.std()

statistics = {
    'Mean': mean_values,
    'Median': median_values,
    'Mode': mode_values,
    'Standard Deviation': std_values,
}

# One row per numerical column, one column per statistic.
stats_df = pd.DataFrame(statistics)
stats_df

In [None]:
# **** Provide a brief interpretation of these statistics.
# ---> Mean: The average value, showing the overall trend or typical value of the data.
# ---> Median: The middle value when sorted, indicating skewness if it differs from the mean.
# ---> Mode: The most frequent value, highlighting common trends in the data.
# ---> Standard Deviation: The measure of spread, showing how much values vary around the mean.

In [None]:
# Plot histograms for each numerical column.
import matplotlib.pyplot as plt

# Columns to plot (repeated here so the cell does not depend on the statistics cell).
numerical_columns = ['Volume', 'Avg Price', 'Total Sales Value',
                     'Discount Rate (%)', 'Discount Amount', 'Net Sales Value']

# One vertically stacked panel per column.
fig, axes = plt.subplots(len(numerical_columns), 1, figsize=(10, 20))

# Draw each histogram on its own axes.
for ax, col in zip(axes, numerical_columns):
    ax.hist(data[col], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Compute the layout only after titles and labels exist, so the spacing
# between panels is based on the decorated axes rather than empty ones.
fig.tight_layout(pad=5.0)

plt.show()


In [None]:
# Analyze the distribution (e.g., skewness, presence of outliers) and provide inferences
import seaborn as sns

# Skewness of each numerical column. `numerical_columns` is a plain list of
# column names (it has no `.columns` attribute), so it is used directly as
# the column selector.
skewness = data[numerical_columns].skew()

# One boxplot per column to check for outliers.
for column in numerical_columns:
    # Pass the values explicitly as `x=` so the call does not depend on what
    # seaborn's first positional parameter means in the installed version.
    sns.boxplot(x=data[column])
    plt.title(f'Boxplot of {column}')
    plt.show()

# Print skewness values for interpretation
print(skewness)


In [None]:
# Boxplots of the numerical variables, for spotting outliers and the interquartile range.
import seaborn as sns
import matplotlib.pyplot as plt

# Numerical columns to draw, one panel each.
numerical_columns = [
    'Volume', 'Avg Price', 'Total Sales Value',
    'Discount Rate (%)', 'Discount Amount', 'Net Sales Value',
]

# 2 x 3 grid of axes, filled row by row in the order of `numerical_columns`.
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
for ax, col in zip(axes.flat, numerical_columns):
    sns.boxplot(data=data, x=col, color='lightgreen', fliersize=6, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

fig.tight_layout()
plt.show()


In [None]:
# Identify categorical columns in the dataset.
import pandas as pd

# Columns with object or category dtype are treated as categorical.
categorical_frame = data.select_dtypes(include=['object', 'category'])
categorical_columns = list(categorical_frame.columns)
categorical_columns

In [None]:
# Bar charts showing how often each category occurs, one figure per categorical column.
import matplotlib.pyplot as plt

for col in categorical_columns:
    # Occurrences of each distinct value in this column.
    counts = data[col].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)

    ax.set_title(f'Frequency of Categories in {col}', fontsize=16)
    ax.set_xlabel(f'{col}', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)

    # Tilt the category labels so longer names stay readable.
    plt.setp(ax.get_xticklabels(), rotation=45, fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()

In [None]:
# Analyze the distribution of categories and provide insights.
# For every categorical column, record how many distinct categories it has
# and how dominant its most frequent category is.
category_distribution = {}

for col in categorical_columns:
    value_counts = data[col].value_counts()
    total = value_counts.sum()
    top_count = value_counts.max()
    category_distribution[col] = {
        'unique_categories': len(value_counts),
        'top_category': value_counts.idxmax(),
        'top_category_count': top_count,
        'top_category_percentage': (top_count / total) * 100,
    }

# orient='index' turns each categorical column into a row while letting every
# summary field keep its own dtype; building the frame column-wise and then
# transposing would leave all fields as object dtype.
category_distribution_df = pd.DataFrame.from_dict(category_distribution, orient='index')
category_distribution_df

In [None]:
# **** Explain the concept of standardization (z-score normalization).

# Standardization, also called Z-score normalization, transforms a dataset to have a mean (μ) of 0 and a standard deviation (σ) of 1. 
# This ensures that each feature contributes equally to the analysis, which is particularly useful for distance-based models like k-means or SVMs.

# The formula for standardization is:
# z = (x − μ) / σ
# Where:

# x is the original value,
# μ is the mean of the column,
# σ is the standard deviation of the column,
# z is the standardized value.

In [None]:
# Standardize the numerical columns using the formula: z = (x - mu) / sigma
# Show before and after comparisons of the data distributions.
import seaborn as sns

# Work on a copy so the raw values in `data` stay available for the comparison.
standardized_data = data.copy()

# z-score each numerical column using its own mean and standard deviation.
for col in numerical_columns:
    col_mean = data[col].mean()
    col_std = data[col].std()
    standardized_data[col] = (data[col] - col_mean) / col_std

# Compare distributions before and after standardization.
# The two versions are drawn on separate axes: the raw values and the z-scores
# can sit on very different scales, and a shared x-axis would squash one of
# the two histograms into an unreadable sliver.
for col in numerical_columns:
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

    # Original distribution
    sns.histplot(data[col], kde=True, color='blue', stat='density',
                 bins=30, alpha=0.6, ax=ax_before)
    ax_before.set_title('Original', fontsize=14)
    ax_before.set_xlabel(col, fontsize=14)

    # Standardized distribution
    sns.histplot(standardized_data[col], kde=True, color='orange', stat='density',
                 bins=30, alpha=0.6, ax=ax_after)
    ax_after.set_title('Standardized', fontsize=14)
    ax_after.set_xlabel(f'{col} (z-score)', fontsize=14)

    # Shared cosmetics for both panels.
    for ax in (ax_before, ax_after):
        ax.set_ylabel('Density', fontsize=14)
        ax.grid(axis='y', linestyle='--', alpha=0.7)

    fig.suptitle(f"Distribution Before and After Standardization: {col}", fontsize=16)
    fig.tight_layout()
    plt.show()

In [None]:
****Discuss the need for converting categorical data into dummy variables (one-hot encoding).

Need for Converting Categorical Data into Dummy Variables (One-Hot Encoding)
Categorical data represents labels or categories that can’t be used directly by most machine learning algorithms. These algorithms expect numerical inputs, and if the categories are simply replaced with integer codes, a model can misinterpret those codes as ordinal (implying an order or magnitude that does not exist). Converting categorical data into dummy variables avoids this problem.

What is One-Hot Encoding?
One-hot encoding transforms categorical variables into binary vectors (0s and 1s). Each category is represented as a separate column, with a value of 1 indicating the presence of that category in a particular row and 0 otherwise.

For example:

| City     | One-Hot Encoded Columns |
|----------|-------------------------|
| New York | [1, 0, 0]               |
| Paris    | [0, 1, 0]               |
| Tokyo    | [0, 0, 1]               |

In [None]:
# Apply one-hot encoding to the categorical columns, creating binary (0 or 1) columns for each category.
# Display a portion of the transformed dataset.
# NOTE(review): this hard-coded list replaces any previously detected
# `categorical_columns`; confirm the names match the CSV header.
categorical_columns = ['Date', 'Day', 'SKU', 'City', 'BU', 'Brand', 'Model']

# dtype=int makes the indicator columns hold 0/1 as stated above; without it,
# recent pandas versions produce True/False columns.
# drop_first=True omits the indicator for the first category of each column
# (that category is implied when all of the column's other indicators are 0).
one_hot_encoded_data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

# Bare last expression so the notebook renders the frame as a table.
one_hot_encoded_data.head()