# In [None]:
# Data Profiling Techniques Examples

# 1. Descriptive Statistics:
# Task 1: Calculate the mean, median, and mode for sales figures in a retail dataset.
# Task 2: Analyze the average age, median, and mode in a customer demographic
# dataset.
# Task 3: Determine the mean, median, and mode of daily website visit counts.
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Sample retail sales data (in dollars)
sales_data = [
    120, 145, 135, 210, 195, 165, 170, 155, 190, 230,
    145, 165, 175, 185, 205, 225, 140, 160, 150, 180,
    190, 200, 185, 170, 190, 210, 235, 180, 175, 165
]

# Create a DataFrame for better visualization
df = pd.DataFrame({'Sales': sales_data})

# 1. Basic Descriptive Statistics
print("===== BASIC DESCRIPTIVE STATISTICS =====")
print(f"Count: {len(sales_data)}")
print(f"Min: ${min(sales_data)}")
print(f"Max: ${max(sales_data)}")
print(f"Range: ${max(sales_data) - min(sales_data)}")
print(f"Sum: ${sum(sales_data)}")

# 2. Mean, Median, Mode Calculation
# Method 1: Using basic Python
mean_sales = sum(sales_data) / len(sales_data)
sorted_sales = sorted(sales_data)
mid = len(sorted_sales) // 2
# If even number of elements, average the two middle values
if len(sorted_sales) % 2 == 0:
    median_sales = (sorted_sales[mid-1] + sorted_sales[mid]) / 2
else:
    median_sales = sorted_sales[mid]

# Finding mode using a frequency counter
from collections import Counter
sales_counter = Counter(sales_data)
mode_count = max(sales_counter.values())
# Several values can share the highest frequency. Report the smallest one so
# the manual result always agrees with scipy.stats.mode and Series.mode()[0]
# instead of depending on the order in which values appear in the list.
mode_sales = min(value for value, freq in sales_counter.items() if freq == mode_count)

print("\n===== MANUAL CALCULATION =====")
print(f"Mean: ${mean_sales:.2f}")
print(f"Median: ${median_sales:.2f}")
print(f"Mode: ${mode_sales} (appears {mode_count} times)")

# Method 2: Using NumPy and SciPy
np_mean = np.mean(sales_data)
np_median = np.median(sales_data)
mode_result = stats.mode(sales_data)
# Depending on the SciPy release, mode()/count are either scalars or
# length-1 arrays for 1-D input. The attributes exist in both cases, so a
# try/except around the attribute access can never detect the array form;
# np.atleast_1d(...)[0] yields a scalar either way.
np_mode = np.atleast_1d(mode_result.mode)[0]
np_mode_count = np.atleast_1d(mode_result.count)[0]

print("\n===== NUMPY/SCIPY CALCULATION =====")
print(f"Mean: ${np_mean:.2f}")
print(f"Median: ${np_median:.2f}")
print(f"Mode: ${np_mode} (appears {np_mode_count} times)")

# Method 3: Using Pandas
print("\n===== PANDAS CALCULATION =====")
print(df['Sales'].describe())
print(f"Mode: ${df['Sales'].mode()[0]}")

# 3. Visual Analysis: one 2x2 figure (histogram, box plot, KDE, QQ plot)

# Reference lines drawn on both the histogram and the density panel:
# (x position, line colour, legend text)
_sales_markers = [
    (np_mean, 'r', f'Mean: ${np_mean:.2f}'),
    (np_median, 'g', f'Median: ${np_median:.2f}'),
    (np_mode, 'b', f'Mode: ${np_mode}'),
]

plt.figure(figsize=(12, 8))

# Top-left panel: histogram of the raw values with mean/median/mode lines
plt.subplot(2, 2, 1)
plt.hist(sales_data, bins=10, edgecolor='black')
for _x, _colour, _text in _sales_markers:
    plt.axvline(_x, color=_colour, linestyle='dashed', linewidth=1, label=_text)
plt.title('Sales Distribution')
plt.xlabel('Sales Amount ($)')
plt.ylabel('Frequency')
plt.legend()

# Top-right panel: box plot over a dashed background grid
plt.subplot(2, 2, 2)
plt.boxplot(sales_data, patch_artist=True)
plt.title('Sales Box Plot')
plt.ylabel('Sales Amount ($)')
plt.grid(True, linestyle='--', alpha=0.7)

# Bottom-left panel: kernel density estimate with the same reference lines
plt.subplot(2, 2, 3)
sns.kdeplot(sales_data, fill=True)
for _x, _colour, _text in _sales_markers:
    plt.axvline(_x, color=_colour, linestyle='dashed', linewidth=1, label=_text)
plt.title('Sales Density Plot')
plt.xlabel('Sales Amount ($)')
plt.legend()

# Bottom-right panel: probability (QQ) plot as a visual normality check
plt.subplot(2, 2, 4)
stats.probplot(sales_data, plot=plt)
plt.title('QQ Plot')

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig('sales_analysis.png')
plt.show()

# Quartiles are computed once up front and reused for the percentile table,
# the IQR and the outlier fences.
Q1, Q3 = np.percentile(sales_data, [25, 75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# 4. Additional Insights: sample spread (ddof=1) and distribution shape
print("\n===== ADDITIONAL STATISTICS =====")
print(f"Standard Deviation: ${np.std(sales_data, ddof=1):.2f}")
print(f"Variance: ${np.var(sales_data, ddof=1):.2f}")
print(f"Skewness: {stats.skew(sales_data):.4f}")
print(f"Kurtosis: {stats.kurtosis(sales_data):.4f}")

# 5. Percentiles
print("\n===== PERCENTILES =====")
print(f"25th Percentile (Q1): ${Q1:.2f}")
print(f"50th Percentile (Median): ${np.percentile(sales_data, 50):.2f}")
print(f"75th Percentile (Q3): ${Q3:.2f}")
print(f"IQR (Interquartile Range): ${IQR:.2f}")

# 6. Outliers by the IQR rule: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outliers = [amount for amount in sales_data
            if not lower_bound <= amount <= upper_bound]

print("\n===== OUTLIER DETECTION =====")
print(f"Lower Bound: ${lower_bound:.2f}")
print(f"Upper Bound: ${upper_bound:.2f}")
print(f"Outliers: {outliers}" if outliers else "No outliers detected")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

# Sample customer demographic data (ages)
customer_ages = [
    22, 35, 28, 42, 55, 38, 41, 32, 45, 29,
    31, 48, 52, 33, 27, 39, 44, 36, 29, 51,
    43, 38, 35, 47, 56, 34, 29, 41, 33, 38,
    42, 50, 31, 29, 45, 37, 39, 32, 44, 53,
    28, 36, 41, 47, 33, 38, 30, 49, 35, 42
]

# Create DataFrame for better analysis
customers_df = pd.DataFrame({'Age': customer_ages})

# 1. Basic Information
print("===== CUSTOMER AGE DATA SUMMARY =====")
print(f"Number of customers: {len(customer_ages)}")
print(f"Youngest customer: {min(customer_ages)} years old")
print(f"Oldest customer: {max(customer_ages)} years old")
print(f"Age range: {max(customer_ages) - min(customer_ages)} years")

# 2. Mean, Median, Mode Calculation using different methods

# Method 1: Manual calculation
mean_age = sum(customer_ages) / len(customer_ages)
sorted_ages = sorted(customer_ages)
n = len(sorted_ages)
# Even number of values: average the two middle ones
if n % 2 == 0:
    median_age = (sorted_ages[n//2 - 1] + sorted_ages[n//2]) / 2
else:
    median_age = sorted_ages[n//2]

# Finding mode manually
age_counter = Counter(customer_ages)
mode_count = max(age_counter.values())
# This data is multimodal (29 and 38 both occur four times). Picking the
# smallest of the tied values matches scipy.stats.mode and Series.mode()[0];
# Counter.most_common(1) would instead return whichever tied value appears
# first in the list, so the three methods would disagree.
mode_age = min(age for age, freq in age_counter.items() if freq == mode_count)

print("\n===== MANUAL CALCULATION =====")
print(f"Mean age: {mean_age:.2f} years")
print(f"Median age: {median_age:.2f} years")
print(f"Mode age: {mode_age} years (appears {mode_count} times)")

# Method 2: Using NumPy and SciPy
np_mean = np.mean(customer_ages)
np_median = np.median(customer_ages)
mode_result = stats.mode(customer_ages)
# Depending on the SciPy release, mode/count are scalars or length-1 arrays
# for 1-D input. Both forms expose the attributes, so attribute access never
# raises; np.atleast_1d(...)[0] normalises either form to a scalar.
np_mode = np.atleast_1d(mode_result.mode)[0]
np_mode_count = np.atleast_1d(mode_result.count)[0]

print("\n===== NUMPY/SCIPY CALCULATION =====")
print(f"Mean age: {np_mean:.2f} years")
print(f"Median age: {np_median:.2f} years")
print(f"Mode age: {np_mode} years (appears {np_mode_count} times)")

# Method 3: Using Pandas
print("\n===== PANDAS CALCULATION =====")
print(customers_df['Age'].describe())
print(f"Mode: {customers_df['Age'].mode()[0]} years")

# 3. Age distribution analysis by grouping
# Group customers into age brackets
age_brackets = {
    '18-25': 0,
    '26-35': 0,
    '36-45': 0,
    '46-55': 0,
    '56+': 0
}

# Ages below 18 are tallied separately instead of falling through to the
# final branch, where they would be reported as '56+'. Brackets are matched
# by their upper edge so a fractional age (e.g. 25.5) lands in the next
# bracket up rather than in '56+'.
_under_18 = 0
for age in customer_ages:
    if age < 18:
        _under_18 += 1
    elif age <= 25:
        age_brackets['18-25'] += 1
    elif age <= 35:
        age_brackets['26-35'] += 1
    elif age <= 45:
        age_brackets['36-45'] += 1
    elif age <= 55:
        age_brackets['46-55'] += 1
    else:
        age_brackets['56+'] += 1

# Only add the extra bracket when it is needed, so the report keeps its
# five rows for data in which every customer is an adult.
if _under_18:
    age_brackets = {'Under 18': _under_18, **age_brackets}

print("\n===== AGE DISTRIBUTION BY BRACKET =====")
for bracket, count in age_brackets.items():
    percentage = (count / len(customer_ages)) * 100
    print(f"{bracket}: {count} customers ({percentage:.1f}%)")

# 4. Visual Analysis: 2x2 figure summarising the age distribution

# Vertical reference lines for the histogram: (x position, colour, legend text)
_age_markers = [
    (np_mean, 'r', f'Mean: {np_mean:.2f}'),
    (np_median, 'g', f'Median: {np_median:.2f}'),
    (np_mode, 'b', f'Mode: {np_mode}'),
]

plt.figure(figsize=(15, 10))

# Top-left: histogram with KDE overlay and mean/median/mode lines
plt.subplot(2, 2, 1)
sns.histplot(customer_ages, kde=True, bins=10)
for _x, _colour, _text in _age_markers:
    plt.axvline(_x, color=_colour, linestyle='dashed', linewidth=1, label=_text)
plt.title('Customer Age Distribution')
plt.xlabel('Age (years)')
plt.ylabel('Count')
plt.legend()

# Top-right: box plot
plt.subplot(2, 2, 2)
sns.boxplot(y=customer_ages)
plt.title('Age Box Plot')
plt.ylabel('Age (years)')

# Bottom-left: bar chart of the bracket counts, each bar labelled with its count
plt.subplot(2, 2, 3)
_bracket_names = list(age_brackets.keys())
_bracket_counts = list(age_brackets.values())
plt.bar(_bracket_names, _bracket_counts, color='skyblue', edgecolor='black')
plt.title('Customer Age Brackets')
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
for _position, _bar_height in enumerate(_bracket_counts):
    plt.text(_position, _bar_height + 0.5, str(_bar_height), ha='center')

# Bottom-right: violin plot
plt.subplot(2, 2, 4)
sns.violinplot(y=customer_ages)
plt.title('Age Distribution (Violin Plot)')
plt.ylabel('Age (years)')

# Save first, then display
plt.tight_layout()
plt.savefig('customer_age_analysis.png')
plt.show()

# 5. Additional Statistics: sample spread (ddof=1) and distribution shape
print("\n===== ADDITIONAL STATISTICS =====")
print(f"Standard Deviation: {np.std(customer_ages, ddof=1):.2f} years")
print(f"Variance: {np.var(customer_ages, ddof=1):.2f}")
print(f"Skewness: {stats.skew(customer_ages):.4f}")
print(f"Kurtosis: {stats.kurtosis(customer_ages):.4f}")

# 6. Percentiles and IQR (both quartiles from a single percentile call)
Q1, Q3 = np.percentile(customer_ages, [25, 75])
IQR = Q3 - Q1

print("\n===== PERCENTILES =====")
print(f"25th Percentile (Q1): {Q1} years")
print(f"50th Percentile (Median): {np_median} years")
print(f"75th Percentile (Q3): {Q3} years")
print(f"IQR (Interquartile Range): {IQR} years")

# 7. Outlier detection: values outside the 1.5 * IQR fences
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = [years for years in customer_ages
            if not lower_bound <= years <= upper_bound]

print("\n===== OUTLIER DETECTION =====")
print(f"Lower Bound: {lower_bound:.2f} years")
print(f"Upper Bound: {upper_bound:.2f} years")
print(f"Outliers: {outliers}" if outliers else "No outliers detected")

# 8. Median absolute deviation (MAD): the median distance of each age from
# the median age, a spread measure that is robust to extreme values
_distance_from_median = np.abs(np.asarray(customer_ages) - np_median)
mad = np.median(_distance_from_median)
print(f"\nMedian Absolute Deviation (MAD): {mad:.2f}")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter
from datetime import datetime, timedelta

# Sample daily website visit data (30 days)
website_visits = [
    1254, 1380, 1432, 1198, 985, 876, 912,
    1342, 1487, 1511, 1387, 1289, 1345, 1036,
    987, 921, 1112, 1346, 1472, 1506,
    1387, 1290, 1145, 1032, 876, 921, 1087,
    1345, 1432, 1380
]

# Create dates for our sample data: one consecutive day per observation,
# ending yesterday. The range is derived from the data length so the two
# DataFrame columns can never get out of step.
today = datetime.now().date()
dates = [(today - timedelta(days=i)) for i in range(len(website_visits), 0, -1)]
dates_str = [date.strftime('%Y-%m-%d') for date in dates]

# Create DataFrame
visits_df = pd.DataFrame({
    'Date': dates_str,
    'Visits': website_visits
})

# 1. Basic information
print("===== WEBSITE VISITS DATA SUMMARY =====")
print(f"Total days analyzed: {len(website_visits)}")
print(f"Total visits: {sum(website_visits)}")
print(f"Minimum daily visits: {min(website_visits)}")
print(f"Maximum daily visits: {max(website_visits)}")

# 2. Mean, Median, Mode calculations

# Method 1: Manual calculation
mean_visits = sum(website_visits) / len(website_visits)
sorted_visits = sorted(website_visits)
n = len(sorted_visits)
# Even number of values: average the two middle ones
if n % 2 == 0:
    median_visits = (sorted_visits[n//2 - 1] + sorted_visits[n//2]) / 2
else:
    median_visits = sorted_visits[n//2]

# Finding mode manually
visits_counter = Counter(website_visits)
mode_count = max(visits_counter.values())
# Several counts occur twice in this data. Report the smallest of the tied
# values so the manual result matches scipy.stats.mode and Series.mode()[0];
# Counter.most_common(1) would return the tied value that appears first.
mode_visits = min(v for v, freq in visits_counter.items() if freq == mode_count)

print("\n===== MANUAL CALCULATION =====")
print(f"Mean visits per day: {mean_visits:.2f}")
print(f"Median visits per day: {median_visits:.2f}")
print(f"Mode visits per day: {mode_visits} (occurs {mode_count} times)")

# Method 2: Using NumPy and SciPy
np_mean = np.mean(website_visits)
np_median = np.median(website_visits)
mode_result = stats.mode(website_visits)
# Depending on the SciPy release, mode/count are scalars or length-1 arrays
# for 1-D input. Both forms expose the attributes, so attribute access never
# raises; np.atleast_1d(...)[0] normalises either form to a scalar.
np_mode = np.atleast_1d(mode_result.mode)[0]
np_mode_count = np.atleast_1d(mode_result.count)[0]

print("\n===== NUMPY/SCIPY CALCULATION =====")
print(f"Mean visits per day: {np_mean:.2f}")
print(f"Median visits per day: {np_median:.2f}")
print(f"Mode visits per day: {np_mode} (occurs {np_mode_count} times)")

# Method 3: Using Pandas
print("\n===== PANDAS CALCULATION =====")
print(visits_df['Visits'].describe())
print(f"Mode: {visits_df['Visits'].mode()[0]}")

# 3. Day of week analysis (assumes the date range covers every weekday)
# Parse the date strings and derive the weekday name for each row
visits_df['Date'] = pd.to_datetime(visits_df['Date'])
visits_df['DayOfWeek'] = visits_df['Date'].dt.day_name()

# Mean / median / number of observations per weekday, shown Monday..Sunday
# instead of the alphabetical order groupby produces
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_visits = (
    visits_df
    .groupby('DayOfWeek')['Visits']
    .agg(['mean', 'median', 'count'])
    .reindex(day_order)
)

print("\n===== VISITS BY DAY OF WEEK =====")
print(dow_visits)

# 4. Visualization: 2x2 overview figure
_visit_dates = visits_df['Date']
_daily_visits = visits_df['Visits']
_mean_text = f'Mean: {np_mean:.2f}'
_median_text = f'Median: {np_median:.2f}'

plt.figure(figsize=(15, 10))

# Top-left: daily series with horizontal mean / median reference lines
plt.subplot(2, 2, 1)
plt.plot(_visit_dates, _daily_visits, marker='o', linestyle='-')
plt.axhline(y=np_mean, color='r', linestyle='--', label=_mean_text)
plt.axhline(y=np_median, color='g', linestyle='--', label=_median_text)
plt.title('Daily Website Visits Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Top-right: histogram + KDE with vertical mean / median / mode lines
plt.subplot(2, 2, 2)
sns.histplot(website_visits, kde=True, bins=10)
for _x, _colour, _text in (
    (np_mean, 'r', _mean_text),
    (np_median, 'g', _median_text),
    (np_mode, 'b', f'Mode: {np_mode}'),
):
    plt.axvline(_x, color=_colour, linestyle='--', label=_text)
plt.title('Distribution of Daily Visits')
plt.xlabel('Number of Visits')
plt.ylabel('Frequency')
plt.legend()

# Bottom-left: box plot
plt.subplot(2, 2, 3)
sns.boxplot(y=website_visits)
plt.title('Daily Visits Box Plot')
plt.ylabel('Number of Visits')

# Bottom-right: average visits per weekday
plt.subplot(2, 2, 4)
sns.barplot(x=dow_visits.index, y=dow_visits['mean'])
plt.title('Average Visits by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Average Visits')
plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig('website_visits_analysis.png')
plt.show()

# 5. Trend Analysis
# 7-day rolling mean; the first six rows are NaN until the window is full
visits_df['7_Day_MA'] = visits_df['Visits'].rolling(window=7).mean()

# Daily series overlaid with its moving average, in a separate figure
plt.figure(figsize=(12, 6))
plt.plot(visits_df['Date'], visits_df['Visits'],
         marker='o', linestyle='-', label='Daily Visits')
plt.plot(visits_df['Date'], visits_df['7_Day_MA'],
         color='red', linewidth=2, label='7-Day Moving Average')
plt.title('Daily Website Visits with 7-Day Moving Average')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('website_visits_trend.png')
plt.show()

# 6. Additional Statistics (sample standard deviation / variance, ddof=1)
_visits_std = np.std(website_visits, ddof=1)

print("\n===== ADDITIONAL STATISTICS =====")
print(f"Standard Deviation: {_visits_std:.2f}")
print(f"Variance: {np.var(website_visits, ddof=1):.2f}")
print(f"Coefficient of Variation: {(_visits_std / np_mean) * 100:.2f}%")
print(f"Skewness: {stats.skew(website_visits):.4f}")
print(f"Kurtosis: {stats.kurtosis(website_visits):.4f}")

# 7. Percentiles, all cut-offs computed in one vectorised call
print("\n===== PERCENTILES =====")
percentiles = [10, 25, 50, 75, 90, 95]
for _pct, _cutoff in zip(percentiles, np.percentile(website_visits, percentiles)):
    print(f"{_pct}th Percentile: {_cutoff:.1f} visits")

# 8. Outlier detection with the 1.5 * IQR fences; each hit is recorded as
# (position in website_visits, visit count)
Q1, Q3 = np.percentile(website_visits, [25, 75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = [(day_index, day_total)
            for day_index, day_total in enumerate(website_visits)
            if not lower_bound <= day_total <= upper_bound]

print("\n===== OUTLIER DETECTION =====")
print(f"Lower Bound: {lower_bound:.2f}")
print(f"Upper Bound: {upper_bound:.2f}")
print(f"Outliers detected on days: {outliers}" if outliers else "No outliers detected")

# 9. Weekend vs. Weekday analysis
# Flag Saturday/Sunday rows with 1, every other day with 0
visits_df['IsWeekend'] = visits_df['DayOfWeek'].isin(['Saturday', 'Sunday']).astype('int64')
weekday_avg = visits_df[visits_df['IsWeekend'] == 0]['Visits'].mean()
weekend_avg = visits_df[visits_df['IsWeekend'] == 1]['Visits'].mean()

# Relative change of weekend traffic versus weekday traffic, in percent.
# The direction is carried by the word "higher"/"lower", so the magnitude is
# printed unsigned; otherwise a drop reads as e.g. "-20.00% lower".
weekend_change_pct = (weekend_avg / weekday_avg - 1) * 100
direction = 'higher' if weekend_avg > weekday_avg else 'lower'

print("\n===== WEEKEND VS WEEKDAY =====")
print(f"Average weekday visits: {weekday_avg:.2f}")
print(f"Average weekend visits: {weekend_avg:.2f}")
print(f"Difference: {abs(weekday_avg - weekend_avg):.2f}")
print(f"Weekend traffic is {abs(weekend_change_pct):.2f}% {direction} than weekday traffic")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.ticker as ticker

# Global plot appearance for the figures below: seaborn "whitegrid" theme,
# 12x8 inch default figure size and 12 pt base font
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Build a synthetic product price dataset from three price segments
np.random.seed(42)  # For reproducibility

# Each segment draws from a different distribution. The draw order
# (gamma, normal, lognormal) fixes the random stream and must not change.
low_prices = np.random.gamma(shape=2, scale=5, size=250)            # consumables, accessories
mid_prices = np.random.normal(loc=50, scale=12, size=500)           # regular items
high_prices = np.random.lognormal(mean=4.0, sigma=0.4, size=250)    # premium items

_segments = [
    ('Low-end', low_prices),
    ('Mid-range', mid_prices),
    ('Premium', high_prices),
]

# Prices of all segments back to back, with one category label per price
all_prices = np.concatenate([segment_prices for _, segment_prices in _segments])
product_categories = [
    segment_name
    for segment_name, segment_prices in _segments
    for _ in range(len(segment_prices))
]

# Prices are rounded to cents to look like real-world price tags
product_data = pd.DataFrame({
    'Price': np.round(all_prices, 2),
    'Category': product_categories
})

# Overall summary
print("===== PRODUCT PRICE DISTRIBUTION ANALYSIS =====")
print("\nBasic Statistics:")
print(product_data['Price'].describe())

# Shape of the overall distribution
skewness = stats.skew(product_data['Price'])
kurtosis = stats.kurtosis(product_data['Price'])

print(f"\nSkewness: {skewness:.4f}")
print(f"Kurtosis: {kurtosis:.4f}")

# Per-category summary table
print("\nDistribution by Category:")
category_stats = product_data.groupby('Category')['Price'].describe()
print(category_stats)

# First figure: four views of the price distribution on a 2x2 grid
_prices = product_data['Price']
_price_mean = _prices.mean()
_price_median = _prices.median()


def _dollar_formatter():
    """Return a fresh whole-dollar tick formatter (one instance per axis)."""
    return ticker.StrMethodFormatter('${x:.0f}')


plt.figure(figsize=(14, 10))

# 1. Histogram + KDE of all prices with mean and median marked
plt.subplot(2, 2, 1)
sns.histplot(_prices, bins=30, kde=True)
plt.title('Distribution of All Product Prices')
plt.xlabel('Price ($)')
plt.ylabel('Frequency')
plt.axvline(_price_mean, color='red', linestyle='--', label=f'Mean: ${_price_mean:.2f}')
plt.axvline(_price_median, color='green', linestyle='--', label=f'Median: ${_price_median:.2f}')
plt.legend()
plt.gca().xaxis.set_major_formatter(_dollar_formatter())

# 2. Same histogram on a log-scaled x axis (helpful for skewed distributions)
plt.subplot(2, 2, 2)
sns.histplot(_prices, bins=30, kde=True, log_scale=True)
plt.title('Product Price Distribution (Log Scale)')
plt.xlabel('Price ($) - Log Scale')
plt.ylabel('Frequency')
plt.gca().xaxis.set_major_formatter(_dollar_formatter())

# 3. Box plot of price per product category
plt.subplot(2, 2, 3)
sns.boxplot(x='Category', y='Price', data=product_data, palette='Set3')
plt.title('Price Distribution by Product Category')
plt.xlabel('Product Category')
plt.ylabel('Price ($)')
plt.gca().yaxis.set_major_formatter(_dollar_formatter())

# 4. Violin plot per category with quartile lines inside each violin
plt.subplot(2, 2, 4)
sns.violinplot(x='Category', y='Price', data=product_data, palette='Set3', inner='quartile')
plt.title('Violin Plot of Price by Category')
plt.xlabel('Product Category')
plt.ylabel('Price ($)')
plt.gca().yaxis.set_major_formatter(_dollar_formatter())

plt.tight_layout()
plt.savefig('product_price_distribution.png')
plt.show()

# 5. Second figure: CDF, per-category KDE, price-range counts and QQ plot
plt.figure(figsize=(14, 10))

# Empirical CDF: sorted prices against evenly spaced cumulative
# probabilities running from 0 (smallest price) to 1 (largest price)
plt.subplot(2, 2, 1)
sorted_data = np.sort(product_data['Price'])
_n_prices = len(sorted_data)
p = np.arange(_n_prices) / (_n_prices - 1)
plt.plot(sorted_data, p)
plt.title('Cumulative Distribution Function (CDF) of Prices')
plt.xlabel('Price ($)')
plt.ylabel('Cumulative Probability')
plt.grid(True, alpha=0.3)
plt.gca().xaxis.set_major_formatter(ticker.StrMethodFormatter('${x:.0f}'))

# One filled KDE curve per category, in order of first appearance
plt.subplot(2, 2, 2)
for _category_name, _category_prices in product_data.groupby('Category', sort=False)['Price']:
    sns.kdeplot(_category_prices, label=_category_name, fill=True, alpha=0.3)
plt.title('KDE by Product Category')
plt.xlabel('Price ($)')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)
plt.gca().xaxis.set_major_formatter(ticker.StrMethodFormatter('${x:.0f}'))

# Price Range Analysis
# Bucket every price into a labelled range (right-closed intervals)
bins = [0, 10, 25, 50, 100, 200, float('inf')]
labels = ['$0-10', '$10-25', '$25-50', '$50-100', '$100-200', '$200+']
product_data['Price Range'] = pd.cut(product_data['Price'], bins=bins, labels=labels)

# Products per range, listed in range order rather than by frequency
price_range_counts = product_data['Price Range'].value_counts().sort_index()

# Bar chart of the range counts, each bar annotated with its count
plt.subplot(2, 2, 3)
price_range_counts.plot(kind='bar', color='skyblue')
plt.title('Number of Products by Price Range')
plt.xlabel('Price Range')
plt.ylabel('Number of Products')
plt.xticks(rotation=45)
for _bar_index, _bar_count in enumerate(price_range_counts):
    plt.text(_bar_index, _bar_count + 5, str(_bar_count), ha='center')

# QQ plot against the normal distribution
plt.subplot(2, 2, 4)
stats.probplot(product_data['Price'], plot=plt)
plt.title('QQ Plot of Product Prices')

plt.tight_layout()
plt.savefig('product_price_distribution_additional.png')
plt.show()

# 6. Advanced Analysis

# Price cut-offs at selected percentiles (one vectorised percentile call)
percentiles = [10, 25, 50, 75, 90, 95, 99]
price_percentiles = np.percentile(product_data['Price'], percentiles)

print("\nPrice Percentiles:")
for _pct, _price_at_pct in zip(percentiles, price_percentiles):
    print(f"{_pct}th Percentile: ${_price_at_pct:.2f}")

# Calculate the Gini coefficient to measure price inequality
def gini(x):
    """Calculate the Gini coefficient of a collection of values.

    The coefficient is half the relative mean absolute difference: the mean
    of |x_i - x_j| over all ordered pairs, divided by twice the mean. It is
    computed here from the sorted values with the equivalent closed form

        G = 2 * sum(i * x_(i)) / (n * sum(x)) - (n + 1) / n,   i = 1..n

    which needs O(n log n) time and O(n) memory, instead of materialising
    the full n-by-n matrix of pairwise differences.

    Parameters
    ----------
    x : array-like of numbers
        Values to measure. Any shape is accepted; the data is flattened.

    Returns
    -------
    float
        The Gini coefficient (0 means all values are equal). NaN is returned
        when the input is empty or its values sum to zero, because the
        relative difference is undefined without a non-zero mean.
    """
    values = np.sort(np.asarray(x, dtype=float).ravel())
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        return float('nan')
    # 1-based ranks of the sorted values
    ranks = np.arange(1, n + 1)
    return 2.0 * np.dot(ranks, values) / (n * total) - (n + 1) / n

gini_coefficient = gini(product_data['Price'].values)
print(f"\nGini Coefficient (Price Inequality): {gini_coefficient:.4f}")

# Stacked bar chart: share of each price range within each product category
category_order = ['Low-end', 'Mid-range', 'Premium']
product_data_sorted = product_data.copy()
# Make the range labels an ordered categorical so the ranges keep their
# low-to-high order in the crosstab columns
product_data_sorted['Price Range'] = pd.Categorical(
    product_data_sorted['Price Range'], 
    categories=labels, 
    ordered=True
)

# Create a crosstab of category and price range; normalize='index' makes
# every category row sum to 1, which is then scaled to percent
price_category_cross = pd.crosstab(
    product_data_sorted['Category'], 
    product_data_sorted['Price Range'],
    normalize='index'
) * 100

# DataFrame.plot opens a figure of its own unless it is handed an Axes.
# Creating the figure here and passing ax= keeps the 12x6 size and avoids
# leaving a blank extra figure behind.
fig, ax = plt.subplots(figsize=(12, 6))
price_category_cross.loc[category_order].plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
plt.title('Price Range Distribution by Product Category (%)')
plt.xlabel('Product Category')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=0)
plt.legend(title='Price Range')
plt.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('product_price_category_distribution.png')
plt.show()

# Optional: Save the distribution data to a CSV file
product_data.to_csv('product_price_data.csv', index=False)

print("\nDistribution analysis complete! Visualizations and data summary saved.")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.patches as mpatches

# Global plot appearance for the exam figures: seaborn "whitegrid" theme,
# 12x8 inch default figure size and 12 pt base font
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Generate sample exam data
np.random.seed(42)  # For reproducibility

# Create a dataset with scores from different subjects
n_students = 200

# Generate scores for different subjects with different distributions
# Math is bimodal: about 60% of students around 65, the rest around 90.
# The second group takes the remainder so both groups always add up to
# n_students, even when n_students * 0.6 is not a whole number.
n_low_peak = int(n_students * 0.6)
n_high_peak = n_students - n_low_peak
math_scores = np.concatenate([
    np.random.normal(loc=65, scale=12, size=n_low_peak),   # Lower peak
    np.random.normal(loc=90, scale=8, size=n_high_peak)    # Higher peak
])

# English: roughly normal
english_scores = np.random.normal(loc=75, scale=15, size=n_students)

# Science: slightly skewed (beta distribution scaled to 0-100)
science_scores = np.random.beta(a=7, b=3, size=n_students) * 100

# History scores
history_scores = np.random.normal(loc=72, scale=14, size=n_students)

# Ensure all scores are within realistic bounds (0-100) and are whole
# numbers. Clipping and rounding are chained in one expression so the
# rounding operates on the clipped values, not on the raw draws.
subjects = ['Math', 'English', 'Science', 'History']
score_arrays = [
    np.round(np.clip(raw_scores, 0, 100)).astype(int)
    for raw_scores in (math_scores, english_scores, science_scores, history_scores)
]

# Create student IDs S001, S002, ...
student_ids = [f"S{i+1:03d}" for i in range(n_students)]

# Create a DataFrame in "long format" for easier plotting with seaborn.
# 'Score' lists every Math score, then every English score, and so on, so
# the labels must follow the same layout: the full ID list repeated once
# per subject, and each subject name repeated once per student.
exam_data_long = pd.DataFrame({
    'Student ID': np.tile(student_ids, len(subjects)),
    'Subject': np.repeat(subjects, n_students),
    'Score': np.concatenate(score_arrays)
})

# Also create a DataFrame in "wide format" (one row per student) for
# certain analyses
exam_data_wide = pd.DataFrame({
    'Student ID': student_ids,
    'Math': score_arrays[0],
    'English': score_arrays[1],
    'Science': score_arrays[2],
    'History': score_arrays[3]
})

# Add overall average score for each student
exam_data_wide['Average'] = exam_data_wide[subjects].mean(axis=1)

# Define grade boundaries
def assign_grade(score):
    """Map a numeric score to a letter grade.

    Grades are awarded from the highest threshold down: 90 and above is
    'A', 80 and above 'B', 70 and above 'C', 60 and above 'D'. Any score
    that reaches none of these thresholds is an 'F'.
    """
    thresholds = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
    for minimum, letter in thresholds:
        if score >= minimum:
            return letter
    return 'F'

# Columns that receive a letter grade: every subject plus the overall average
_graded_columns = subjects + ['Average']

# Add a '<name> Grade' column next to each graded score column
for _column in _graded_columns:
    exam_data_wide[f'{_column} Grade'] = exam_data_wide[_column].map(assign_grade)

# Summary statistics of the raw scores, one row per subject
print("===== EXAM SCORE DISTRIBUTION ANALYSIS =====")
print("\nBasic Statistics by Subject:")
subject_stats = exam_data_long.groupby('Subject')['Score'].describe()
print(subject_stats)

# Number of students per letter grade, sorted by grade letter
grade_counts = {
    _column: exam_data_wide[f'{_column} Grade'].value_counts().sort_index()
    for _column in _graded_columns
}

print("\nGrade Distribution:")
for _column, _tally in grade_counts.items():
    print(f"\n{_column}:")
    for _letter, _n_with_grade in _tally.items():
        percentage = (_n_with_grade / n_students) * 100
        print(f"{_letter}: {_n_with_grade} students ({percentage:.1f}%)")

# Shape metrics and pass rate for each subject
print("\nDistribution Metrics by Subject:")
for subject in subjects:
    subject_scores = exam_data_wide[subject]
    skewness = stats.skew(subject_scores)
    kurtosis = stats.kurtosis(subject_scores)
    # A score of 60 or more counts as a pass
    passing_rate = (subject_scores >= 60).mean() * 100

    print(f"\n{subject}:")
    print(f"Skewness: {skewness:.4f}")
    print(f"Kurtosis: {kurtosis:.4f}")
    print(f"Passing Rate: {passing_rate:.1f}%")

# 1. Histograms of scores by subject

# Shaded grade bands drawn behind every histogram. 'F' covers 0-60; each
# remaining grade starts at its entry in grade_bounds and ends where the
# next grade begins (100 for 'A'). Every grade has its own colour, so the
# B and A bands can be told apart in the plot and its legend.
grade_bounds = [60, 70, 80, 90]
grade_labels = ['D', 'C', 'B', 'A']
colors = ['#ffffcc', '#ccffcc', '#ccccff', '#e5ccff']  # one per grade label
fail_color = '#ffcccc'
band_upper_edges = grade_bounds[1:] + [100]

plt.figure(figsize=(15, 10))

# Use a 2x2 grid for the four subjects
for i, subject in enumerate(subjects):
    plt.subplot(2, 2, i+1)
    subject_data = exam_data_wide[subject]
    
    # Create the histogram with KDE
    sns.histplot(subject_data, bins=20, kde=True)
    
    # Add vertical lines for mean and median
    plt.axvline(subject_data.mean(), color='red', linestyle='--', 
                label=f'Mean: {subject_data.mean():.1f}')
    plt.axvline(subject_data.median(), color='green', linestyle='--', 
                label=f'Median: {subject_data.median():.1f}')
    
    # Fill areas for the grades, lowest grade first so the legend reads F..A
    plt.axvspan(0, grade_bounds[0], alpha=0.2, color=fail_color, label='F')
    for band_start, band_end, band_label, band_color in zip(
            grade_bounds, band_upper_edges, grade_labels, colors):
        plt.axvspan(band_start, band_end, alpha=0.2, color=band_color,
                    label=band_label)
    
    plt.title(f'{subject} Score Distribution')
    plt.xlabel('Score')
    plt.ylabel('Number of Students')
    plt.xlim(0, 100)
    plt.legend()

plt.tight_layout()
plt.savefig('exam_score_distributions.png')
plt.show()

# Arguments shared by the two per-subject comparison plots below
_by_subject = dict(x='Subject', y='Score', data=exam_data_long, palette='Set3')

# 2. Box plots: compare the score distributions of the subjects side by side
plt.figure(figsize=(12, 6))
sns.boxplot(**_by_subject)
plt.title('Score Distribution Comparison Across Subjects')
plt.xlabel('Subject')
plt.ylabel('Score')
plt.grid(True, alpha=0.3)
plt.savefig('exam_score_boxplots.png')
plt.show()

# 3. Violin plots: same comparison showing the full density, with the
# quartiles drawn inside each violin
plt.figure(figsize=(12, 6))
sns.violinplot(inner='quartile', **_by_subject)
plt.title('Detailed Score Distribution by Subject (Violin Plot)')
plt.xlabel('Subject')
plt.ylabel('Score')
plt.grid(True, alpha=0.3)
plt.savefig('exam_score_violinplots.png')
plt.show()

# 4. Grade distribution bar chart

# One row per letter grade, one column per subject (plus 'Average').
# A grade nobody received in a subject shows up as NaN and is filled with 0.
grade_data = pd.DataFrame({subject: grade_counts[subject] for subject in subjects + ['Average']})
grade_data = grade_data.fillna(0)

# Grouped (side-by-side) bars: one group per grade, one bar per subject.
# DataFrame.plot opens a figure of its own unless it is handed an Axes, so
# the figure is created here and passed in via ax=; this avoids leaving a
# blank extra figure behind.
fig, ax = plt.subplots(figsize=(14, 8))
grade_data.plot(kind='bar', stacked=False, ax=ax)
plt.title('Grade Distribution by Subject')
plt.xlabel('Grade')
plt.ylabel('Number of Students')
plt.xticks(rotation=0)
plt.grid(True, alpha=0.3, axis='y')
plt.legend(title='Subject')

plt.tight_layout()
plt.savefig('exam_grade_distribution.png')
plt.show()

# 5. Kernel Density Estimation (KDE): one filled curve per subject
plt.figure(figsize=(12, 6))
for _subject_name in subjects:
    _subject_column = exam_data_wide[_subject_name]
    sns.kdeplot(_subject_column, label=_subject_name, fill=True, alpha=0.3)
plt.title('Score Density Distribution by Subject')
plt.xlabel('Score')
plt.ylabel('Density')
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig('exam_score_kde.png')
plt.show()

# 6. Pairwise correlation between the subject scores, drawn as a heatmap
correlation_matrix = exam_data_wide[subjects].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Between Subject Scores')
plt.tight_layout()
plt.savefig('exam_score_correlation.png')
plt.show()

# 7. Empirical cumulative distribution function (CDF) per subject:
# sorted scores against evenly spaced probabilities from 0 to 1
plt.figure(figsize=(12, 6))
for _subject_name in subjects:
    sorted_data = np.sort(exam_data_wide[_subject_name])
    _n_scores = len(sorted_data)
    p = np.arange(_n_scores) / (_n_scores - 1)
    plt.plot(sorted_data, p, label=_subject_name)

plt.title('Cumulative Distribution of Scores by Subject')
plt.xlabel('Score')
plt.ylabel('Cumulative Probability')
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig('exam_score_cdf.png')
plt.show()

# 8. Create bins for score ranges and analyze distribution
# Bins are right-closed, e.g. (60, 70] is labelled '61-70'; include_lowest
# makes the first bin also contain a score of exactly 0.
score_bins = [0, 20, 40, 60, 70, 80, 90, 100]
bin_labels = ['0-20', '21-40', '41-60', '61-70', '71-80', '81-90', '91-100']

# Add a '<subject> Range' column holding the bin label of every score
for subject in subjects:
    exam_data_wide[f'{subject} Range'] = pd.cut(
        exam_data_wide[subject], 
        bins=score_bins, 
        labels=bin_labels,
        include_lowest=True
    )

# Count by score range for each subject, kept in bin order
range_distributions = {}
for subject in subjects:
    range_distributions[subject] = exam_data_wide[f'{subject} Range'].value_counts().sort_index()

# Plot score range distribution as grouped bars (one bar per subject in
# each range). DataFrame.plot opens a figure of its own unless it is handed
# an Axes, so the figure is created here and passed in via ax=; this avoids
# leaving a blank extra figure behind.
range_dist_df = pd.DataFrame(range_distributions)
fig, ax = plt.subplots(figsize=(14, 8))
range_dist_df.plot(kind='bar', ax=ax)
plt.title('Score Range Distribution by Subject')
plt.xlabel('Score Range')
plt.ylabel('Number of Students')
plt.grid(True, alpha=0.3, axis='y')
plt.legend(title='Subject')
plt.tight_layout()
plt.savefig('exam_score_ranges.png')
plt.show()

# 9. Normal probability plot (QQ plot) for each subject, one panel per
# subject on a 2x2 grid
plt.figure(figsize=(15, 10))
for _panel, _subject_name in enumerate(subjects, start=1):
    plt.subplot(2, 2, _panel)
    stats.probplot(exam_data_wide[_subject_name], plot=plt)
    plt.title(f'QQ Plot for {_subject_name} Scores')
plt.tight_layout()
plt.savefig('exam_score_qqplots.png')
plt.show()

# Optional: write both table layouts to CSV files (without the row index)
for _frame, _csv_path in (
    (exam_data_wide, 'exam_data_wide.csv'),
    (exam_data_long, 'exam_data_long.csv'),
):
    _frame.to_csv(_csv_path, index=False)

print("\nExam score distribution analysis complete! Visualizations and data summary saved.")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime, timedelta
import matplotlib.ticker as ticker
import matplotlib.dates as mdates

# Set the visualization style: seaborn grid theme plus default figure size
# and font size for every figure created afterwards
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Generate sample order quantity data.
# Produces `order_data`, a DataFrame with one row per synthetic order and the
# columns OrderDate (chronologically sorted), Category and Quantity (>= 1).
np.random.seed(42)  # For reproducibility

# Create a dataset with order information
n_orders = 1000
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 4, 30)

# Generate random dates between start and end date (both inclusive).
# randint's upper bound is exclusive, so add 1 to make end_date reachable.
days_range = (end_date - start_date).days
random_days = np.random.randint(0, days_range + 1, n_orders)

# Iterating a NumPy array yields NumPy integer scalars; datetime.timedelta
# only accepts built-in numbers, so convert each offset with int() first.
# sorted() puts the dates in chronological order.
order_dates = sorted(start_date + timedelta(days=int(day)) for day in random_days)

# Generate product categories with different order quantity patterns
categories = ['Electronics', 'Clothing', 'Food', 'Home Goods', 'Office Supplies']
category_probs = [0.15, 0.25, 0.3, 0.2, 0.1]  # Probability of each category

# Sample categories based on probability
product_categories = np.random.choice(categories, n_orders, p=category_probs)

# Generate order quantities with different distributions based on category.
# Each draw is truncated to an integer and floored at 1 so that no order has
# a zero or negative quantity.
order_quantities = []

for category in product_categories:
    if category == 'Electronics':
        # Electronics typically have lower quantities per order
        quantity = max(1, int(np.random.lognormal(1.1, 0.8)))
    elif category == 'Clothing':
        # Clothing might have medium quantities
        quantity = max(1, int(np.random.normal(3, 2)))
    elif category == 'Food':
        # Food items might have higher quantities
        quantity = max(1, int(np.random.gamma(5, 1)))
    elif category == 'Home Goods':
        # Home goods with medium to low quantities
        quantity = max(1, int(np.random.poisson(2)))
    else:  # Office Supplies
        # Office supplies with more varied quantities
        quantity = max(1, int(np.random.exponential(5)))

    order_quantities.append(quantity)

# Create the main DataFrame. Dates are sorted while categories and quantities
# are drawn independently, so sorting does not bias any category over time.
order_data = pd.DataFrame({
    'OrderDate': order_dates,
    'Category': product_categories,
    'Quantity': order_quantities
})

# Derive calendar features from the order date for later grouping
order_timestamps = order_data['OrderDate'].dt
order_data['Month'] = order_timestamps.month_name()
order_data['DayOfWeek'] = order_timestamps.day_name()
order_data['WeekOfYear'] = order_timestamps.isocalendar().week

# Summary statistics over every order
print("===== ORDER QUANTITY DISTRIBUTION ANALYSIS =====")
print("\nBasic Statistics for All Orders:")
print(order_data['Quantity'].describe())

# The same summary, broken down per product category
print("\nOrder Quantity Statistics by Product Category:")
category_stats = order_data.groupby('Category')['Quantity'].describe()
print(category_stats)

# Shape metrics per category, reported in the order of the `categories` list
print("\nDistribution Metrics by Category:")
for category in categories:
    in_category = order_data['Category'] == category
    quantities = order_data.loc[in_category, 'Quantity']
    skewness = stats.skew(quantities)
    kurtosis = stats.kurtosis(quantities)
    # Standard deviation relative to the mean, expressed as a percentage
    variation_pct = (quantities.std() / quantities.mean()) * 100
    print(f"\n{category}:")
    print(f"Skewness: {skewness:.4f}")
    print(f"Kurtosis: {kurtosis:.4f}")
    print(f"Coefficient of Variation: {variation_pct:.2f}%")

# Four-panel overview of the order quantity distribution (2x2 grid)
all_quantities = order_data['Quantity']
mean_quantity = all_quantities.mean()
median_quantity = all_quantities.median()

plt.figure(figsize=(15, 10))

# 1. Overall distribution of order quantities, with mean and median marked
plt.subplot(2, 2, 1)
sns.histplot(all_quantities, bins=30, kde=True)
plt.axvline(mean_quantity, color='red', linestyle='--',
            label=f'Mean: {mean_quantity:.2f}')
plt.axvline(median_quantity, color='green', linestyle='--',
            label=f'Median: {median_quantity:.2f}')
plt.title('Distribution of Order Quantities (All Products)')
plt.xlabel('Order Quantity')
plt.ylabel('Frequency')
plt.legend()

# 2. Same histogram with a logarithmic frequency axis, so that sparsely
#    populated bins in the tail of the skewed data stay visible
plt.subplot(2, 2, 2)
sns.histplot(all_quantities, bins=30, kde=True, log_scale=(False, True))
plt.title('Order Quantity Distribution (Log Frequency Scale)')
plt.xlabel('Order Quantity')
plt.ylabel('Frequency (Log Scale)')

# 3. Box plot and 4. violin plot by category. Both panels share the same
#    data mapping, palette and axis labelling, so they are drawn in one loop.
category_panels = (
    (3, sns.boxplot, {}, 'Order Quantity by Product Category'),
    (4, sns.violinplot, {'inner': 'quartile'},
     'Detailed Order Quantity Distribution by Category'),
)
for position, draw_panel, extra_options, panel_title in category_panels:
    plt.subplot(2, 2, position)
    draw_panel(x='Category', y='Quantity', data=order_data, palette='Set3',
               **extra_options)
    plt.title(panel_title)
    plt.xlabel('Product Category')
    plt.ylabel('Order Quantity')
    plt.xticks(rotation=45)  # category names would overlap when horizontal

plt.tight_layout()
plt.savefig('order_quantity_distribution.png')
plt.show()

# 5. Time-based analysis: weekly totals and weekly averages on stacked panels
plt.figure(figsize=(15, 8))

# Aggregate order quantities into weekly bins ('W' frequency on OrderDate)
weekly_grouper = pd.Grouper(key='OrderDate', freq='W')
weekly_orders = (
    order_data.groupby(weekly_grouper)['Quantity']
    .agg(['count', 'sum', 'mean', 'median', 'std'])
    .reset_index()
)
week_dates = weekly_orders['OrderDate']
weekly_mean = weekly_orders['mean']
weekly_std = weekly_orders['std']


def _style_week_axis(y_label):
    """Apply the labelling and date formatting shared by both weekly panels.

    Acts on the currently active axes: labels the axes, adds a light grid,
    formats x ticks as YYYY-MM-DD with a tick every second week, and rotates
    the tick labels by 45 degrees.
    """
    plt.xlabel('Week')
    plt.ylabel(y_label)
    plt.grid(True, alpha=0.3)
    date_axis = plt.gca().xaxis
    date_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    date_axis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.xticks(rotation=45)


# Top panel: total quantity ordered per week
plt.subplot(2, 1, 1)
plt.plot(week_dates, weekly_orders['sum'], marker='o', linestyle='-')
plt.title('Weekly Total Order Quantity')
_style_week_axis('Total Quantity')

# Bottom panel: weekly mean with a shaded band of one standard deviation
# on either side
plt.subplot(2, 1, 2)
plt.plot(week_dates, weekly_mean, marker='o', linestyle='-', color='green')
plt.fill_between(
    week_dates,
    weekly_mean - weekly_std,
    weekly_mean + weekly_std,
    alpha=0.2,
    color='green'
)
plt.title('Weekly Average Order Quantity (with Standard Deviation)')
_style_week_axis('Average Quantity')

plt.tight_layout()
plt.savefig('order_quantity_time_analysis.png')
plt.show()

# 6. Day of week distribution: average order quantity (bars, left axis)
#    against number of orders (line, right axis).

# Order the days correctly; a plain groupby would sort them alphabetically
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# reindex also inserts NaN for any weekday that has no orders at all
daily_avg = order_data.groupby('DayOfWeek')['Quantity'].mean().reindex(day_order)
daily_count = order_data.groupby('DayOfWeek')['Quantity'].count().reindex(day_order)

# Dual axis plot. plt.subplots creates the figure itself, so no separate
# plt.figure() call is made (that would only leave an empty figure behind).
fig, ax1 = plt.subplots(figsize=(14, 6))

# Plot average quantity on primary y-axis
color = 'tab:blue'
ax1.set_xlabel('Day of Week')
ax1.set_ylabel('Average Order Quantity', color=color)
ax1.bar(daily_avg.index, daily_avg.values, color=color, alpha=0.2)
ax1.tick_params(axis='y', labelcolor=color)

# Plot number of orders on secondary y-axis, sharing the weekday x-axis.
# daily_count was computed for this purpose but previously never drawn.
count_color = 'tab:red'
ax2 = ax1.twinx()
ax2.set_ylabel('Number of Orders', color=count_color)
ax2.plot(daily_count.index, daily_count.values, color=count_color, marker='o')
ax2.tick_params(axis='y', labelcolor=count_color)

ax1.set_title('Average Order Quantity and Order Count by Day of Week')
fig.tight_layout()
plt.show()



# 3. Data Type Validation:
# Task 1: Validate numeric fields in a dataset to ensure they contain only numbers.
# Task 2: Check for valid date formats in a transaction log.
# Task 3: Validate email formats in a customer contact dataset.
import pandas as pd

# Read the dataset under validation
df = pd.read_csv('data.csv')

# Try to interpret every 'amount' value as a number; values that cannot be
# parsed come back as NaN instead of raising
is_numeric = pd.to_numeric(df['amount'], errors='coerce')

# A row is invalid when parsing failed although the original cell was not
# empty; genuinely missing amounts are not counted as invalid
amount_present = df['amount'].notna()
parse_failed = is_numeric.isna()
invalid_numeric = df[parse_failed & amount_present]

print(f"Number of invalid numeric entries: {len(invalid_numeric)}")
print("Invalid numeric entries:\n", invalid_numeric)
# Parse 'transaction_date' into datetimes; unparseable values become NaT.
# The parsed result is stored next to the raw column for comparison.
parsed_dates = pd.to_datetime(df['transaction_date'], errors='coerce')
df['transaction_date_valid'] = parsed_dates

# Invalid rows: parsing failed even though the raw field held a value
date_present = df['transaction_date'].notna()
parse_failed = df['transaction_date_valid'].isna()
invalid_dates = df[parse_failed & date_present]

print(f"Number of invalid dates: {len(invalid_dates)}")
print("Invalid date entries:\n", invalid_dates)
import re

# Simple regex for email validation: local part, '@', domain, a dot and a
# final word-character label. Intentionally loose, not a full RFC check.
email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'

# Missing emails are not format errors. Exclude them here, the same way the
# numeric and date checks exclude missing values; otherwise astype(str)
# turns NaN into the text 'nan', which is then reported as a malformed email.
email_present = df['email'].notna()

# fullmatch requires the entire value to match the pattern, so a value with
# a trailing newline (which '$' alone would tolerate) is rejected.
email_well_formed = df['email'].astype(str).str.fullmatch(email_pattern, na=False)

# Invalid rows: an email was supplied but does not match the pattern
invalid_emails = df[email_present & ~email_well_formed]

print(f"Number of invalid emails: {len(invalid_emails)}")
print("Invalid email entries:\n", invalid_emails[['email']])







