# In [None]:
# Ques 3: Power Transformation (Box-Cox Method)

# Objective: Use Box-Cox transformation to stabilize variance and make the data more normally distributed

# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro

# Step 2: Load Dataset
# Replace 'data.csv' with the actual dataset file path
df = pd.read_csv('data.csv')  # Example placeholder
# Print the preview explicitly: a bare `df.head()` expression that is not the
# last statement of a cell/script is evaluated and then silently discarded.
print(df.head())

# Step 3: Select Column & Ensure Positive Values
col = 'target_column'  # Replace with the actual column name

# Box-Cox requires strictly positive input. When any value is zero or
# negative, translate the whole column upward so that its minimum becomes 1.
has_non_positive = df[col].le(0).any()
if has_non_positive:
    shift = 1 + abs(df[col].min())
    df[col] = df[col].add(shift)
    print(f"Data shifted by {shift} to make all values positive.")

# Step 4: Visualize Original Data
# Histogram with a KDE overlay of the selected column (after any shift applied
# in the previous step), drawn on an explicitly created axes object.
_, original_ax = plt.subplots(figsize=(8, 4))
sns.histplot(df[col], kde=True, ax=original_ax)
original_ax.set_title('Original Data Distribution')
original_ax.set_xlabel(col)
original_ax.set_ylabel('Frequency')
plt.show()

# Step 5: Apply Box-Cox Transformation
# Fit only on the observed (non-missing) values. Missing entries would
# otherwise feed NaN into the lambda estimation and make both the fitted
# lambda and the transformed column unusable. Rows that were missing in the
# input stay NaN in the output column.
observed = df[col].notna()
observed_values = df.loc[observed, col].to_numpy()
transformed_data, fitted_lambda = stats.boxcox(observed_values)

# Write back through the boolean mask (not index alignment) so this also works
# when the DataFrame index contains duplicate labels.
df['transformed'] = np.nan
df.loc[observed, 'transformed'] = transformed_data

print(f"Optimal Lambda for Box-Cox Transformation: {fitted_lambda}")

# Step 6: Visualize Transformed Data
# Histogram with a KDE overlay of the Box-Cox output column, drawn on an
# explicitly created axes object.
_, transformed_ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['transformed'], kde=True, ax=transformed_ax)
transformed_ax.set_title('Box-Cox Transformed Data Distribution')
transformed_ax.set_xlabel('Transformed Values')
transformed_ax.set_ylabel('Frequency')
plt.show()

# Step 7: Normality Check (Shapiro-Wilk Test)
# Test only the non-missing transformed values. A NaN in the sample yields a
# NaN p-value, which fails the `p > 0.05` comparison below and would be
# misreported as "reject H0".
normality_sample = df['transformed'].dropna()
stat, p = shapiro(normality_sample)
print(f'Shapiro-Wilk Test: Statistics={stat}, p-value={p}')

if len(normality_sample) > 5000:
    # SciPy documents that for N > 5000 the W statistic is accurate but the
    # p-value may not be, so flag the result instead of presenting it as exact.
    print('Note: sample size exceeds 5000; the Shapiro-Wilk p-value may be inaccurate.')

# H0: the sample was drawn from a normal distribution (significance level 5%).
if p > 0.05:
    print('Transformed data looks Gaussian (fail to reject H0)')
else:
    print('Transformed data does not look Gaussian (reject H0)')

# Step 8: Summary
# Emit a short recap, one line per entry.
# NOTE(review): the final line states that the distribution improved
# unconditionally; it is not derived from the Shapiro-Wilk result above.
summary_lines = (
    "\n--- Summary ---",
    "The Box-Cox transformation was applied successfully.",
    f"Optimal λ (lambda): {fitted_lambda}",
    "Distribution improved and normality tested using Shapiro-Wilk.",
)
for summary_line in summary_lines:
    print(summary_line)