In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship the un-versioned name, and plt.style.use() raises
# OSError for a style it does not know. Pick whichever name this installation
# actually provides so the notebook runs on either side of that rename.
_preferred_style = 'seaborn-v0_8-whitegrid'
_legacy_style = 'seaborn-whitegrid'
if _preferred_style in plt.style.available:
    plt.style.use(_preferred_style)
elif _legacy_style in plt.style.available:
    plt.style.use(_legacy_style)
else:
    # Neither style sheet is bundled: fall back to seaborn's own whitegrid theme.
    sns.set_style('whitegrid')
sns.set_palette("viridis")

# For reproducibility: seeds NumPy's global RNG used by the np.random.* calls below
np.random.seed(42)


In [None]:
# Generate sample data: 1000 draws from a normal distribution (mean 50, sd 15)
data = np.random.normal(loc=50, scale=15, size=1000)

# Calculate basic statistics.
# `data` is a sample drawn from a distribution, so its spread is estimated with
# ddof=1 (Bessel's correction). NumPy's default, ddof=0, is the population
# formula and systematically underestimates the variance of the source
# distribution.
mean_val = np.mean(data)
median_val = np.median(data)
std_val = np.std(data, ddof=1)
var_val = np.var(data, ddof=1)
min_val = np.min(data)
max_val = np.max(data)
q1, q3 = np.percentile(data, [25, 75])  # both quartiles in a single pass
iqr = q3 - q1  # interquartile range: width of the middle 50% of the values

# Display results
print(f"Mean: {mean_val:.2f}")
print(f"Median: {median_val:.2f}")
print(f"Standard Deviation: {std_val:.2f}")
print(f"Variance: {var_val:.2f}")
print(f"Min: {min_val:.2f}")
print(f"Max: {max_val:.2f}")
print(f"Q1 (25th percentile): {q1:.2f}")
print(f"Q3 (75th percentile): {q3:.2f}")
print(f"IQR: {iqr:.2f}")


In [None]:
# Two-panel summary of `data`: histogram with KDE on the left, box plot on the right
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: histogram + KDE with the mean and median marked as vertical lines
sns.histplot(data, kde=True, color='blue', ax=hist_ax)
hist_ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')
hist_ax.axvline(median_val, color='green', linestyle='-.', label=f'Median: {median_val:.2f}')
hist_ax.set_title('Distribution of Data')
hist_ax.set_xlabel('Value')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()

# Right panel: box plot of the same values
sns.boxplot(y=data, ax=box_ax)
box_ax.set_title('Box Plot')
box_ax.set_ylabel('Value')

fig.tight_layout()
plt.show()


In [None]:
# Evaluation grids for the probability density functions
x = np.linspace(-5, 5, 1000)
uniform_x = np.linspace(-3, 3, 1000)   # wider than the support so both edges are visible
exp_x = np.linspace(0, 5, 1000)

# Densities evaluated on those grids
normal_dist = stats.norm.pdf(x, loc=0, scale=1)                # standard normal
t_dist = stats.t.pdf(x, df=5)                                  # Student's t, 5 degrees of freedom
uniform_dist = stats.uniform.pdf(uniform_x, loc=-2, scale=4)   # uniform on [-2, 2]
exp_dist = stats.expon.pdf(exp_x, scale=1)                     # exponential, scale 1

# One row per panel: (x grid, density, matplotlib format string, title)
density_panels = [
    (x, normal_dist, 'b-', 'Normal Distribution'),
    (x, t_dist, 'r-', "Student's t-Distribution (df=5)"),
    (uniform_x, uniform_dist, 'g-', 'Uniform Distribution'),
    (exp_x, exp_dist, 'y-', 'Exponential Distribution'),
]

# Draw the densities on a 2x2 grid
plt.figure(figsize=(14, 10))
for panel_no, (grid, density, line_fmt, heading) in enumerate(density_panels, start=1):
    plt.subplot(2, 2, panel_no)
    plt.plot(grid, density, line_fmt, lw=2)
    plt.title(heading)
    plt.xlabel('x')
    plt.ylabel('Probability Density')

plt.tight_layout()
plt.show()


In [None]:
# Generate two samples of 100 values each: same spread, means 5 units apart
group_a = np.random.normal(loc=50, scale=10, size=100)
group_b = np.random.normal(loc=55, scale=10, size=100)
mean_a = np.mean(group_a)
mean_b = np.mean(group_b)

# Perform an independent two-sample t-test (null hypothesis: equal means)
t_stat, p_value = stats.ttest_ind(group_a, group_b)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Statistically significant difference (α=0.05): {p_value < 0.05}")

# Visualize the two distributions.
# Both histograms are drawn on ONE set of bin edges computed from the pooled
# data. Letting each histplot call pick its own bins gives bars of different
# widths and offsets, so the overlaid bar heights would not be comparable.
shared_bins = np.histogram_bin_edges(np.concatenate([group_a, group_b]), bins='auto')

plt.figure(figsize=(10, 6))
sns.histplot(group_a, bins=shared_bins, color='blue', label='Group A', kde=True, alpha=0.6)
sns.histplot(group_b, bins=shared_bins, color='red', label='Group B', kde=True, alpha=0.6)
plt.axvline(mean_a, color='blue', linestyle='--', label=f'Mean A: {mean_a:.2f}')
plt.axvline(mean_b, color='red', linestyle='--', label=f'Mean B: {mean_b:.2f}')
plt.title('Comparison of Two Groups')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
plt.show()


In [None]:
def _pearson_and_cov(a, b):
    """Return the off-diagonal entries of np.corrcoef(a, b) and np.cov(a, b)."""
    return np.corrcoef(a, b)[0, 1], np.cov(a, b)[0, 1]


# Base variable: n standard-normal draws
n = 200
x = np.random.normal(0, 1, n)

# Targets built from x plus independent Gaussian noise.
# The slope/noise ratio sets how strongly each one tracks x.
y_strong_pos = x * 0.9 + np.random.normal(0, 0.3, n)  # steep slope, little noise
y_weak_pos = x * 0.3 + np.random.normal(0, 0.9, n)    # shallow slope, heavy noise
y_strong_neg = -x * 0.9 + np.random.normal(0, 0.3, n) # mirrored slope, little noise
y_no_corr = np.random.normal(0, 1, n)                # independent of x

# Correlation coefficient and covariance of x with each target
corr_strong_pos, cov_strong_pos = _pearson_and_cov(x, y_strong_pos)
corr_weak_pos, cov_weak_pos = _pearson_and_cov(x, y_weak_pos)
corr_strong_neg, cov_strong_neg = _pearson_and_cov(x, y_strong_neg)
corr_no_corr, cov_no_corr = _pearson_and_cov(x, y_no_corr)

# Report both measures side by side
print(f"Strong positive correlation: {corr_strong_pos:.4f}, covariance: {cov_strong_pos:.4f}")
print(f"Weak positive correlation: {corr_weak_pos:.4f}, covariance: {cov_weak_pos:.4f}")
print(f"Strong negative correlation: {corr_strong_neg:.4f}, covariance: {cov_strong_neg:.4f}")
print(f"No correlation: {corr_no_corr:.4f}, covariance: {cov_no_corr:.4f}")


In [None]:
# Scatter plots of x against each target, one panel per correlation scenario.
# One row per panel: (y values, panel title showing the computed r)
scatter_panels = [
    (y_strong_pos, f'Strong Positive Correlation (r = {corr_strong_pos:.4f})'),
    (y_weak_pos, f'Weak Positive Correlation (r = {corr_weak_pos:.4f})'),
    (y_strong_neg, f'Strong Negative Correlation (r = {corr_strong_neg:.4f})'),
    (y_no_corr, f'No Correlation (r = {corr_no_corr:.4f})'),
]

plt.figure(figsize=(16, 12))
for panel_no, (y_values, heading) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, panel_no)
    plt.scatter(x, y_values, alpha=0.6)
    plt.title(heading)
    plt.xlabel('X')
    plt.ylabel('Y')

plt.tight_layout()
plt.show()


In [None]:
# Generate data for two models' performance: 30 simulated scores per model.
# Re-seeding here makes the cell give the same numbers every time it is run.
np.random.seed(42)
model_a_scores = np.random.normal(loc=0.75, scale=0.05, size=30)
model_b_scores = np.random.normal(loc=0.78, scale=0.05, size=30)

# Summary statistics.
# The scores are samples, so the spread is reported with ddof=1 (sample
# standard deviation). That matches the sample variances the t-test below is
# built on; NumPy's default ddof=0 would report a slightly smaller spread.
mean_a = np.mean(model_a_scores)
mean_b = np.mean(model_b_scores)
std_a = np.std(model_a_scores, ddof=1)
std_b = np.std(model_b_scores, ddof=1)

# Perform t-test to compare models (null hypothesis: equal mean score)
t_stat, p_value = stats.ttest_ind(model_a_scores, model_b_scores)

# Print results
print(f"Model A average score: {mean_a:.4f} ± {std_a:.4f}")
print(f"Model B average score: {mean_b:.4f} ± {std_b:.4f}")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret results at the chosen significance level
alpha = 0.05
if p_value < alpha:
    print(f"Reject null hypothesis: There is a statistically significant difference between models (p < {alpha})")
else:
    print(f"Fail to reject null hypothesis: No statistically significant difference between models (p >= {alpha})")

# Visualize comparison: one box per model
plt.figure(figsize=(10, 6))
box_data = [model_a_scores, model_b_scores]
sns.boxplot(data=box_data)
plt.xticks([0, 1], ['Model A', 'Model B'])
plt.ylabel('Performance Score')
plt.title('Model Performance Comparison')
plt.show()


In [None]:
# scikit-learn pieces used by this cell
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Synthetic regression problem: y = 2x + 5 plus Gaussian noise (sd 2), 100 points
np.random.seed(42)
X = np.random.uniform(0, 10, 100).reshape(-1, 1)   # single feature as a column
noise = np.random.normal(0, 2, 100)
y = 2 * X.ravel() + 5 + noise

# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit ordinary least squares on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predictions on both splits
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Goodness-of-fit metrics
train_mse = mean_squared_error(y_train, y_pred_train)
train_r2 = r2_score(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

# Residuals (observed minus predicted)
train_errors = y_train - y_pred_train
test_errors = y_test - y_pred_test

print(f"Model coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_}")
print(f"Training MSE: {train_mse:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")


In [None]:
# Visualize the linear regression model and its residual diagnostics on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fit_ax, hist_ax, resid_ax, qq_ax = axes.ravel()

# Pooled residuals: used for the shared histogram bins and for the Q-Q plot
all_errors = np.concatenate([train_errors, test_errors])

# Panel 1: data and fitted line
fit_ax.scatter(X_train, y_train, alpha=0.6, label='Training data')
fit_ax.scatter(X_test, y_test, alpha=0.6, label='Test data')
fit_ax.plot(X_train, y_pred_train, color='red', linewidth=2, label='Regression line')
fit_ax.set_title('Linear Regression Model')
fit_ax.set_xlabel('X')
fit_ax.set_ylabel('y')
fit_ax.legend()

# Panel 2: error distributions.
# Both histograms use the same bin edges (computed from the pooled residuals).
# With independently chosen bins the overlaid bars have different widths and
# offsets, so their heights cannot be compared.
error_bins = np.histogram_bin_edges(all_errors, bins='auto')
sns.histplot(train_errors, bins=error_bins, kde=True, color='blue', label='Training errors', ax=hist_ax)
sns.histplot(test_errors, bins=error_bins, kde=True, color='red', label='Test errors', ax=hist_ax)
hist_ax.set_title('Error Distribution')
hist_ax.set_xlabel('Error')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()

# Panel 3: residuals against predictions; the horizontal line marks zero error
resid_ax.scatter(y_pred_train, train_errors, alpha=0.6, label='Training')
resid_ax.scatter(y_pred_test, test_errors, alpha=0.6, label='Test')
resid_ax.axhline(y=0, color='red', linestyle='-')
resid_ax.set_title('Residual Plot')
resid_ax.set_xlabel('Predicted values')
resid_ax.set_ylabel('Residuals')
resid_ax.legend()

# Panel 4: Q-Q plot to check normality of residuals.
# probplot writes its own title onto the axes, so ours is set afterwards.
stats.probplot(all_errors, plot=qq_ax)
qq_ax.set_title('Q-Q Plot of Residuals')

fig.tight_layout()
plt.show()
