# **Statistics Homework Assignment**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import warnings
warnings.filterwarnings('ignore')

# Question 1: Basic Measures of Central Tendency and Dispersion (Beginner) 

Part A (4 points): Calculate the mean, median, and mode for this dataset. 

In [None]:
data = [45, 52, 38, 47, 55, 41, 49]

# Convert list to pandas Series
df = pd.Series(data, name="Customers")
print(df)

0    45
1    52
2    38
3    47
4    55
5    41
6    49
Name: Customers, dtype: int64


In [None]:
# === Part A: Mean, Median, Mode ===
mean = df.mean()
median = df.median()
mode = df.mode()

print("Basic Measures of Central Tendency ")
print(f"Mean = {mean:.3f}")
print(f"Median = {median}")
if len(mode) == len(df):  
    print("Mode = No mode (all values unique)")
else:
    print(f"Mode = {mode.tolist()}")


Basic Measures of Central Tendency 
Mean = 46.714
Median = 47.0
Mode = No mode (all values unique)


Part B (3 points): Calculate the sample variance and sample standard deviation. Show your work step by step.

In [None]:
# Part B: Sample Variance and Standard Deviation 
df_deviation = df - mean
df_squared = df_deviation ** 2
sum_squared = df_squared.sum()
n = len(df)
sample_variance = sum_squared / (n - 1)
sample_std = sample_variance ** 0.5

# Combine into a DataFrame for step-by-step display
steps = pd.DataFrame({
    "Value": df,
    "Deviation (x - mean)": df_deviation,
    "Squared Deviation": df_squared
})

print("\n Variance & Std. Deviation")
print(f"\nSum of Squared Deviations = {sum_squared:.3f}")
print(f"Sample Variance = {sample_variance:.3f}")
print(f"Sample Standard Deviation = {sample_std:.3f}")


 Variance & Std. Deviation

Sum of Squared Deviations = 213.429
Sample Variance = 35.571
Sample Standard Deviation = 5.964


Part C (3 points): Explain which measure of central tendency (mean, median, or mode) would be most appropriate if there was an outlier day with 150 customers, and why?

In [None]:
data_outlier = data + [150] 
df_outlier = pd.Series(data_outlier, name="Customers with Outlier")

mean_outlier = df_outlier.mean()
median_outlier = df_outlier.median()
mode_outlier = df_outlier.mode()

print("\n Effect of Outlier")
print(f"New Data: {data_outlier}")
print(f"New Mean = {mean_outlier:.3f}")
print(f"New Median = {median_outlier}")
print(f"New Mode = {mode_outlier.tolist() if len(mode_outlier) < len(df_outlier) else 'No mode'}")



 Effect of Outlier
New Data: [45, 52, 38, 47, 55, 41, 49, 150]
New Mean = 59.625
New Median = 48.0
New Mode = No mode


# Question 2: Population vs Sample and Variance Division (Beginner-Intermediate) 

Part A (4 points): Explain the difference between population and sample in this context. Why might the manager choose to work with a sample rather than the entire population? 

In [None]:

np.random.seed(42)
population_weights = np.random.normal(loc=100, scale=5, size=10000)


population_df = pd.DataFrame(population_weights, columns=["Weight (g)"])

# === Sample ===
sample_df = population_df.sample(n=50, random_state=1)

# === Statistics ===
population_mean = population_df["Weight (g)"].mean()
population_std = population_df["Weight (g)"].std()
sample_mean = sample_df["Weight (g)"].mean()
sample_std = sample_df["Weight (g)"].std()
# === Results ===
print("=== Population vs Sample Comparison ===")
print(f"Population size = {len(population_df)}")
print(f"Sample size = {len(sample_df)}\n")

print(f"Population Mean (μ) = {population_mean:.2f} g")
print(f"Population Std  (σ) = {population_std:.2f} g")
print(f"Sample Mean (x̄) = {sample_mean:.2f} g")
print(f"Sample Std  (s) = {sample_std:.2f} g")



=== Population vs Sample Comparison ===
Population size = 10000
Sample size = 50

Population Mean (μ) = 99.99 g
Population Std Dev (σ) = 5.02 g
Sample Mean (x̄) = 99.89 g
Sample Std Dev (s) = 4.79 g


# Question 3: Variables and Measurement Scales(Beginner-Intermediate)

Part B (5 points): The manager takes a sample of 8 chocolate bars with weights: 98, 102, 95, 105, 99, 103, 97, 101g. Calculate the sample variance using the formula with (n-1) in the denominator.

In [None]:
weights = [98, 102, 95, 105, 99, 103, 97, 101]

data = pd.Series(weights)

sample_variance = data.var(ddof=1)

print("Sample Variance:", sample_variance)


Sample Variance: 11.142857142857142


# Question 7: Comprehensive Application - Five Number Summary and Data Analysis (Advanced Intermediate)

Part A: Five Number Summary (10 points)

In [10]:

data = [8500, 12000, 6500, 15500, 9200, 11800, 7300, 13400, 10500, 8900,
        14200, 5800, 12600, 9800, 16500, 7800, 11200, 8400, 13800, 10200]


df = pd.Series(data).sort_values().reset_index(drop=True)

#  Five Number Summary 
minimum = df.min()
Q1 = df.quantile(0.25)   
median = df.median()        
Q3 = df.quantile(0.75)      
maximum = df.max()

IQR = Q3 - Q1



print("Sorted Data:")
print(df.to_list(), "\n")
print("Q1 (25%):", Q1)
print("Median (50%):", median)
print("Q3 (75%):", Q3)
print("Maximum:", maximum)
print("IQR:", IQR)


Sorted Data:
[5800, 6500, 7300, 7800, 8400, 8500, 8900, 9200, 9800, 10200, 10500, 11200, 11800, 12000, 12600, 13400, 13800, 14200, 15500, 16500] 

Q1 (25%): 8475.0
Median (50%): 10350.0
Q3 (75%): 12800.0
Maximum: 16500
IQR: 4325.0


Part B: Outlier Detection using IQR Method (8 points)

In [None]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


outliers = df[(df < lower_bound) | (df > upper_bound)]

# Display results
print(f"Q1 = {Q1}")
print(f"Q3 = {Q3}")
print(f"IQR = {IQR}")
print(f"Lower bound = {lower_bound}")
print(f"Upper bound = {upper_bound}")
print(f"Outliers in the dataset: {outliers.tolist()}")

Q1 = 8475.0
Q3 = 12800.0
IQR = 4325.0
Lower bound = 1987.5
Upper bound = 19287.5
Outliers in the dataset: []
