# DS623 PE08 Sumary Statistics of a Dataframe

Student: Verónica Elze

## Imports

In [1]:
import pandas as pd
import numpy as np

## Create Dataframe

In [2]:
# Set seed for reproducibility
np.random.seed(42)

# Create realistic sample data of women
age = np.random.randint(45, 56, size=10)          # Ages 45 to 55
height = np.random.randint(58, 70, size=10)       # Heights 58 to 69 inches
weight = np.random.randint(135, 176, size=10)     # Weights 135 to 175 lbs

# Create DataFrame
df = pd.DataFrame({
    'height (in)': height,
    'weight (lbs)': weight,
    'age (yrs)': age
})

df.head()

Unnamed: 0,height (in),weight (lbs),age (yrs)
0,68,158,51
1,65,164,48
2,62,172,55
3,61,136,52
4,65,155,49


## Mean Function

In [3]:
# Function to compute mean
def compute_mean(series):
    return sum(series) / len(series)

## Variance Function

In [4]:
# Function to compute variance (sample variance with ddof=1)
def compute_variance(series):
    mean = compute_mean(series)
    
    # Sanity check: print squared differences
    print(f"\nSanity Check for {series.name}:\nMean = {mean}")
    for x in series:
        diff_squared = (x - mean) ** 2
        print(f"Value: {x}, (x - mean)^2: {diff_squared}")
    
    return sum((x - mean) ** 2 for x in series) / (len(series) - 1)

## Computation of Mean & Variance

In [5]:
# Compute and display results
results = {}
for column in df.columns:
    mean = compute_mean(df[column])
    var = compute_variance(df[column])
    results[column] = {'mean': round(mean, 4), 'variance': round(var, 4)}


Sanity Check for height (in):
Mean = 63.0
Value: 68, (x - mean)^2: 25.0
Value: 65, (x - mean)^2: 4.0
Value: 62, (x - mean)^2: 1.0
Value: 61, (x - mean)^2: 4.0
Value: 65, (x - mean)^2: 4.0
Value: 65, (x - mean)^2: 4.0
Value: 60, (x - mean)^2: 9.0
Value: 63, (x - mean)^2: 0.0
Value: 62, (x - mean)^2: 1.0
Value: 59, (x - mean)^2: 16.0

Sanity Check for weight (lbs):
Mean = 157.4
Value: 158, (x - mean)^2: 0.35999999999999316
Value: 164, (x - mean)^2: 43.559999999999924
Value: 172, (x - mean)^2: 213.15999999999983
Value: 136, (x - mean)^2: 457.96000000000026
Value: 155, (x - mean)^2: 5.760000000000027
Value: 167, (x - mean)^2: 92.1599999999999
Value: 146, (x - mean)^2: 129.96000000000012
Value: 156, (x - mean)^2: 1.960000000000016
Value: 159, (x - mean)^2: 2.559999999999982
Value: 161, (x - mean)^2: 12.959999999999958

Sanity Check for age (yrs):
Mean = 51.3
Value: 51, (x - mean)^2: 0.08999999999999829
Value: 48, (x - mean)^2: 10.889999999999981
Value: 55, (x - mean)^2: 13.69000000000002
V

## Output

In [6]:
print("Summary Statistics (using formulas and rounded to 4 decimal places):")
for var in results:
    print(f"{var.capitalize()} - Mean: {results[var]['mean']}, Variance: {results[var]['variance']}")

Summary Statistics (using formulas and rounded to 4 decimal places):
Height (in) - Mean: 63.0, Variance: 7.5556
Weight (lbs) - Mean: 157.4, Variance: 106.7111
Age (yrs) - Mean: 51.3, Variance: 7.7889


## Covariance Matrix (optional)

In [7]:
# Function to compute covariance matrix manually
def compute_covariance_matrix(df):
    columns = df.columns
    cov_matrix = np.zeros((len(columns), len(columns)))
    for i in range(len(columns)):
        for j in range(len(columns)):
            xi = df[columns[i]]
            xj = df[columns[j]]
            mean_xi = compute_mean(xi)
            mean_xj = compute_mean(xj)
            cov = sum((xi[k] - mean_xi) * (xj[k] - mean_xj) for k in range(len(xi))) / (len(xi) - 1)
            cov_matrix[i][j] = cov
    return pd.DataFrame(cov_matrix, columns=columns, index=columns)

In [8]:
# Covariance matrix
cov_matrix = compute_covariance_matrix(df)
formatted_cov_matrix = cov_matrix.round(4)

print("\nCovariance Matrix (rounded to 4 decimal places):")
print(cov_matrix)


Covariance Matrix (rounded to 4 decimal places):
              height (in)  weight (lbs)  age (yrs)
height (in)      7.555556      8.555556  -4.555556
weight (lbs)     8.555556    106.711111   0.866667
age (yrs)       -4.555556      0.866667   7.788889
