## Data Simulation

In [2]:
import numpy as np

# One seeded generator, reused by the simulation code, so a fresh run repeats the same draws.
SEED = 42
rng = np.random.default_rng(SEED)

In [3]:
n = 10000  # number of simulated people

# Ages: integers drawn from [18, 91), i.e. 18 through 90
ages = rng.integers(18, 91, n)

# Heights (cm): normal draw, limited to 150-200, kept to one decimal
heights = rng.normal(loc=170, scale=7, size=n).clip(150, 200).round(1)

# Weights (kg): weight implied by a fixed BMI at each height, plus normal noise,
# limited to 45-150 and kept to one decimal
bmi_mean = 24.0
height_m = heights / 100
weights = (bmi_mean * height_m ** 2 + rng.normal(0, 3.0, n)).clip(45, 150).round(1)

# Income: log-normal draw, limited to 10000-200000, kept to two decimals
incomes = rng.lognormal(mean=np.log(50000), sigma=0.9, size=n).clip(10000, 200000).round(2)

In [4]:
# Record layout: one structured element per simulated person.
dtype = [('Age', 'i4'), ('Height_cm', 'f4'), ('Weight_kg', 'f4'), ('Income', 'f8')]
data = np.empty(n, dtype=dtype)

# Fill every field from the simulated column in the same position;
# values are cast to the field's declared type on assignment.
for field_name, column in zip(data.dtype.names, (ages, heights, weights, incomes)):
    data[field_name] = column

print(data[:100])

[(24, 162.3, 60.4,  31263.85) (74, 166.2, 66.1,  50749.03)
 (65, 171.9, 69.8,  91443.  ) (50, 165.6, 68.3, 106980.41)
 (49, 168.1, 70.4,  66586.8 ) (80, 173.1, 66.4,  18906.83)
 (24, 167.6, 70.6,  10000.  ) (68, 171.6, 68.8,  42218.67)
 (32, 181.1, 79.4, 119842.62) (24, 168.8, 65.6,  74944.71)
 (56, 178.3, 77.4,  62168.77) (89, 173.9, 80.2,  49684.8 )
 (71, 159.7, 61.6,  66036.97) (73, 162.3, 61.8,  20436.88)
 (70, 164.2, 64. , 100907.54) (75, 164. , 60.1, 107093.13)
 (55, 169.4, 73.2,  36316.93) (27, 174.3, 73.9,  74093.65)
 (79, 178.8, 73.6,  10666.72) (50, 172.2, 68.1,  58795.55)
 (54, 166.9, 66.9,  44354.33) (45, 155.4, 60.8, 155295.95)
 (31, 170.5, 66.3,  41814.56) (85, 170.2, 72.9, 109269.91)
 (75, 171.3, 70.1,  35162.06) (65, 184.7, 79.1,  10000.  )
 (47, 178.7, 75.2,  40814.62) (78, 175.5, 79.2,  59686.42)
 (57, 163.3, 59.8,  57269.99) (50, 166.4, 64.3,  58054.28)
 (50, 169.3, 71.6,  31696.75) (34, 164.6, 65.7,  37362.52)
 (24, 173.5, 69.9,  18143.52) (58, 172.5, 74.3,  10000. 

In [5]:
# Write the four columns to CSV: a header row first, then one row per person.
csv_header = 'Age,Height_cm,Weight_kg,Income'
column_formats = ['%d', '%.1f', '%.1f', '%.2f']  # one format per column, in column order
rows = np.column_stack((ages, heights, weights, incomes))
np.savetxt(
    'dataSimlulation.csv',
    rows,
    fmt=column_formats,
    delimiter=',',
    header=csv_header,
    comments='',  # empty prefix so the header line is written without a leading '# '
)

## Basic Statistics

In [6]:
# Re-bind the working names to the fields of the structured array.
# Each name now carries the type declared for its field in `data`.
ages, heights, weights, incomes = (
    data[field] for field in ('Age', 'Height_cm', 'Weight_kg', 'Income')
)

In [9]:
# Columns in report order, paired with the label used in the printed lines.
labelled_columns = (
    ("Age", ages),
    ("Height", heights),
    ("Weight", weights),
    ("Income", incomes),
)
divider = "-" * 30

# Mean of every column
for label, values in labelled_columns:
    print(f"Mean {label}:", np.mean(values))

print(divider)

# Median of every column
for label, values in labelled_columns:
    print(f"Median {label}:", np.median(values))

print(divider)

# Spread, reported for height only: variance, then standard deviation
print("Height Variance:", np.var(heights))
print("Height Standard Deviation:", np.std(heights))

Mean Age: 53.7376
Mean Height: 169.9806
Mean Weight: 69.565796
Mean Income: 68214.606163
------------------------------
Median Age: 54.0
Median Height: 169.9
Median Weight: 69.5
Median Income: 50163.455
------------------------------
Height Variance: 49.34279
Height Standard Deviation: 7.024442


In [10]:
# Correlation between height and weight; corrcoef returns the full 2x2 matrix.
pair_label = "Height vs Weight"
corr_mtx = np.corrcoef(heights, weights)
print(f"Correlation matrix ({pair_label}):\n", corr_mtx)

# The off-diagonal entry is the height-weight coefficient.
height_weight_r = corr_mtx[0, 1]
print(f"Correlation coefficient ({pair_label}):", height_weight_r)

Correlation matrix (Height vs Weight):
 [[1.        0.8847048]
 [0.8847048 1.       ]]
Correlation coefficient (Height vs Weight): 0.8847047953910524


In [17]:
# Ten highest incomes
# NOTE(review): incomes are clipped at 200000 when simulated, so rows tied at that
# cap cannot be told apart by income; which of them land in the last ten is decided
# by how argsort orders equal values.
income_order = np.argsort(incomes)  # positions sorted by ascending income
top10_idx = income_order[-10:]      # last ten positions = ten largest incomes
print("Top 10 Incomes:\n", incomes[top10_idx])

# Other attributes of the same ten rows
for label, values in (("Ages", ages), ("Heights", heights), ("Weights", weights)):
    print(f"Corresponding {label}:", values[top10_idx])


Top 10 Incomes:
 [200000. 200000. 200000. 200000. 200000. 200000. 200000. 200000. 200000.
 200000.]
Corresponding Ages: [71 73 19 59 36 49 28 73 43 59]
Corresponding Heights: [159.2 156.6 184.3 173.  161.2 160.1 158.9 161.2 173.4 178.6]
Corresponding Weights: [62.1 56.7 82.5 72.  64.  64.7 57.2 65.9 71.  81.9]


In [18]:
# Stack the four columns side by side (one row per person, one column per variable)
# so that each statistic below is computed per column with axis=0.
summary = np.column_stack((ages, heights, weights, incomes))

means = np.mean(summary, axis=0)
medians = np.median(summary, axis=0)
# Named mins/maxs rather than min/max: binding those names would replace the Python
# built-ins for the rest of the kernel session and break any later min()/max() call.
mins = np.min(summary, axis=0)
maxs = np.max(summary, axis=0)

print("Means      :", means)
print("Medians    :", medians)
print("Min values :", mins)
print("Max values :", maxs)

Means      : [5.37376000e+01 1.69980610e+02 6.95657900e+01 6.82146062e+04]
Medians    : [   54.          169.8999939    69.5       50163.455    ]
Min values : [   18.   150.    45. 10000.]
Max values : [9.00000000e+01 1.99100006e+02 9.60000000e+01 2.00000000e+05]
