### 1. Create Synthetic data

In [2]:
import numpy as np

# Step 1 - Create synthetic dataset
np.random.seed(42) # fixed seed by which your result would be same

no_of_students = 500

# Random gender (0 = Male, 1 = Female)
gender = np.random.randint(0, 2, size=no_of_students)

# Random marks (0 to 100)
math = np.random.normal(65, 15, size = no_of_students).clip(0,100) 
# mean=65, std=15 (avg 65 ke aas paas) # .clip(0,100) ensures marks blw 0 & 100 only
science = np.random.normal(70, 12, size = no_of_students).clip(0,100)
english = np.random.normal(68, 10, size = no_of_students).clip(0,100)
sports = np.random.normal(60, 20, size = no_of_students).clip(0,100)

# Combine all subjects in a single matrix (students x 4 subjects)
scores = np.column_stack((math, science, english, sports)) # combine columns horizontally

print("Shape:", scores.shape)
print("Sample:\n", scores[:5])


Shape: (500, 4)
Sample:
 [[52.29809423 78.53151962 73.50052099 65.66575743]
 [42.27729163 75.33115974 58.31655545 54.82190053]
 [58.30227572 65.66840601 69.05375506 71.73387603]
 [77.84598191 83.91195764 54.65974505 50.50192687]
 [68.21140616 57.02724007 61.98632356 77.42594595]]


### 2. Basics Stats

In [8]:
# Mean, Median, Std
means = np.mean(scores, axis=0)
medians = np.median(scores, axis=0)
std = np.std(scores, axis=0)

print("Average marks:", means)
print("Median marks:", medians)
print("Standard deviation:", std)

Average marks: [64.8202601  71.29810564 68.64790757 59.78216158]
Median marks: [64.84835439 71.78209269 68.33043809 59.76247408]
Standard deviation: [14.88158102 11.82285525  9.97986295 19.07110078]


### 3. Total marks & Top Performers

In [12]:
# Total score per student
total = scores.sum(axis=1)
# Rank (argsort for ascending order)
rank = np.argsort(-total) # Negative for descending order

top5_idx = rank[:5]
print("\nTop 5 Students:" , top5_idx)


Top 5 Students: [124  51 186 225 246]


### 4. Broadcasting concept

In [13]:
# Suppose school gives +5 bonus marks to all subjects
bonus = np.array([5,5,5,5])
scores_bonus = scores + bonus # Broadcasting automatic
print("\nAfter bonus first row:", scores_bonus[0])


After bonus first row: [57.29809423 83.53151962 78.50052099 70.66575743]


### 5. Z-Score Normalization (standardization)

In [17]:
# Normalize scores so each subjects has (mean=0 and std=1)
# Z = (X - mean) / std
z_scores = (scores - means) / std
print("\nNormalized first row:",np.round(z_scores[0], 2))


Normalized first row: [-0.84  0.61  0.49  0.31]


### 6. Gender-wise analysis

In [19]:
boys = scores[gender == 0]
girls = scores[gender == 1]

print("\nBoys average:", np.mean(boys, axis=0))
print("\nGirls average:", np.mean(girls, axis=0))


Boys average: [64.82674596 71.13832872 67.96113494 59.85033835]

Girls average: [64.81407826 71.45039301 69.30248773 59.71718059]


### 7. Correlation blw subjects

In [20]:
corr_matrix = np.corrcoef(scores.T)
print("\nCorrelation matrix blw subjects:\n", np.round(corr_matrix, 2))


Correlation matrix blw subjects:
 [[ 1.   -0.01 -0.05  0.07]
 [-0.01  1.   -0.01 -0.03]
 [-0.05 -0.01  1.    0.01]
 [ 0.07 -0.03  0.01  1.  ]]


### 8. Linear Algebra - find weights for total marks prediction

In [24]:
# let total marks = 0.4*Math + 0.3*Science + 0.2*English + 0.1*Sports
# Let's verify with matrix multiplication
weights = np.array([0.4, 0.3, 0.2, 0.1]) # importance of each subjet
predicted_total = scores @ weights # matrix multiplication, # @ means multiply blw rows & weights
print("\nPredicted total marks sample:" , predicted_total[:5]) # predicted marks


Predicted total marks sample: [65.74537352 56.65576572 64.00557071 72.29412175 64.53259379]


### 9. Boolean Masking (filtering)

In [28]:
# Find students who scored > 85 in both Math & Science
mask = (math > 85) & (science > 85)
high_achievers = np.sum(mask)
print("\nStudents with >85 in both Science & Math:", high_achievers)
# Find students who scored > 80 in Math
math_top = (math>80)
math_topper = np.sum(math_top)
print("\nStudents with >80 in Maths:", math_topper)



Students with >85 in both Science & Math: 6

Students with >80 in Maths: 76


### 10. Practice task

### Lowest 10% students ka average nikalna

In [32]:

# Step 1: Find out Total marks 
total = scores.sum(axis=1)
# Step 2: Sort total marks
sorted_total = np.sort(total)
# Step 3: 10% student ka count
n_students = len(total)
lowest_count = int(0.10 * n_students)
# Step 4: Take Lowest 10% marks data 
lowest_10 = sorted_total[:lowest_count]
# Step 5: Avg of lowest 10%
lowest_avg = np.mean(lowest_10per)
# print result
print("\nLowest 10% students marks avg:\n", lowest_avg)


Lowest 10% students marks avg:
 210.71423324610964


### Top 10% aur bottom 10% ke marks compare karna

In [33]:
# Total marks per student
total = scores.sum(axis=1)
# Sort marks (ascending order)
sorted_total = np.sort(total)
# Calculate how many students are in 10%
n_students = len(total)
count_10 = int(0.10 * n_students)
# Lowest 10% and highest 10%
bottom_10 = sorted_total[:count_10]
top_10 = sorted_total[-count_10:]
# Find their averages
bottom_avg = np.mean(bottom_10)
top_avg = np.mean(top_10)

# Compare the two groups
print("Bottom 10% average marks:", round(bottom_avg, 2))
print("Top 10% average marks:", round(top_avg, 2))
print("Difference (Top - Bottom):", round(top_avg - bottom_avg, 2))

Bottom 10% average marks: 210.71
Top 10% average marks: 312.67
Difference (Top - Bottom): 101.96


### Z-Score Normalize karne ke baad ka average

In [40]:
# Z_score nikal lo
z_scores = (scores - means) / std
# Do mean check of every subject 
avg_after = np.mean(z_scores, axis=0)
print("Average (mean) after normalization:", np.round(avg_after, 2))

Average (mean) after normalization: [ 0. -0. -0. -0.]


### Student Pass Percentage (total >= 200)

In [48]:
# Step 1: find total marks
total = scores.sum(axis=1)

# Step 2: Find passing students 
passed = total >= 200

# Step 3: Count passed students
n_passed = np.sum(passed)

# Step 4: Total students count
n_total = len(total)

# Step 5: Pass % 
pass_percent = (n_passed / n_total) * 100

# Step 6: print result
print("Total Students:", n_total)
print("Passed Students:", n_passed)
print("Pass percentage:", round(pass_percent, 2),"%")


Total Students: 500
Passed Students: 493
Pass percentage: 98.6 %


### Random missing values fill by mean

In [58]:
# Step 1: dataset shape
print(scores.shape)

# Step 2: Introduce random missing values
import numpy as np
np.random.seed(42) # repeatable randomness

# Randomly choose 50 random (row,column) positions to make missing
n_missing = 50
rows = np.random.randint(0, scores.shape[0], n_missing) # Start, Stop, Size
cols = np.random.randint(0, scores.shape[1], n_missing)

# Replace those positions with NaN
scores_with_nan = scores.copy()
scores_with_nan[rows, cols] = np.nan

print("Random missing values added Successfully!")

# Step 3: Check Missing Values
print("Total missing Values:", np.isnan(scores_with_nan).sum())

# Step 4: Calculate mean of each column(NaN ignore karke)
col_means = np.nanmean(scores_with_nan, axis=0) # nanmean NaN values ko ignore karta h
print("Column means:", np.round(col_means, 2))

# Step 5: Missing values fill by mean
# find positions where values are NaN
inds = np.where(np.isnan(scores_with_nan))

# Replace NaN with mean of that column
scores_with_nan[inds] = np.take(col_means, inds[1])
print("Missing values filled with column means!")

# Step 6: Check again
print("Total missing values after filling:", np.isnan(scores_with_nan).sum())

(500, 4)
Random missing values added Successfully!
Total missing Values: 48
Column means: [64.69 71.37 68.63 59.82]
Missing values filled with column means!
Total missing values after filling: 0
