In [3]:
import pandas as pd

# Students table
students = pd.DataFrame({
    "ID": [1, 2, 3, 4, 5],
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"]
})

# Marks table
marks = pd.DataFrame({
    "student_id": [1, 2, 2, 3, 4, 5, 5],
    "Subject": ["Math", "Math", "English", "Math", "English", "Math", "English"],
    "Score": [85, 90, 78, 92, 88, 76, 80]
})

# Teachers table
teachers = pd.DataFrame({
    "Subject": ["Math", "English", "Science"],
    "Teacher": ["Mr. Smith", "Ms. Johnson", "Dr. Lee"]
})


In [3]:
# agg()- Summaries per group
# Show average, min score per subject

In [4]:
marks.groupby("Subject")["Score"].agg(
    avg="mean",
    top="max",
    low="min")


Unnamed: 0_level_0,avg,top,low
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
English,82.0,88,78
Math,85.75,92,76


In [5]:
# transform()- Add group info back to rows
# Add each student's subject average next to their score

In [6]:
marks["avg_subject_score"]= marks.groupby("Subject")["Score"].transform("mean")

In [9]:
marks

Unnamed: 0,student_id,Subject,Score,avg_subject_score
0,1,Math,85,85.75
1,2,Math,90,85.75
2,2,English,78,82.0
3,3,Math,92,85.75
4,4,English,88,82.0
5,5,Math,76,85.75
6,5,English,80,82.0


In [10]:
# apply() - Custom group logic
# Show only top 2 scorers per subject

In [11]:
marks.groupby("Subject").apply(lambda x: x.nlargest(2, "Score"))

  marks.groupby("Subject").apply(lambda x: x.nlargest(2, "Score"))


Unnamed: 0_level_0,Unnamed: 1_level_0,student_id,Subject,Score,avg_subject_score
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
English,4,4,English,88,82.0
English,6,5,English,80,82.0
Math,3,3,Math,92,85.75
Math,1,2,Math,90,85.75


In [15]:
marks.groupby("Subject", group_keys=True, include_groups=False).apply(lambda x: x.nlargest(2, "Score"))

TypeError: DataFrame.groupby() got an unexpected keyword argument 'include_groups'

In [18]:
marks.groupby("Subject", group_keys=True).apply(
    lambda x: x.nlargest(2, "Score")[["Subject", "student_id", "Score"]]
).reset_index(drop=True)

  marks.groupby("Subject", group_keys=True).apply(


Unnamed: 0,Subject,student_id,Score
0,English,4,88
1,English,5,80
2,Math,3,92
3,Math,2,90


In [19]:
#Aggregations and Advanced Grouping

In [20]:
import pandas as pd

students = pd.DataFrame({
    "ID": [1, 2, 3, 4, 5],
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "City": ["Delhi", "Mumbai", "Delhi", "Bangalore", "Mumbai"]
})

marks = pd.DataFrame({
    "student_id": [1, 2, 2, 3, 4, 5, 5],
    "Subject": ["Math", "Math", "Science", "Math", "Science", "Math", "Science"],
    "Score": [85, 90, 78, 92, 88, 76, 95]
})


In [21]:
#1) Find the average score per subject

In [22]:
marks.groupby("Subject")["Score"].mean()

Subject
Math       85.75
Science    87.00
Name: Score, dtype: float64

In [23]:
marks.groupby("Subject")["Score"].agg(["min","max"])

Unnamed: 0_level_0,min,max
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1
Math,76,92
Science,78,95


In [24]:
#3) For each city, find the average scores of its students

In [25]:
sc= pd.merge(students, marks, left_on="ID", right_on="student_id", how ="inner")

In [26]:
sc

Unnamed: 0,ID,Name,City,student_id,Subject,Score
0,1,Alice,Delhi,1,Math,85
1,2,Bob,Mumbai,2,Math,90
2,2,Bob,Mumbai,2,Science,78
3,3,Charlie,Delhi,3,Math,92
4,4,David,Bangalore,4,Science,88
5,5,Eva,Mumbai,5,Math,76
6,5,Eva,Mumbai,5,Science,95


In [28]:
sc.groupby("City")["Score"].mean()

City
Bangalore    88.00
Delhi        88.50
Mumbai       84.75
Name: Score, dtype: float64

In [29]:
# Find the student with highest score in each subject

In [32]:
marks.loc[marks.groupby("Subject")["Score"].idxmax()]

Unnamed: 0,student_id,Subject,Score
3,3,Math,92
6,5,Science,95


In [33]:
#Count how many subjects each students has marks in

In [34]:
marks.groupby("student_id")["Subject"].count()

student_id
1    1
2    2
3    1
4    1
5    2
Name: Subject, dtype: int64

In [35]:
# Mini Quiz

In [37]:
# Q1.

# Find the average score in each subject, and rename the result column as "Average_Score".

In [47]:
tm = marks.groupby("Subject")["Score"].mean().reset_index(name="Average_Score")


In [50]:
tm = marks.groupby("Subject")["Score"].agg(Average_Score="mean").reset_index()



In [51]:
tm

Unnamed: 0,Subject,Average_Score
0,Math,85.75
1,Science,87.0


In [52]:
#Q2) Find the highest score in each city

In [58]:
sm= pd.merge(students, marks, left_on="ID", right_on="student_id", how= "inner")
sm.groupby("City")["Score"].max().reset_index(name="Max_Score")

Unnamed: 0,City,Max_Score
0,Bangalore,88
1,Delhi,92
2,Mumbai,95


In [59]:
#Q3) List the students how scored the minimum marks in Science

In [70]:
sm.loc[sm[sm["Subject"] == "Science"]["Score"].idxmin(), "Name"]


'Bob'

In [71]:
# Get the student(s) with minimum score in Science
science_min = sm[sm["Subject"] == "Science"].nsmallest(1, "Score")
print(science_min)


   ID Name    City  student_id  Subject  Score
2   2  Bob  Mumbai           2  Science     78


In [72]:
# Filter Science students
science = sm[sm["Subject"] == "Science"]

# Find the minimum score in Science
min_score = science["Score"].min()

# Get all students with that score
science_min_all = science[science["Score"] == min_score]
print(science_min_all)


   ID Name    City  student_id  Subject  Score
2   2  Bob  Mumbai           2  Science     78


In [73]:
# Q4.

# Count how many students have scored marks in more than 1 subject.

In [90]:
m1 = marks.groupby("student_id")["Subject"].count().reset_index(name="Count")
m1= m1[m1["Count"]>1]
len(m1)

2

In [91]:
m1 = marks["student_id"].value_counts()
m1[m1 > 1].count()


np.int64(2)

In [92]:
marks.groupby("student_id")["Subject"].nunique().gt(1).sum()


np.int64(2)

In [94]:
# Q5.

# Show the top 2 scorers in each subject.

In [95]:
marks.groupby("Subject").apply(lambda x: x.nlargest(2, "Score"))

  marks.groupby("Subject").apply(lambda x: x.nlargest(2, "Score"))


Unnamed: 0_level_0,Unnamed: 1_level_0,student_id,Subject,Score
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Math,3,3,Math,92
Math,1,2,Math,90
Science,6,5,Science,95
Science,4,4,Science,88


In [96]:
marks.sort_values(["Subject", "Score"], ascending=[True, False]) \
     .groupby("Subject").head(2)


Unnamed: 0,student_id,Subject,Score
3,3,Math,92
1,2,Math,90
6,5,Science,95
4,4,Science,88


In [4]:
marks.groupby("Subject")["Score"].agg(
    avg_score="mean", 
    highest="max", 
    lowest="min"
)


Unnamed: 0_level_0,avg_score,highest,lowest
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
English,82.0,88,78
Math,85.75,92,76


In [5]:
marks.groupby("Subject").agg(
    avg_score=("Score", "mean"),     # mean of Score
    highest=("Score", "max"),        # max of Score
    lowest=("Score", "min"),         # min of Score
    student_count=("student_id", "count")   # count students
)


Unnamed: 0_level_0,avg_score,highest,lowest,student_count
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
English,82.0,88,78,3
Math,85.75,92,76,4


In [6]:
marks.groupby("Subject").agg(
    avg_score=("Score","mean"),
    highest=("Score","max"),
    lowest=("Score","min"),
    student_count=("student_id","count")
).sort_values("avg_score", ascending=False)

Unnamed: 0_level_0,avg_score,highest,lowest,student_count
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Math,85.75,92,76,4
English,82.0,88,78,3


In [7]:
# Let's merge the subject summary with the teachers table to see which teacher teaches which
#subject along with averages.

In [8]:
subject_summary= marks.groupby("Subject").agg(
    avg_score=("Score","mean"),
    highest=("Score","max"),
    lowest=("Score","min"),
    student_count=("student_id","count")
).reset_index()

In [9]:
subject_teacher_summary= pd.merge(subject_summary, teachers, on="Subject", how="left")

In [10]:
subject_teacher_summary

Unnamed: 0,Subject,avg_score,highest,lowest,student_count,Teacher
0,English,82.0,88,78,3,Ms. Johnson
1,Math,85.75,92,76,4,Mr. Smith


In [11]:
subject_teacher_summary.sort_values("avg_score", ascending=False)

Unnamed: 0,Subject,avg_score,highest,lowest,student_count,Teacher
1,Math,85.75,92,76,4,Mr. Smith
0,English,82.0,88,78,3,Ms. Johnson


In [14]:
weakest_subject = subject_teacher_summary.loc[
    [subject_teacher_summary["avg_score"].idxmin()]
    ]

In [15]:
weakest_subject

Unnamed: 0,Subject,avg_score,highest,lowest,student_count,Teacher
0,English,82.0,88,78,3,Ms. Johnson


In [16]:
#How to join this with the students table to list all students who scored below the subject
# average?

In [17]:
# #Steps
# 1) Find weakest_subject + its average
# 2) Extract the subject name and avg
# 3) Filter marks for that subject and below avg
# 4) Join Students to get names


In [18]:
weakest_subject = subject_teacher_summary.loc[
    [subject_teacher_summary["avg_score"].idxmin()]
]

weakest_subject


Unnamed: 0,Subject,avg_score,highest,lowest,student_count,Teacher
0,English,82.0,88,78,3,Ms. Johnson


In [26]:
sub= weakest_subject["Subject"].iloc[0]
avg= weakest_subject["avg_score"].iloc[0]

In [29]:
weak_scores= marks[(marks["Subject"] == sub) & (marks["Score"] < avg)]


In [30]:
weak_scores

Unnamed: 0,student_id,Subject,Score
2,2,English,78
6,5,English,80


In [35]:
weak_students= pd.merge(weak_scores, students, left_on="student_id", right_on="ID")
weak_students

Unnamed: 0,student_id,Subject,Score,ID,Name
0,2,English,78,2,Bob
1,5,English,80,5,Eva


In [36]:
weak_students = weak_students[["ID", "Subject", "Score", "Name"]]

In [38]:
weak_students

Unnamed: 0,ID,Subject,Score,Name
0,2,English,78,Bob
1,5,English,80,Eva


In [40]:
marks.groupby("Subject")["Score"].rank(method="dense", ascending=False)
marks


Unnamed: 0,student_id,Subject,Score
0,1,Math,85
1,2,Math,90
2,2,English,78
3,3,Math,92
4,4,English,88
5,5,Math,76
6,5,English,80


In [44]:
marks["Rank"] = marks.groupby("Subject")["Score"].rank(method="dense", ascending=False)

In [42]:
marks

Unnamed: 0,student_id,Subject,Score,Rank
0,1,Math,85,3.0
1,2,Math,90,2.0
2,2,English,78,3.0
3,3,Math,92,1.0
4,4,English,88,1.0
5,5,Math,76,4.0
6,5,English,80,2.0


In [45]:
import pandas as pd

data = {
    "Name": ["Amit", "Bina", "Chetan", "Divya", "Esha", "Farhan"],
    "Score": [95, 87, 95, 76, 87, 92]
}
df = pd.DataFrame(data)
print(df)

     Name  Score
0    Amit     95
1    Bina     87
2  Chetan     95
3   Divya     76
4    Esha     87
5  Farhan     92


In [54]:
df["Rank"] = df["Score"].rank(method="dense", ascending=False)
df.sort_values("Rank",ascending=True)

Unnamed: 0,Name,Score,Rank
0,Amit,95,1.0
2,Chetan,95,1.0
5,Farhan,92,2.0
1,Bina,87,3.0
4,Esha,87,3.0
3,Divya,76,4.0


In [55]:
df["Rank"] = df["Score"].rank(ascending=False)


In [56]:
df

Unnamed: 0,Name,Score,Rank
0,Amit,95,1.5
1,Bina,87,4.5
2,Chetan,95,1.5
3,Divya,76,6.0
4,Esha,87,4.5
5,Farhan,92,3.0


In [57]:
# Rank the student using method="first"


In [60]:
df["Rank"] = df["Score"].rank(method="min", ascending=False)
df


Unnamed: 0,Name,Score,Rank
0,Amit,95,1.0
1,Bina,87,4.0
2,Chetan,95,1.0
3,Divya,76,6.0
4,Esha,87,4.0
5,Farhan,92,3.0


In [65]:
df["Rank"] = df["Score"].rank(method="max", ascending=False)
print(df)

     Name  Score  Rank
0    Amit     95   2.0
1    Bina     87   5.0
2  Chetan     95   2.0
3   Divya     76   6.0
4    Esha     87   5.0
5  Farhan     92   3.0
