##### Import Libraries

In [18]:
import pandas as pd
import plotly.express as px
import numpy as np
import statsmodels
import nbformat

#### Function to calculate Overall Rating

In [19]:
def calculate_overall(row):
    """Return the weighted overall score for a student record.

    The two internal tests are divided by 40 and the assignment by 10 to
    turn them into percentages; the final exam mark is used unchanged.
    The components are then combined with weights 0.20 / 0.20 / 0.10 / 0.50.

    Parameters
    ----------
    row : mapping-like
        Anything that supports lookup by the keys 'internal_test1',
        'internal_test2', 'assignment_score' and 'final_exam_marks'
        and whose values support arithmetic (e.g. a dict or a pandas Series).

    Returns
    -------
    The weighted sum of the four components.
    """
    # Rescale the raw marks to percentages.
    internal_1_pct = (row['internal_test1'] / 40) * 100
    internal_2_pct = (row['internal_test2'] / 40) * 100
    assignment_pct = (row['assignment_score'] / 10) * 100

    # Apply the weight of each component.
    weighted_internal_1 = 0.20 * internal_1_pct
    weighted_internal_2 = 0.20 * internal_2_pct
    weighted_assignment = 0.10 * assignment_pct
    weighted_final_exam = 0.50 * row['final_exam_marks']

    return weighted_internal_1 + weighted_internal_2 + weighted_assignment + weighted_final_exam

#### Load Dataset

In [20]:
# Location of the marks CSV, relative to the notebook.
marks_csv_path = '../data/Final_Marks_Data.csv'

# Working copy: gets renamed columns and derived columns in later cells.
students_data = pd.read_csv(marks_csv_path)
# Second, independent read that keeps the original CSV headers (used for dataframe display).
st2 = pd.read_csv(marks_csv_path)

#### Changing Column Names for better readability

In [21]:
# Original CSV header -> short snake_case name used by the rest of the notebook.
column_renames = {
    'Student_ID': 'student_id',
    'Attendance (%)': 'attendance',
    'Internal Test 1 (out of 40)': 'internal_test1',
    'Internal Test 2 (out of 40)': 'internal_test2',
    'Assignment Score (out of 10)': 'assignment_score',
    'Daily Study Hours': 'daily_study_hours',
    'Final Exam Marks (out of 100)': 'final_exam_marks',
}
# Kept as a one-element list so `new_columns[0]` keeps working.
new_columns = [column_renames]
students_data = students_data.rename(columns=new_columns[0])

#### Overall Marks

In [22]:
# calculate_overall only performs key lookups and arithmetic, so passing the
# whole frame evaluates the formula column-wise in a single vectorised pass
# instead of calling the function once per row via DataFrame.apply(axis=1).
students_data['overall'] = calculate_overall(students_data)

#### Jitter Column (Aesthetic)

In [23]:
# Add a small random horizontal offset (within +/-0.05) to the study hours so
# that points sharing the same study-hours value do not sit exactly on top of
# each other when 'study_hours_jitter' is used as a plot axis.
# A seeded generator makes the jitter identical on every Restart & Run All.
JITTER_SEED = 42
jitter_rng = np.random.default_rng(JITTER_SEED)
students_data['study_hours_jitter'] = (
    students_data['daily_study_hours']
    + jitter_rng.uniform(-0.05, 0.05, size=len(students_data))
)

# Distinct un-jittered study-hour values in ascending order.
ticks = sorted(students_data['daily_study_hours'].unique())

#### Calculate if Pass/Fail

In [24]:
# Minimum values a student must reach on each criterion to pass.
MIN_FINAL_EXAM_MARKS = 50
MIN_ATTENDANCE = 75
MIN_OVERALL = 60

# A student passes only when all three criteria are met.
passed_mask = (
    (students_data['final_exam_marks'] >= MIN_FINAL_EXAM_MARKS)
    & (students_data['attendance'] >= MIN_ATTENDANCE)
    & (students_data['overall'] >= MIN_OVERALL)
)

# Default everyone to 'Failed', then flip the rows that satisfy every criterion.
# Whole-column assignment keeps the cell idempotent and also creates the
# 'status' column when the frame is empty.
students_data['status'] = 'Failed'
students_data.loc[passed_mask, 'status'] = 'Passed'

# Per-criterion failure counts. These are counted independently, so a student
# who misses several criteria is included in several counts.
fail_final_exam = int((students_data['final_exam_marks'] < MIN_FINAL_EXAM_MARKS).sum())
fail_attendance = int((students_data['attendance'] < MIN_ATTENDANCE).sum())
fail_overall = int((students_data['overall'] < MIN_OVERALL).sum())

#### Print Dataframe

In [25]:
# Bare last expression: the notebook displays the frame (original columns plus
# the derived 'overall', 'study_hours_jitter' and 'status' columns).
students_data

Unnamed: 0,student_id,attendance,internal_test1,internal_test2,assignment_score,daily_study_hours,final_exam_marks,overall,study_hours_jitter,status
0,S1000,84,30,36,7,3,72,76.0,2.977897,Passed
1,S1001,91,24,38,6,3,56,65.0,2.996218,Passed
2,S1002,73,29,26,7,3,56,62.5,2.967128,Failed
3,S1003,80,36,35,7,3,74,79.5,3.037952,Passed
4,S1004,84,31,37,8,3,66,75.0,3.039626,Passed
...,...,...,...,...,...,...,...,...,...,...
1995,S2995,82,31,28,6,2,52,61.5,2.041487,Passed
1996,S2996,78,38,27,7,2,57,68.0,2.044769,Passed
1997,S2997,78,30,33,9,2,61,71.0,2.003122,Passed
1998,S2998,82,29,40,8,3,59,72.0,3.033873,Passed


In [26]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
students_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   student_id          2000 non-null   object 
 1   attendance          2000 non-null   int64  
 2   internal_test1      2000 non-null   int64  
 3   internal_test2      2000 non-null   int64  
 4   assignment_score    2000 non-null   int64  
 5   daily_study_hours   2000 non-null   int64  
 6   final_exam_marks    2000 non-null   int64  
 7   overall             2000 non-null   float64
 8   study_hours_jitter  2000 non-null   float64
 9   status              2000 non-null   object 
dtypes: float64(2), int64(6), object(2)
memory usage: 156.4+ KB


In [27]:
# describe() row label -> display column header (one-element list, indexed with [0]).
summary_columns = [{'mean':'Mean', 'std':'Std Dev', 'min':'Min', 'max':'Max','50%':'Median'}]

# Summary statistics of the frame that still carries the original CSV headers:
# drop the count and quartile rows, then transpose so each described column
# becomes a row and each remaining statistic a column.
summary = (
    st2.describe()
    .drop(['count', '25%', '75%'])
    .T
    .rename(columns=summary_columns[0])
)
summary

Unnamed: 0,Mean,Std Dev,Min,Median,Max
Attendance (%),84.8915,7.758855,52.0,85.0,100.0
Internal Test 1 (out of 40),32.1155,4.563504,18.0,32.0,40.0
Internal Test 2 (out of 40),32.4645,4.522827,16.0,33.0,40.0
Assignment Score (out of 10),7.507,1.021015,4.0,8.0,10.0
Daily Study Hours,2.8235,0.608714,1.0,3.0,5.0
Final Exam Marks (out of 100),64.855,11.341265,25.0,65.0,100.0


#### Histograms

In [28]:
# Histogram of the Internal Test 1 marks.
internal_test1_hist_options = dict(
    x='internal_test1',
    nbins=40,
    title='Marks Distribution Histogram (Internal Test 1)',
    labels={'internal_test1': 'Marks Obtained'},
    color_discrete_sequence=['lightblue'],
    width=1000,
)
fig1 = px.histogram(students_data, **internal_test1_hist_options)
fig1.show()

In [29]:
# Histogram of the Internal Test 2 marks.
internal_test2_hist_options = dict(
    x='internal_test2',
    nbins=40,
    title='Marks Distribution Histogram (Internal Test 2)',
    labels={'internal_test2': 'Marks Obtained'},
    color_discrete_sequence=['orange'],
    width=1000,
)
fig2 = px.histogram(students_data, **internal_test2_hist_options)
fig2.show()

In [30]:
# Reshape to long format (one row per exam/mark pair) so both internal tests
# can be drawn in a single histogram, coloured by exam.
exam_display_names = {'internal_test1': 'Internal Test 1',
                      'internal_test2': 'Internal Test 2'}
grade_comparsion = pd.melt(students_data, value_vars=list(exam_display_names))
# Relabel only the 'variable' column; a frame-wide replace would also scan
# the numeric 'value' column for these strings.
grade_comparsion['variable'] = grade_comparsion['variable'].replace(exam_display_names)

# Overlaid, semi-transparent histograms of the two tests.
# `labels` only lists columns that exist in the melted frame.
fig3 = px.histogram(grade_comparsion,
                    x='value',
                    color='variable',
                    opacity=0.7,
                    nbins=40,
                    barmode='overlay',
                    labels={'value': 'Marks Obtained',
                            'variable': 'Exams'},
                    title='Comparison of Internal Test 1 and Internal Test 2 Marks',
                    color_discrete_sequence=['lightblue', 'orange'],
                    width=1000)

# Move the legend inside the plot area (top-right) with a transparent background.
fig3.update_layout(legend={'x': 0.95,
                           'y': 0.95,
                           'xanchor': 'right',
                           'yanchor': 'top',
                           'bgcolor': 'rgba(0,0,0,0)'})
fig3.show()

#### Pass/Fail Distribution

This bar chart shows the number of students who passed and failed, based on the passing criteria below:

Passing Criteria:<br>
<br>
Final Exam Marks ≥ 50<br>
Attendance ≥ 75%<br>
Overall ≥ 60%<br>
<div align='center'> Overall Formula: </div>
<div align='center'> Overall = 0.20*IT1% + 0.20*IT2% + 0.10*Assign% + 0.50*Final%</div>

Students meeting all of the criteria are considered "Passed", while those who miss any of them are considered "Failed".


In [31]:
# Bar chart with one bar per status; the height is the number of rows with that status.
status_order = ['Passed', 'Failed']
status_counts = [
    len(students_data[students_data['status'] == status_name])
    for status_name in status_order
]

fig4 = px.bar(
    x=status_order,
    y=status_counts,
    color=list(status_order),
    labels={'x': 'Status', 'y': 'Number of Students', 'color': 'Status'},
    title='Number of Students Passed vs Failed',
    color_discrete_sequence=['lightblue', 'red'],
)
fig4.show()

#### Failure Reasons
##### Note: A student can fail for multiple reasons, so the sum of failure reasons may exceed the total number of failed students.

In [32]:
# Report how many students missed each passing criterion, one line per criterion.
print(
    f'- Number of students who failed due to Final Exam Marks < 50: {fail_final_exam}',
    f'- Number of students who failed due to Attendance < 75%: {fail_attendance}',
    f'- Number of students who failed due to Overall < 60%: {fail_overall}',
    sep='\n',
)


- Number of students who failed due to Final Exam Marks < 50: 188
- Number of students who failed due to Attendance < 75%: 183
- Number of students who failed due to Overall < 60%: 219


#### Scatter Plot

In [33]:
# Overall score against jittered daily study hours, coloured by pass/fail status.
# Colours are pinned per status with an explicit map: a positional colour
# sequence is assigned in order of first appearance in the data, so the colours
# would swap if the first row happened to belong to a failed student.
# trendline='ols' adds an ordinary-least-squares fit line.
fig5 = px.scatter(students_data,
                  x='study_hours_jitter',
                  y='overall',
                  color='status',
                  opacity=0.7,
                  title='Daily Study Hours vs Overall Marks',
                  labels={'study_hours_jitter': 'Daily Study Hours', 'overall': 'Overall Score'},
                  color_discrete_map={'Passed': 'lightblue', 'Failed': 'red'},
                  trendline='ols')
fig5.show()