# Libraries

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Data

In [50]:
# Read the dataset from the CSV file (path is relative to the working
# directory) and display the resulting frame as the cell output.
csv_path = 'xAPI-Edu-Data.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,F,Jordan,Jordan,MiddleSchool,G-08,A,Chemistry,S,Father,5,4,5,8,No,Bad,Above-7,L
476,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,F,Father,50,77,14,28,No,Bad,Under-7,M
477,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,S,Father,55,74,25,29,No,Bad,Under-7,M
478,F,Jordan,Jordan,MiddleSchool,G-08,A,History,F,Father,30,17,14,57,No,Bad,Above-7,L


In [51]:
# Show the column labels of the loaded frame so later cells can reference
# them with their exact spelling (e.g. 'NationalITy', 'VisITedResources').
data.columns

Index(['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID',
       'SectionID', 'Topic', 'Semester', 'Relation', 'raisedhands',
       'VisITedResources', 'AnnouncementsView', 'Discussion',
       'ParentAnsweringSurvey', 'ParentschoolSatisfaction',
       'StudentAbsenceDays', 'Class'],
      dtype='object')

## Statistical Summary

### Numerical Summary

In [52]:
# Keep only the numeric columns, then show their descriptive statistics
# transposed so each column becomes one row of the summary table.
numerical = data.select_dtypes(include='number')
numerical.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
raisedhands,480.0,46.775,30.779223,0.0,15.75,50.0,75.0,100.0
VisITedResources,480.0,54.797917,33.080007,0.0,20.0,65.0,84.0,99.0
AnnouncementsView,480.0,37.91875,26.611244,0.0,14.0,33.0,58.0,98.0
Discussion,480.0,43.283333,27.637735,1.0,20.0,39.0,70.0,99.0


### Categorical Summary

In [53]:
# Keep only the category/object columns, then show count, number of unique
# values, most frequent value and its frequency, one row per column.
categorical = data.select_dtypes(include=['category', 'object'])
categorical.describe().transpose()

Unnamed: 0,count,unique,top,freq
gender,480,2,M,305
NationalITy,480,14,KW,179
PlaceofBirth,480,14,KuwaIT,180
StageID,480,3,MiddleSchool,248
GradeID,480,10,G-02,147
SectionID,480,3,A,283
Topic,480,12,IT,95
Semester,480,2,F,245
Relation,480,2,Father,283
ParentAnsweringSurvey,480,2,Yes,270


## Missing Values Check

In [54]:
# Number of missing values in each column.
data.isna().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

# Exploratory Data Analysis (EDA)

## Possible Questions for These Features

### Demographic Insight
1. What is the distribution of students by gender?
2. What is the distribution of students by nationality?
3. What is the relationship between Place of Birth and Nationality?

### Academic Performance
1. How does academic performance vary across different stages (StageID)?
2. How does grade level (GradeID) impact academic performance (Class)?
3. How do engagement metrics like raised hands, visited resources, and announcements viewed relate to academic performance?
4. How do topics (Topic) affect academic performance and engagement?

### Parental Influence
1. How does parental involvement (ParentAnsweringSurvey) relate to student performance?
2. What is the relationship between parental satisfaction with the school (ParentschoolSatisfaction) and student performance?

### Attendance and Absence
1. How do student absences (StudentAbsenceDays) impact academic performance?
2. How does student attendance (SectionID) relate to engagement metrics like raised hands and visited resources?

### Semester Performance
1. How does academic performance vary between semesters?

## Target Distribution

In [55]:
# Pie chart of the target variable: one slice per distinct value of 'Class',
# sized by the number of rows carrying that value.
pie = px.pie(data,
             names='Class',
             title='Class Distribution (Piechart)',
             labels={'Class': 'Class Label'},
             color_discrete_sequence=px.colors.qualitative.Plotly,
             hover_data=['Class'])

# Show percentage and label on every slice and outline slices in black.
# The former `pull` argument was removed: it was built per data row against a
# placeholder class name that never matched, so it was always all zeros.
pie.update_traces(textinfo='percent+label',
                  marker=dict(line=dict(color='#000000', width=2)))

pie.update_layout(title_font_size=24,
                  legend_title_text='Class Categories',
                  margin=dict(t=50, b=50, l=50, r=50),
                  # 'Arial' (previously misspelled as 'Atial').
                  font=dict(family='Arial', size=14, color='darkblue'))

pie.show()

## Demographic Insight

In [56]:
# --- Frequency tables feeding the three demographic charts -----------------
# rename_axis(...).reset_index(name=...) names both output columns explicitly
# instead of overwriting `.columns` by position afterwards.
gender_count = (
    data['gender']
    .value_counts()
    .rename_axis('gender')
    .reset_index(name='count')
)

student_nationality = (
    data['NationalITy']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

# One row per (place of birth, nationality) pair with its number of students.
nationality_birth = (
    data.groupby(['PlaceofBirth', 'NationalITy'])
    .size()
    .reset_index(name='Count')
)

# --- Charts ----------------------------------------------------------------
gender_chart = px.bar(gender_count, x='gender', y='count',
                      title='Gender Distribution',
                      labels={'gender': 'Gender', 'count': 'Count'},
                      color='gender', text='count')

# The count axis is labelled 'Count' (it was mislabelled 'Country').
nationality = px.bar(student_nationality, x='country', y='count',
                     title='Nationality Distribution',
                     labels={'country': 'Country', 'count': 'Count'},
                     color='country', text='count')

# Stacked bars: one bar per place of birth, split by nationality.
nationality_birth_bar = px.bar(nationality_birth, x='PlaceofBirth', y='Count',
                               color='NationalITy',
                               title='Place of Birth vs Nationality',
                               labels={'Count': 'Number of Individuals',
                                       'PlaceofBirth': 'Place of Birth',
                                       'NationalITy': 'Nationality'},
                               barmode='stack')

gender_chart.show()
nationality.show()
nationality_birth_bar.show()

## Academic Performance

In [59]:
# Number of students for every (stage, class) combination, drawn as bars
# stacked by class.
perform_stages = (
    data.groupby(['StageID', 'Class'])
    .size()
    .reset_index(name='Count')
)
perform_stages_bar = px.bar(
    perform_stages,
    x='StageID',
    y='Count',
    color='Class',
    barmode='stack',
    title='Stage vs Performance',
    labels=dict(Count='Number of Individuals', StageID='Stages'),
)

# Same breakdown, grouped by grade level instead of stage.
grade_performance = (
    data.groupby(['GradeID', 'Class'])
    .size()
    .reset_index(name='Count')
)
grade_performance_bar = px.bar(
    grade_performance,
    x='GradeID',
    y='Count',
    color='Class',
    barmode='stack',
    title='Distribution of Class Categories by Grade Level',
    labels=dict(Count='Number of Students', GradeID='Grade Level'),
)

perform_stages_bar.show()
grade_performance_bar.show()