In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates

# Read the cleaned dataset into the DataFrame used by every plot below
df = pd.read_csv(filepath_or_buffer='../data/processed/cleaned_data.csv')

# 1. Parallel Coordinates Plot
# Draws the three selected columns as parallel axes, with lines coloured by Gender.
numerical_cols = ['Hours_Studied', 'Exam_Score', 'Attendance']
fig, ax = plt.subplots(figsize=(14, 8))
parallel_coordinates(
    df[[*numerical_cols, 'Gender']],
    class_column='Gender',
    colormap='viridis',
    ax=ax,
)
ax.set_title('Parallel Coordinates Plot')
ax.set_xlabel('Attributes')
ax.set_ylabel('Values')
ax.legend(loc='upper right')
fig.savefig('../reports/figures/parallel_coordinates.png')
plt.close(fig)  # release the figure so later plots start from a clean state

# 2. Correlation Heatmap
# Pairwise correlation (DataFrame.corr defaults) between the numeric columns,
# rendered as an annotated heatmap.
# 'number' selects every numeric dtype, so a column stored as e.g. int32 or
# float32 is not silently left out the way a hard-coded
# ['float64', 'int64'] list would leave it out.
numerical_data = df.select_dtypes(include='number')
correlation_matrix = numerical_data.corr()
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig('../reports/figures/correlation_heatmap.png')
plt.close(fig)  # release the figure so later plots start from a clean state
'''
Insight: Exam_Score is moderately correlated with Hours_Studied (~0.48) and Attendance (~0.48), 
indicating that students who study more and attend classes regularly tend to score higher.
'''

# 3. Bubble Plot for Pairwise Comparisons
# Hours studied vs. exam score; bubble colour encodes Gender and bubble size
# encodes Attendance.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='Hours_Studied',
    y='Exam_Score',
    hue='Gender',
    size='Attendance',
    sizes=(50, 500),
    palette='viridis',
    ax=ax,
)
ax.set_title('Bubble Plot: Hours Studied vs. Exam Score')
ax.set_xlabel('Hours Studied')
ax.set_ylabel('Exam Score')
# seaborn already attaches a legend covering both the hue (Gender) and the
# size (Attendance) encodings. Rebuilding it with title='Gender' would put the
# Attendance size entries under a "Gender" heading, so the automatic legend
# is kept unchanged.
fig.savefig('../reports/figures/bubble_plot.png')
plt.close(fig)  # release the figure so later plots start from a clean state
'''
Insight: Females slightly outperform males at higher score levels, as indicated by the distribution of bubbles.
'''

# 4. IQR-based Box Plots
# Drop rows whose Exam_Score falls more than 1.5 * IQR below the first
# quartile or above the third quartile.
Q1, Q3 = (df['Exam_Score'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence, upper_fence = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# The mask is negated (rather than testing "inside the fences") so that rows
# with a missing Exam_Score, for which both comparisons are False, are kept.
filtered_df = df.loc[~(df['Exam_Score'].lt(lower_fence) | df['Exam_Score'].gt(upper_fence))]

def _save_exam_score_boxplot(y_column, title, y_label, output_path):
    """Save a box plot with Exam_Score on the x-axis and ``y_column`` on the y-axis.

    Reads the outlier-filtered ``filtered_df`` and writes the figure to
    ``output_path``.
    """
    # NOTE(review): the titles read "Exam Scores by <factor>" while Exam_Score
    # is passed as x - confirm this orientation is the intended one.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Exam_Score', y=y_column, data=filtered_df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Exam Score')
    ax.set_ylabel(y_label)
    fig.savefig(output_path)
    plt.close(fig)  # release the figure so later plots start from a clean state


# Box plot for Exam_Score vs. Attendance
_save_exam_score_boxplot(
    'Attendance',
    'Exam Scores by Attendance',
    'Attendance (%)',
    '../reports/figures/boxplot_attendance.png',
)

# Box plot for Exam_Score vs. Hours_Studied
_save_exam_score_boxplot(
    'Hours_Studied',
    'Exam Scores by Hours Studied',
    'Hours Studied',
    '../reports/figures/boxplot_hours_studied.png',
)
'''
Insight: Higher attendance and more study hours are associated with higher exam scores,
as shown by the increasing median scores in the box plots.
'''

# 5. Violin Plots
# Exam score distribution per gender, with quartile lines drawn inside each violin.
sns.set(style="whitegrid")  # applied before the figure is created so it takes effect here
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x='Gender', y='Exam_Score', inner='quartile', ax=ax)
ax.set_title('Distribution of Exam Scores by Gender', fontsize=16)
ax.set_xlabel('Gender', fontsize=14)
ax.set_ylabel('Exam Score', fontsize=14)
fig.savefig('../reports/figures/violin_plot.png')
plt.close(fig)  # release the figure once it has been written to disk
'''
Insight: Females display greater variability in exam scores compared to males. 
Both genders have similar overall distributions, with females showing slightly higher scores at the upper end.
'''

Insight: Exam_Score is moderately correlated with Hours_Studied (~0.48) and Attendance (~0.48), indicating that students who study more and attend classes regularly tend to score higher.
Insight: Females slightly outperform males at higher score levels, as indicated by the distribution of bubbles.
Insight: Higher attendance and more study hours are associated with higher exam scores, as shown by the increasing median scores in the box plots.
Insight: Females display greater variability in exam scores compared to males. Both genders have similar overall distributions, with females showing slightly higher scores at the upper end.
