# Student Inferential Analysis

In [39]:
# Libraries
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, f_oneway, chi2_contingency, pearsonr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Dataset
I am going to analyze [student_performance_data.csv](https://www.kaggle.com/datasets/waqi786/student-performance-dataset).

In [3]:
# Load the student performance dataset (the CSV is read from the working directory)
df = pd.read_csv('student_performance_data.csv')

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  500 non-null    int64  
 1   Gender                     500 non-null    object 
 2   Age                        500 non-null    int64  
 3   StudyHoursPerWeek          500 non-null    int64  
 4   AttendanceRate             500 non-null    float64
 5   GPA                        500 non-null    float64
 6   Major                      500 non-null    object 
 7   PartTimeJob                500 non-null    object 
 8   ExtraCurricularActivities  500 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 35.3+ KB
None


In [4]:
# Preview the first rows as plain text (head() defaults to 5 rows)
print(df.head())
# Display the frame keyed by StudentID. set_index returns a new DataFrame and
# the result is not assigned, so df itself keeps its original RangeIndex.
df.set_index(keys='StudentID')

   StudentID  Gender  Age  StudyHoursPerWeek  AttendanceRate   GPA      Major  \
0          1    Male   24                 37           90.75  3.47       Arts   
1          2  Female   22                 37           74.90  2.32  Education   
2          3    Male   22                 10           53.36  2.38   Business   
3          4    Male   24                 10           70.26  3.46    Science   
4          5    Male   18                 19           74.87  2.31  Education   

  PartTimeJob ExtraCurricularActivities  
0         Yes                        No  
1          No                        No  
2          No                        No  
3         Yes                        No  
4         Yes                        No  


Unnamed: 0_level_0,Gender,Age,StudyHoursPerWeek,AttendanceRate,GPA,Major,PartTimeJob,ExtraCurricularActivities
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Male,24,37,90.75,3.47,Arts,Yes,No
2,Female,22,37,74.90,2.32,Education,No,No
3,Male,22,10,53.36,2.38,Business,No,No
4,Male,24,10,70.26,3.46,Science,Yes,No
5,Male,18,19,74.87,2.31,Education,Yes,No
...,...,...,...,...,...,...,...,...
496,Male,22,37,76.61,2.97,Science,No,No
497,Male,23,11,56.29,3.20,Science,No,No
498,Female,20,6,56.64,3.20,Science,No,Yes
499,Male,22,18,57.18,2.05,Business,No,Yes


## 2. Hypotheses
1. Students with part-time jobs have different GPA than those without (t-test)
2. GPA differs across different majors (ANOVA)
3. There's an association between gender and having a part-time job (Chi Square)
4. Study hours per week is positively correlated with GPA (Correlation)

In [5]:
# 1. t-test: compare GPA of students with and without a part-time job
gpa_with_job = df.loc[df['PartTimeJob'] == 'Yes', 'GPA']
gpa_without_job = df.loc[df['PartTimeJob'] == 'No', 'GPA']

# equal_var=False selects Welch's t-test, which does not assume equal group variances
t_stat, p_value_ttest = ttest_ind(a=gpa_with_job, b=gpa_without_job, equal_var=False)

In [13]:
# 2. One-way ANOVA: compare GPA across majors
majors = df['Major'].unique()  # distinct major labels (not consumed by the test below)
# One array of GPA values per major, taken straight from the grouped GPA column
gpa_by_major = [gpa.values for _, gpa in df.groupby('Major')['GPA']]

# Each per-major array is passed to f_oneway as a separate sample
f_stat, p_value_anova = f_oneway(*gpa_by_major)

In [10]:
# 3. Chi-square test of independence: Gender x PartTimeJob
# Rows are genders, columns are part-time-job answers, cells hold student counts
contingence_table = pd.crosstab(index=df['Gender'], columns=df['PartTimeJob'])
# Unpacks to (statistic, p-value, degrees of freedom, expected frequencies).
# NOTE: SciPy applies Yates' continuity correction by default when dof == 1.
chi2, p_value_chiSquare, dof, expected = chi2_contingency(observed=contingence_table)

In [11]:
# 4. Pearson correlation between weekly study hours and GPA
# Unpacks to (r, p-value); the p-value is two-sided by default
correlation, p_value_correlation = pearsonr(
    x=df['StudyHoursPerWeek'],
    y=df['GPA'],
)

In [38]:
# Output: print the statistics for each hypothesis test, followed by a verdict
# obtained by comparing its p-value against the ALPHA significance level.
ALPHA = 0.05           # significance threshold shared by all four tests
SEPARATOR = "-" * 60   # single divider width used between every section

print(SEPARATOR)
## HYPOTHESIS 1: Students with part-time jobs have different GPA
# Two arguments are passed to print, so a single space joins them in the output
print(f"HYPOTHESIS 1: Students with part-time jobs have different GPA\n T-test results:\n statistics = {t_stat:.3f}\n p_value = {p_value_ttest:.4f}\n",
      f"Mean GPA with job: {gpa_with_job.mean():.3f}\n Mean GPA without job: {gpa_without_job.mean():.3f}\n")
if p_value_ttest < ALPHA:
    print("✅ Significant difference in GPA based on part-time job status\n")
else:
    print("❌ No significant difference in GPA based on part-time job status\n")

print(SEPARATOR)
## HYPOTHESIS 2: Major affects GPA
print(f"HYPOTHESIS 2: GPA differs across majors\n ANOVA results:\n F-statistic = {f_stat:.3f}\n p_value = {p_value_anova:.4f}\n")
# Per-major mean GPA, printed next to the F-test for context
major_gpa_mean = df.groupby('Major')['GPA'].mean()
print("GPA mean by Major:")
for major, major_mean in major_gpa_mean.items():
    print(f"{major}: {major_mean:.3f}")
print()
if p_value_anova < ALPHA:
    print("✅ Significant differences in GPA across majors\n")
else:
    print("❌ No significant differences in GPA across majors\n")

print(SEPARATOR)
## HYPOTHESIS 3: Association between gender and part-time job
print(f"HYPOTHESIS 3: Association between gender and part-time job\n contingency table:\n",
      contingence_table, f"\n Chi-square results:\n χ² = {chi2:.3f}\n p_value = {p_value_chiSquare:.4f}\n df = {dof}\n")
if p_value_chiSquare < ALPHA:
    print("✅ Significant association between gender and part-time job\n")
else:
    print("❌ No significant association between gender and part-time job\n")

print(SEPARATOR)
## HYPOTHESIS 4: Correlation between study hours and GPA
print(f"HYPOTHESIS 4: Correlation between study hours and GPA\n Pearson correlation:\n r = {correlation:.3f}\n p_value = {p_value_correlation:.4f}\n")
if p_value_correlation < ALPHA:
    # The sign of r decides which direction of association is reported
    if correlation > 0:
        print("✅ Significant positive correlation between study hours and GPA")
    else:
        print("✅ Significant negative correlation between study hours and GPA")
else:
    print("❌ No significant correlation between study hours and GPA")

------------------------------------------------------------
HYPOTHESIS 1: Students with part-time jobs have different GPA
 T-test results:
 statistics = 0.909
 p_value = 0.3636
 Mean GPA with job: 3.006
 Mean GPA without job: 2.961

❌ No significant difference in GPA based on part-time job status

------------------------------------------------------------
HYPOTHESIS 2: HYPOTHESIS 2: GPA differs across majors
 ANOVA results:
 F-statistic = 0.490
 p_value = 0.7434

GPA mean by Major:
Arts: 3.007
Business: 3.036
Education: 2.939
Engineering: 2.973
Science: 2.958

❌ No significant differences in GPA across majors

------------------------------------------------------------
HYPOTHESIS 3: Association between gender and part-time job
 contingency table:
 PartTimeJob   No  Yes
Gender               
Female       120  136
Male         112  132 
 Chi-square results:
 χ² = 0.016
 p_value = 0.8978
 df = 1

❌ No significant association between gender and part-time job
---------------------------

## 3. Correlation and Regression
* Correlation between Study hours per week, Attendance rate and GPA
* Simple linear regression GPA ~ Study hours per week
* Multiple linear regression GPA ~ Study hours per week + attendance rate

In [None]:
# Correlation and Regression
## Pairwise Pearson correlations among study hours, attendance rate and GPA
numeric_columns = ['StudyHoursPerWeek', 'AttendanceRate', 'GPA']
correlation_matrix = df.loc[:, numeric_columns].corr()  # Pearson is the default method
print(f"Correlation Matrix:\n {correlation_matrix}\n")

Correlation Matrix:
                    StudyHoursPerWeek  AttendanceRate       GPA
StudyHoursPerWeek           1.000000        0.095716  0.091700
AttendanceRate              0.095716        1.000000  0.060828
GPA                         0.091700        0.060828  1.000000



In [None]:
## Simple linear regression GPA ~ Study Hours per Week
# The list selector keeps the single predictor as a 2-D DataFrame rather than a Series
X_simple = df.loc[:, ["StudyHoursPerWeek"]]
y = df['GPA']
# fit() trains the estimator in place, so construction and fitting can be separate steps
simpe_linear_model = LinearRegression()
simpe_linear_model.fit(X_simple, y)
# coef_ holds one slope per predictor; index 0 is the study-hours slope
print(f"Intercept: {simpe_linear_model.intercept_}\n Coeficient: {simpe_linear_model.coef_[0]}")

Intercept: 2.8956088750309306
 Coeficient: 0.00450549028824059


In [50]:
## Multiple linear regression GPA ~ Study Hours per week + Attendance rate
# Reuses y (the GPA column) defined in the simple-regression cell
X_multiple = df.loc[:, ['StudyHoursPerWeek', 'AttendanceRate']]
multiple_linear_model = LinearRegression()
multiple_linear_model.fit(X_multiple, y)
# coef_ follows the column order of X_multiple: study hours first, attendance rate second
print(f"Intercept: {multiple_linear_model.intercept_}\n Coeficients: {multiple_linear_model.coef_}")

Intercept: 2.748085958377937
 Coeficients: [0.00425844 0.0020327 ]


## 4. Conclusions
1. There is no statistically significant difference in GPA between students with and without a part-time job (p = 0.36).
2. GPA does not differ significantly across majors (p = 0.74).
3. There is no significant association between a student's gender and having a part-time job (p = 0.90).
4. There is a statistically significant positive correlation between study hours and GPA, although it is very weak (r ≈ 0.09).
5. All pairwise correlations are positive but very weak. This suggests that, while there may be slight tendencies, these variables do not strongly predict each other in isolation.
6. If a student studies 0 hours per week, their predicted GPA is 2.90, and for each additional hour studied per week the predicted GPA increases by 0.0045 points.
7. If a student studies 0 hours per week and has a zero attendance rate, the predicted GPA is 2.7481. Each additional study hour increases the predicted GPA by 0.0043 points, controlling for attendance, and each percentage-point increase in attendance increases it by 0.0020 points, controlling for study hours.