In [1]:
import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.preprocessing import OneHotEncoder, StandardScaler # type: ignore
from sklearn.linear_model import LinearRegression # type: ignore
from sklearn.metrics import mean_squared_error, r2_score # type: ignore

# Parents model: predict a student's Score from parent attributes.
# Pipeline: load -> merge -> impute -> one-hot encode -> split -> fit -> evaluate.

# Load datasets. The paths are relative to the kernel's working directory; the
# saved run of this cell failed with FileNotFoundError, so start the kernel from
# the project root (or adjust the paths) before running.
parents = pd.read_csv('./data/raw/parents_info.csv')
performance = pd.read_csv('./data/raw/performance_records.csv')

# Merge parents data with performance (pd.merge defaults to an inner join)
data_parents = pd.merge(performance, parents, on='Student_ID')

# Drop irrelevant columns (reassign rather than mutate in place)
data_parents = data_parents.drop(columns=['Parent_ID'])

# Fill missing numeric values with the column mean.
# numeric_only=True is required: the frame still holds the not-yet-encoded
# categorical columns (presumably strings), and pandas >= 2.0 raises TypeError
# when asked for the mean of non-numeric columns.
data_parents = data_parents.fillna(data_parents.mean(numeric_only=True))

# One-hot encode categorical variables
categorical_columns = ['Parents_Education_Level', 'Father_Occupation', 'Mother_Occupation', 'Parental_Support']

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
# fall back to the old keyword so the cell runs on either side of the rename.
# The later cells reuse this `encoder` object and expect dense output from it.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

encoded_data = encoder.fit_transform(data_parents[categorical_columns])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and keep the old call only for versions that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_columns = encoder.get_feature_names(categorical_columns)

# Reuse the source index so the concat below aligns row-for-row by construction
# instead of relying on both frames happening to share a default RangeIndex.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=data_parents.index)

# Combine the encoded data with the rest of the dataset
data_parents = pd.concat([data_parents.drop(columns=categorical_columns), encoded_df], axis=1)

# Features and Target
# NOTE(review): Student_ID and every other remaining column stay in X; confirm
# they are all numeric and meaningful as predictors before trusting the scores.
X_parents = data_parents.drop(columns=['Score', 'Grade', 'Teacher_Comments', 'Exam_Type'])
y_parents = data_parents['Score']

# Split data (fixed random_state keeps the split reproducible)
X_train_parents, X_test_parents, y_train_parents, y_test_parents = train_test_split(
    X_parents, y_parents, test_size=0.2, random_state=42
)

# Train Linear Regression model
model_parents = LinearRegression()
model_parents.fit(X_train_parents, y_train_parents)

# Predict
y_pred_parents = model_parents.predict(X_test_parents)

# Evaluate model
print('Parents Model - Mean Squared Error:', mean_squared_error(y_test_parents, y_pred_parents))
print('Parents Model - R^2 Score:', r2_score(y_test_parents, y_pred_parents))

FileNotFoundError: [Errno 2] No such file or directory: './data/raw/parents_info.csv'

In [None]:
# Extracurricular model: predict Score from extra-curricular activity attributes.
# Relies on `performance` and `encoder` created in the parents cell.

# Load extra-curricular activities data
extra_cur = pd.read_csv('./data/raw/Extra_Curricular.csv')

# Merge extra_curricular data with performance
data_extra_cur = pd.merge(performance, extra_cur, on='Student_ID')

# Fill missing numeric values with the column mean, mirroring the imputation
# applied to the parents data; LinearRegression rejects NaN inputs.
data_extra_cur = data_extra_cur.fillna(data_extra_cur.mean(numeric_only=True))

# One-hot encode categorical variables (refits the shared encoder on this frame)
categorical_columns = ['Activity_Name', 'Role', 'Impact_on_Academics']
encoded_data = encoder.fit_transform(data_extra_cur[categorical_columns])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and keep the old call only for versions that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_columns = encoder.get_feature_names(categorical_columns)

# Reuse the source index so the concat below aligns row-for-row by construction.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=data_extra_cur.index)

# Combine the encoded data with the rest of the dataset
data_extra_cur = pd.concat([data_extra_cur.drop(columns=categorical_columns), encoded_df], axis=1)

# Features and Target
# NOTE(review): every remaining column (including Student_ID) is used as a
# feature; confirm they are all numeric.
X_extra_cur = data_extra_cur.drop(columns=['Score', 'Grade', 'Teacher_Comments', 'Exam_Type'])
y_extra_cur = data_extra_cur['Score']

# Split data
X_train_extra_cur, X_test_extra_cur, y_train_extra_cur, y_test_extra_cur = train_test_split(
    X_extra_cur, y_extra_cur, test_size=0.2, random_state=42
)

# Train model
model_extra_cur = LinearRegression()
model_extra_cur.fit(X_train_extra_cur, y_train_extra_cur)

# Predict and evaluate
y_pred_extra_cur = model_extra_cur.predict(X_test_extra_cur)
print('Extracurricular Model - Mean Squared Error:', mean_squared_error(y_test_extra_cur, y_pred_extra_cur))
print('Extracurricular Model - R^2 Score:', r2_score(y_test_extra_cur, y_pred_extra_cur))

In [None]:
# Teachers model: predict Score from teacher attributes.
# Relies on `performance` and `encoder` created in the parents cell.

# Load teacher data
teachers = pd.read_csv('./data/raw/Teacher_Info.csv')

# Merge teachers data with performance
data_teachers = pd.merge(performance, teachers, on='Student_ID')

# Fill missing numeric values with the column mean, mirroring the imputation
# applied to the parents data; LinearRegression rejects NaN inputs.
data_teachers = data_teachers.fillna(data_teachers.mean(numeric_only=True))

# One-hot encode categorical variables (refits the shared encoder on this frame)
categorical_columns = ['Qualification']
encoded_data = encoder.fit_transform(data_teachers[categorical_columns])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and keep the old call only for versions that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_columns = encoder.get_feature_names(categorical_columns)

# Reuse the source index so the concat below aligns row-for-row by construction.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=data_teachers.index)

# Combine the encoded data with the rest of the dataset
data_teachers = pd.concat([data_teachers.drop(columns=categorical_columns), encoded_df], axis=1)

# Features and Target
# NOTE(review): every remaining column (including Student_ID) is used as a
# feature; confirm they are all numeric.
X_teachers = data_teachers.drop(columns=['Score', 'Grade', 'Teacher_Comments', 'Exam_Type'])
y_teachers = data_teachers['Score']

# Split data
X_train_teachers, X_test_teachers, y_train_teachers, y_test_teachers = train_test_split(
    X_teachers, y_teachers, test_size=0.2, random_state=42
)

# Train model
model_teachers = LinearRegression()
model_teachers.fit(X_train_teachers, y_train_teachers)

# Predict and evaluate
y_pred_teachers = model_teachers.predict(X_test_teachers)
print('Teachers Model - Mean Squared Error:', mean_squared_error(y_test_teachers, y_pred_teachers))
print('Teachers Model - R^2 Score:', r2_score(y_test_teachers, y_pred_teachers))

In [None]:
# Attendance model: predict Score from attendance attributes.
# Relies on `performance` and `encoder` created in the parents cell.

# Load attendance data
attendance = pd.read_csv('./data/raw/Attendance_Records.csv')

# Merge attendance data with performance
# NOTE(review): if a student has several performance rows and several
# attendance rows, this join pairs every combination; confirm that is intended.
data_attendance = pd.merge(performance, attendance, on='Student_ID')

# Fill missing numeric values with the column mean, mirroring the imputation
# applied to the parents data; LinearRegression rejects NaN inputs.
data_attendance = data_attendance.fillna(data_attendance.mean(numeric_only=True))

# One-hot encode categorical variables (refits the shared encoder on this frame)
categorical_columns = ['Status', 'Reason_for_Absence']
encoded_data = encoder.fit_transform(data_attendance[categorical_columns])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and keep the old call only for versions that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_columns = encoder.get_feature_names(categorical_columns)

# Reuse the source index so the concat below aligns row-for-row by construction.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=data_attendance.index)

# Combine the encoded data with the rest of the dataset
data_attendance = pd.concat([data_attendance.drop(columns=categorical_columns), encoded_df], axis=1)

# Features and Target
# NOTE(review): every remaining column (including Student_ID) is used as a
# feature; confirm they are all numeric.
X_attendance = data_attendance.drop(columns=['Score', 'Grade', 'Teacher_Comments', 'Exam_Type'])
y_attendance = data_attendance['Score']

# Split data
X_train_attendance, X_test_attendance, y_train_attendance, y_test_attendance = train_test_split(
    X_attendance, y_attendance, test_size=0.2, random_state=42
)

# Train model
model_attendance = LinearRegression()
model_attendance.fit(X_train_attendance, y_train_attendance)

# Predict and evaluate
y_pred_attendance = model_attendance.predict(X_test_attendance)
print('Attendance Model - Mean Squared Error:', mean_squared_error(y_test_attendance, y_pred_attendance))
print('Attendance Model - R^2 Score:', r2_score(y_test_attendance, y_pred_attendance))

In [None]:
# Combined model: predict Score from parent, extra-curricular, teacher and
# attendance attributes together.
# Relies on `performance`, `parents`, `extra_cur`, `teachers`, `attendance` and
# `encoder` created in the earlier cells.

# Merge all datasets with performance
# NOTE(review): each join is on Student_ID alone, so repeated keys multiply
# rows, and any non-key column shared between tables is suffixed _x/_y by
# pandas; confirm the resulting row count and column names are as expected.
data_combined = pd.merge(performance, parents, on='Student_ID')
data_combined = pd.merge(data_combined, extra_cur, on='Student_ID')
data_combined = pd.merge(data_combined, teachers, on='Student_ID')
data_combined = pd.merge(data_combined, attendance, on='Student_ID')

# Fill missing numeric values with the column mean, mirroring the imputation
# applied to the parents data; LinearRegression rejects NaN inputs.
data_combined = data_combined.fillna(data_combined.mean(numeric_only=True))

# One-hot encode categorical variables (refits the shared encoder on this frame)
categorical_columns = [
    'Parents_Education_Level', 'Father_Occupation', 'Mother_Occupation', 'Parental_Support',
    'Activity_Name', 'Role', 'Impact_on_Academics',
    'Qualification',
    'Status', 'Reason_for_Absence',
]
encoded_data = encoder.fit_transform(data_combined[categorical_columns])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and keep the old call only for versions that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_columns = encoder.get_feature_names(categorical_columns)

# Reuse the source index so the concat below aligns row-for-row by construction.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=data_combined.index)

# Combine the encoded data with the rest of the dataset
data_combined = pd.concat([data_combined.drop(columns=categorical_columns), encoded_df], axis=1)

# Features and Target
# NOTE(review): every remaining column (including Student_ID and Parent_ID,
# which is not dropped in this cell) is used as a feature; confirm they are
# all numeric.
X_combined = data_combined.drop(columns=['Score', 'Grade', 'Teacher_Comments', 'Exam_Type'])
y_combined = data_combined['Score']

# Split data
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Train model
model_combined = LinearRegression()
model_combined.fit(X_train_combined, y_train_combined)

# Predict and evaluate
y_pred_combined = model_combined.predict(X_test_combined)
print('Combined Model - Mean Squared Error:', mean_squared_error(y_test_combined, y_pred_combined))
print('Combined Model - R^2 Score:', r2_score(y_test_combined, y_pred_combined))