In [15]:
import numpy as np
import pandas as pd
import pickle

# Pin NumPy's global random state so every simulated value below is repeatable
np.random.seed(42)

# Size of the simulated student cohort (one row per student)
num_students = 1_000

In [16]:

# Draw each synthetic feature independently from a uniform distribution.
# The draws are kept in this order so the seeded random stream produces the same values.
attendance = np.random.uniform(low=50, high=100, size=num_students)         # attendance, percent (50-100)
study_hours = np.random.uniform(low=2, high=20, size=num_students)          # study hours per week (2-20)
past_cgpa = np.random.uniform(low=2.0, high=4.0, size=num_students)         # previous CGPA (2.0-4.0)
quiz_performance = np.random.uniform(low=50, high=100, size=num_students)   # quiz score, percent (50-100)
class_participation = np.random.uniform(low=1, high=10, size=num_students)  # participation score (1-10)

In [17]:


# Build the target as a fixed weighted sum of the five features.
# NOTE: no random noise term is added here -- apart from the clipping below,
# the target is a deterministic linear function of the features.
final_gpa = (
    0.2 * attendance
    + 0.3 * study_hours
    + 0.3 * past_cgpa
    + 0.1 * quiz_performance
    + 0.1 * class_participation
)

# Divide by 10 to bring the sum onto the GPA scale, then clamp it to [2.0, 4.0]
final_gpa = np.clip(final_gpa / 10, 2.0, 4.0)


In [18]:
# Collect the simulated columns into one table: five features, then the target
data = pd.DataFrame(dict([
    ('Attendance (%)', attendance),
    ('Study Hours (per week)', study_hours),
    ('Past CGPA', past_cgpa),
    ('Quiz Performance (%)', quiz_performance),
    ('Class Participation (1-10)', class_participation),
    ('Final GPA', final_gpa),
]))

In [19]:
# Blank out a freshly sampled 10% of the rows in each column, one column at a time.
# The loop covers every column of `data`, so the 'Final GPA' target loses values too.
# sample() is called without random_state; `data` is modified in place.
for column in data.columns:
    rows_to_blank = data.sample(frac=0.1).index
    data.loc[rows_to_blank, column] = np.nan

# Preview the first rows of the dataset, now containing NaNs
data.head()


Unnamed: 0,Attendance (%),Study Hours (per week),Past CGPA,Quiz Performance (%),Class Participation (1-10),Final GPA
0,68.727006,5.332393,2.523411,83.63515,6.147963,2.508045
1,97.535715,11.754217,2.493958,89.83407,,
2,86.599697,17.713025,3.812509,62.523395,7.841448,3.081408
3,79.932924,15.180048,2.499092,81.243705,2.385099,
4,57.800932,,,78.587299,2.343245,2.537184


In [20]:
# Write the current contents of `data` to disk as CSV, leaving out the row index
data.to_csv(path_or_buf='student_performance_data.csv', index=False)

In [21]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column
summary_stats = data.describe()

# Number of NaN entries in each column
missing_values = data.isna().sum()

# Display both results together as the cell output
missing_values, summary_stats


(Attendance (%)                100
 Study Hours (per week)        100
 Past CGPA                     100
 Quiz Performance (%)          100
 Class Participation (1-10)    100
 Final GPA                     100
 dtype: int64,
        Attendance (%)  Study Hours (per week)   Past CGPA  \
 count      900.000000              900.000000  900.000000   
 mean        74.783401               11.203002    3.013262   
 std         14.647534                5.226003    0.584550   
 min         50.231601                2.057929    2.000023   
 25%         61.999984                6.573138    2.528842   
 50%         75.163838               11.533153    3.012224   
 75%         87.533942               15.598886    3.531353   
 max         99.985884               19.989447    3.995642   
 
        Quiz Performance (%)  Class Participation (1-10)   Final GPA  
 count            900.000000                  900.000000  900.000000  
 mean              74.503334                    5.400306    2.721216  
 s

In [22]:
from sklearn.preprocessing import StandardScaler

# Fill every NaN with the mean of its own column. fillna() returns a new frame,
# so `data` itself keeps its missing values.
# NOTE(review): 'Final GPA' is a column of `data`, so missing targets are
# mean-imputed too; the means are computed over all rows.
column_means = data.mean()
data_imputed = data.fillna(value=column_means)

# Standardize the five predictor columns (zero mean, unit variance); the target is left on its own scale.
# NOTE(review): the scaler is fitted on all rows of the dataset.
features_to_scale = [
    'Attendance (%)',
    'Study Hours (per week)',
    'Past CGPA',
    'Quiz Performance (%)',
    'Class Participation (1-10)',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_imputed[features_to_scale])
data_imputed[features_to_scale] = scaled_values

# Preview the processed table
data_imputed.head()


Unnamed: 0,Attendance (%),Study Hours (per week),Past CGPA,Quiz Performance (%),Class Participation (1-10),Final GPA
0,-0.436084,-1.184769,-0.883817,0.672448,0.305367,2.508045
1,1.638254,0.1112427,-0.93696,1.128924,0.0,2.721216
2,0.850819,1.313811,1.442046,-0.882178,0.99704,3.081408
3,0.370785,0.802622,-0.927695,0.496347,-1.231506,2.721216
4,-1.222803,-3.58493e-16,0.0,0.300735,-1.2486,2.537184


In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns
y = data_imputed['Final GPA']
X = data_imputed.drop(columns=['Final GPA'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares model on the training rows (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Display (MSE, R-squared) as the cell output
mse, r2


(0.019497741011824634, 0.8193803049379441)

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Full pipeline in one cell: impute -> standardize -> split -> fit -> persist -> evaluate.
#
# `data` already carries the simulated missing values from the earlier injection
# cell, so they are deliberately NOT injected a second time here. Re-running the
# injection loop mutated `data` in place, pushed the missing rate well past the
# intended ~10% per column, made `data` diverge from the CSV already written,
# and made this cell produce a different (worse) model on every re-run.

# Mean imputation; fillna() returns a new frame, so `data` is left untouched
data_imputed = data.fillna(data.mean())

# Standardize the predictors only; the 'Final GPA' target keeps its original scale
features_to_scale = ['Attendance (%)', 'Study Hours (per week)', 'Past CGPA',
                     'Quiz Performance (%)', 'Class Participation (1-10)']
scaler = StandardScaler()
data_imputed[features_to_scale] = scaler.fit_transform(data_imputed[features_to_scale])

# Split into predictors (X) and target (y)
X = data_imputed.drop(columns=['Final GPA'])
y = data_imputed['Final GPA']

# 80% train / 20% test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the linear regression on the training rows
model = LinearRegression()
model.fit(X_train, y_train)

# Persist the trained regressor.
# NOTE(review): only the model is pickled. It expects standardized inputs, but
# neither `scaler` nor the imputation means are saved, so a consumer of this
# file has to reproduce that preprocessing before calling predict().
with open('final_gpa_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Score the model on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# With `data` left untouched, this repeats the earlier evaluation cell exactly
# (same imputed frame, same seeded split), so the metrics are the same.
mse, r2

(0.019497741011824634, 0.8193803049379441)