In [3]:
import pandas as pd                                             # import for dataframes
import numpy as np                                              # import for arrays
import matplotlib.pyplot as plt                                 # import for plotting
from sklearn import linear_model                                # importing the Linear Regression Algorithm from sklearn
from sklearn.model_selection import train_test_split            # splitting data into training set and testing set
from sklearn.preprocessing import OneHotEncoder                # import for one-hot encoding


# Link to dataset: https://archive.ics.uci.edu/ml/datasets/student+performance

#######################################################################################
# Step 1: Load the data and store the labels in variable x and target in variable y   #
#######################################################################################

student_data = pd.read_csv("sample_data/student-mat.csv", sep=";")                               # Read in the data from the csv file
#print(student_data.head())

# Experiment with different feature combinations here:
# You can add or remove features from this list to see their impact on accuracy
features_to_use = ["G1", "G2", "studytime", "failures", "absences",  "schoolsup", "famsup", "paid", "activities", "higher", "internet", "romantic", "famrel", "freetime", "goout", "Dalc", "Walc", "health"] # Add more potentially relevant features


student_data = student_data[features_to_use + ["G3"]] # extract features to be used for this model, including the target variable "G3"
#print(student_data.head())

# Identify categorical features
categorical_features = ["schoolsup", "famsup", "paid", "activities", "higher", "internet", "romantic"]

# Create a OneHotEncoder object
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # sparse=False for dense output

# Fit and transform the categorical features
encoded_data = encoder.fit_transform(student_data[categorical_features])

# Create a DataFrame from the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))

# Drop the original categorical features and concatenate the encoded features
student_data = student_data.drop(categorical_features, axis=1)
student_data = pd.concat([student_data, encoded_df], axis=1)

# Now, student_data contains the encoded features and is ready for model training
X = np.array(student_data.drop(["G3"], axis=1)) # Change the second argument to axis=1
Y = np.array(student_data["G3"])

#######################################################################################
# Step 2: Split the dataset into a training set and test set.                         #
#         70% of the data should be for the training set.                             #
#         30% of the data should be for the test set.                                 #
#######################################################################################

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

##################################
# Step 3: Create the model.      #
##################################

linear_regression_model = linear_model.LinearRegression()

##################################
# Step 4: Train the model.       #
##################################

linear_regression_model.fit(x_train, y_train)

##################################
# Step 5: Test the model.        #
##################################

y_predict = linear_regression_model.predict(x_test)

##################################
# Step 6: Evaluate the model.    #
##################################

accuracy = linear_regression_model.score(x_test, y_test)
accuracy = accuracy * 100
accuracy = round(accuracy, 2)

print(f"\nAccuracy of the model is: {accuracy}%\n")

##############################################################################
# Step 7: Print out the result, input parameters, and expected result        #
##############################################################################

print("Predicted Value, [Input Data], Actual Value")
for i in range(len(y_predict)):
  print(int(y_predict[i]), x_test[i], y_test[i])


Accuracy of the model is: 82.44%

Predicted Value, [Input Data], Actual Value
7 [ 9.  7.  2.  1. 20.  5.  2.  4.  1.  4.  5.  1.  0.  0.  1.  1.  0.  0.
  1.  0.  1.  0.  1.  1.  0.] 8
12 [11. 13.  2.  0.  2.  4.  3.  3.  1.  1.  5.  1.  0.  0.  1.  0.  1.  1.
  0.  0.  1.  0.  1.  1.  0.] 13
10 [12. 12.  2.  0.  2.  1.  2.  3.  1.  2.  5.  1.  0.  1.  0.  1.  0.  1.
  0.  0.  1.  1.  0.  0.  1.] 11
12 [14. 13.  4.  0.  0.  4.  3.  3.  1.  1.  3.  1.  0.  0.  1.  0.  1.  1.
  0.  0.  1.  0.  1.  1.  0.] 14
13 [13. 13.  1.  0.  0.  5.  5.  5.  3.  2.  5.  0.  1.  1.  0.  1.  0.  1.
  0.  0.  1.  0.  1.  1.  0.] 12
-2 [4. 0. 1. 2. 0. 4. 3. 2. 1. 1. 5. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.
 1.] 0
7 [7. 8. 2. 0. 0. 4. 3. 2. 2. 3. 2. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0.
 1.] 0
16 [15. 16.  1.  0.  4.  2.  2.  4.  2.  4.  1.  1.  0.  1.  0.  0.  1.  1.
  0.  0.  1.  0.  1.  1.  0.] 15
12 [15. 13.  3.  2. 14.  4.  1.  2.  1.  1.  3.  1.  0.  1.  0.  1.  0.  1.
  0.  0.  1.  0.  1.  0.  1.