<a href="https://colab.research.google.com/github/Jonathan-code-hub/MAT-421-Applied-Computational-Methods/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To perform linear regression, each variable must be either dichotomous or continuous. Running this cell converts dichotomous variables to 0 and 1 so that they can be used in the regression.

In [None]:
import pandas as pd

def convert_categorical_to_numeric(csv_file_path):
    """
    Recodes every dichotomous text column of a CSV file as 0/1, in place.

    A column is recoded when it holds text and has exactly two distinct
    non-missing values. The value that appears first in the file becomes 0
    and the other becomes 1. Missing entries stay missing instead of being
    counted as a category of their own.

    Parameters:
    - csv_file_path  string, the path of the CSV file

    Output:
    -* Rewrites CSV file to have dichotomous variables in 0,1 form

    """

    df = pd.read_csv(csv_file_path)

    # Iterate through each column #
    for column in df.columns:
        values = df[column]

        # Text columns may use the classic object dtype or pandas' dedicated string dtype #
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        if not is_text:
            continue

        # Drop NaN first so a missing value is never mistaken for a category #
        categories = values.dropna().unique()
        if len(categories) != 2:
            continue

        # Map the values to 0 and 1 in order of first appearance #
        mapping = {value: idx for idx, value in enumerate(categories)}
        df[column] = values.map(mapping)

    # Write back to the CSV file (this overwrites the input) #
    df.to_csv(csv_file_path, index=False)

# Testing #
# Runs the conversion on the dataset; the CSV at this path is overwritten in place #
convert_categorical_to_numeric("/content/drive/MyDrive/Student Performance/student performance.csv") # Replace this with your CSV file path #

We are using sklearn's linear regression model here, mostly to save space.

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

def perform_linear_regression(df, y_column=-1):
    """
    Fits one single-variable linear regression per numeric column of df.

    Parameters:
    -df  pandas DataFrame holding the predictor columns and the target column
    ~y_column  integer position of the target column (default -1, the last column)

    Output:
    -linear_functions  list of (column name, slope, y_intercept) tuples, one per
                       numeric column other than the target

    """

    target_name = df.columns[y_column]
    target = df.iloc[:, y_column]

    # Only numeric columns can be regressed on; the target itself is excluded #
    numeric_features = df.select_dtypes(include='number').drop(columns=[target_name])

    def _fit_single(feature_name):
        # One predictor at a time, so coef_ holds exactly one slope #
        regressor = LinearRegression()
        regressor.fit(df[[feature_name]], target)
        return (feature_name, regressor.coef_[0], regressor.intercept_)

    return [_fit_single(feature_name) for feature_name in numeric_features.columns]


# Testing #
# Read the dataset from disk #
csv_file_path = '/content/drive/MyDrive/Student Performance/student performance.csv'  # Update with your CSV file path #
df = pd.read_csv(csv_file_path)

# Fit one single-variable regression per numeric column #
linear_functions = perform_linear_regression(df)

# Report each fitted line in the form y = slope * x + intercept #
for column_name, slope, intercept in linear_functions:
    print(f"Linear function for {column_name}: y = {slope}x + {intercept}")

Linear function for school: y = -1.926389673424131x + 12.576832151300236
Linear function for sex: y = -0.8472486699778163x + 12.2532637075718
Linear function for age: y = -0.2824658675725914x + 16.635680405102235
Linear function for address: y = -1.176979920039531x + 12.263274336283185
Linear function for famsize: y = 0.3183921407731588x + 11.811816192560174
Linear function for Pstatus: y = -0.007403339191564216x + 11.9125
Linear function for Medu: y = 0.6838333905988494x + 10.186415880651275
Linear function for Fedu: y = 0.6220863043529533x + 10.471089063765222
Linear function for traveltime: y = -0.548783281576134x + 12.766812604999236
Linear function for studytime: y = 0.972841562534557x + 10.027780465553468
Linear function for failures: y = -2.1419285965244192x + 12.381259965946867
Linear function for schoolsup: y = 0.699934190543687x + 11.279411764705882
Linear function for famsup: y = 0.3924502993052917x + 11.665338645418327
Linear function for paid: y = -0.7456914670029422x + 11

We can test the fitted models against the data and get predicted grades for all students. Each student's prediction is simply the average of the single-variable regression predictions across that row.

In [None]:
from ast import YieldFrom
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

def perform_linear_regression(df, y_column=-1):
    """
    Fits one single-variable linear regression per numeric column of df.

    Parameters:
    -df  pandas DataFrame holding the predictor columns and the target column
    ~y_column  integer position of the target column (default -1, the last column)

    Output:
    -linear_functions  list of (column name, slope, y_intercept) tuples, one per
                       numeric column other than the target

    """

    target_name = df.columns[y_column]
    target = df.iloc[:, y_column]

    # Only numeric columns can be regressed on; the target itself is excluded #
    numeric_features = df.select_dtypes(include='number').drop(columns=[target_name])

    def _fit_single(feature_name):
        # One predictor at a time, so coef_ holds exactly one slope #
        regressor = LinearRegression()
        regressor.fit(df[[feature_name]], target)
        return (feature_name, regressor.coef_[0], regressor.intercept_)

    return [_fit_single(feature_name) for feature_name in numeric_features.columns]

def f(x, a, b):
    """Evaluates the line with slope a and y-intercept b at the point x."""
    scaled = a * x
    return scaled + b

# Testing #
# Load CSV file #
csv_file_path = '/content/drive/MyDrive/Student Performance/student performance.csv'  # Update with your CSV file path #
df = pd.read_csv(csv_file_path)

# Perform linear regression for each numeric column #
linear_functions = perform_linear_regression(df)


# We presume the last column holds the values we are predicting #
X = df.select_dtypes(include='number').drop(columns=[df.columns[-1]])

Y = df.iloc[:, -1]

# linear_functions is built from the same numeric columns as X, in the same order, #
# so slope k and intercept k belong to column k of X #
slopes = np.array([function[1] for function in linear_functions])
intercepts = np.array([function[2] for function in linear_functions])

# predictions[i, k] is the value predicted for row i by the regression on column k alone. #
# Broadcasting evaluates every line on every row at once, which avoids indexing rows by #
# position through their labels and does not rely on the DataFrame having a 0..N-1 index #
predictions = f(X.to_numpy(dtype=float), slopes, intercepts)

# Each row's final prediction is the plain average of its single-variable predictions #
average_prediction = predictions.mean(axis=1)

residuals = average_prediction - Y.to_numpy(dtype=float)

# Mean Absolute Error #
mae = np.mean(np.abs(residuals))

# Mean Squared Error #
mse = np.mean(residuals ** 2)

# Root Mean Square Error #
rmse = np.sqrt(mse)

# R Squared is intentionally left disabled, as in the original analysis: #
# ss_res = np.sum(residuals ** 2)
# ss_tot = np.sum((Y - np.mean(Y)) ** 2)
# r_squared = 1 - (ss_res / ss_tot)

# Results #
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Square Error", rmse)
# print("R Squared Error:", r_squared)

Mean Absolute Error: 2.1793189115876195
Mean Squared Error: 8.824556358599914
Root Mean Square Error 2.9706154848111717


But what if we weight the prediction relative to the predictive power of each variable? We can try weighting each variable's prediction by the inverse square of its MAE, MSE, or RMSE (the R Squared weighting is currently commented out).

In [None]:
from ast import YieldFrom
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

def perform_linear_regression(df, y_column=-1):
    """
    Fits one single-variable linear regression per numeric column of df.

    Parameters:
    -df  pandas DataFrame holding the predictor columns and the target column
    ~y_column  integer position of the target column (default -1, the last column)

    Output:
    -linear_functions  list of (column name, slope, y_intercept) tuples, one per
                       numeric column other than the target

    """

    target_name = df.columns[y_column]
    target = df.iloc[:, y_column]

    # Only numeric columns can be regressed on; the target itself is excluded #
    numeric_features = df.select_dtypes(include='number').drop(columns=[target_name])

    def _fit_single(feature_name):
        # One predictor at a time, so coef_ holds exactly one slope #
        regressor = LinearRegression()
        regressor.fit(df[[feature_name]], target)
        return (feature_name, regressor.coef_[0], regressor.intercept_)

    return [_fit_single(feature_name) for feature_name in numeric_features.columns]

def f(x, a, b):
    """Evaluates the line with slope a and y-intercept b at the point x."""
    scaled = a * x
    return scaled + b

# Testing #
# Load CSV file #
csv_file_path = '/content/drive/MyDrive/Student Performance/student performance.csv'  # Update with your CSV file path #
df = pd.read_csv(csv_file_path)

# Perform linear regression for each numeric column #
linear_functions = perform_linear_regression(df)

# We presume the last column holds the values we are predicting #
X = df.select_dtypes(include='number').drop(columns=[df.columns[-1]])

Y = df.iloc[:, -1]

target = Y.to_numpy(dtype=float)

# linear_functions is built from the same numeric columns as X, in the same order, #
# so slope k and intercept k belong to column k of X #
slopes = np.array([function[1] for function in linear_functions])
intercepts = np.array([function[2] for function in linear_functions])

# predictions[i, k] is the value predicted for row i by the regression on column k alone. #
# Broadcasting replaces the row-by-row loop, so rows are never addressed through their labels #
predictions = f(X.to_numpy(dtype=float), slopes, intercepts)

n = predictions.shape[1]  # number of single-variable regressions #

# Per-variable errors: column k holds the residuals of regression k on every row #
column_errors = predictions - target[:, np.newaxis]
column_mse = np.mean(column_errors ** 2, axis=0)

# Each weight is the inverse square of the variable's error, so better predictors count more. #
# Note that RMSE ** -2 is simply 1 / MSE. A variable with zero error would give an infinite weight #
mae_weight = np.mean(np.abs(column_errors), axis=0) ** (-2)
mse_weight = column_mse ** (-2)
rmse_weight = np.sqrt(column_mse) ** (-2)
# R Squared weighting is intentionally left disabled, as in the original analysis: #
# r_sq_weight = 1 - np.sum(column_errors ** 2, axis=0) / np.sum((target - np.mean(target)) ** 2)

print(mae_weight)
print(mse_weight)
print(rmse_weight)
# print(r_sq_weight)

# Normalise each weight vector so it sums to 1 #
mae_weight = mae_weight / np.sum(mae_weight)
mse_weight = mse_weight / np.sum(mse_weight)
rmse_weight = rmse_weight / np.sum(rmse_weight)
# r_sq_weight = r_sq_weight / np.sum(r_sq_weight)


def _report_weighted(title, weights):
    """Combines the per-variable predictions with weights, prints and returns the error metrics."""
    combined = predictions @ weights  # weighted sum across variables for every row #
    errors = combined - target
    mean_abs = np.mean(np.abs(errors))
    mean_sq = np.mean(errors ** 2)
    root_mean_sq = np.sqrt(mean_sq)

    print(title)
    print("Mean Absolute Error:", mean_abs)
    print("Mean Squared Error:", mean_sq)
    print("Root Mean Square Error", root_mean_sq)
    # print("R Squared Error:", 1 - np.sum(errors ** 2) / np.sum((target - np.mean(target)) ** 2))
    return combined, mean_abs, mean_sq, root_mean_sq


print("Weighted Results")

# After the loop, average_prediction / mae / mse / rmse hold the RMSE-weighted results #
for title, weights in (
    ("Mean Absolute Weights", mae_weight),
    ("Mean Squared Weights:", mse_weight),
    ("Root Mean Square Weights:", rmse_weight),
    # ("R Squared Weights", r_sq_weight),
):
    average_prediction, mae, mse, rmse = _report_weighted(title, weights)

print()

[0.18972963 0.17598708 0.17295469 0.18041865 0.17128123 0.17276355
 0.19042293 0.18528736 0.17820481 0.19078453 0.20383474 0.1744678
 0.17226226 0.17306465 0.17411924 0.17365812 0.19242559 0.17921878
 0.17211011 0.17394571 0.17643151 0.1747914  0.1805325  0.18021457
 0.17487322 0.17795378 0.6909899  1.53162632]
[0.01089879 0.00952291 0.00942075 0.00974844 0.00924567 0.00920825
 0.01036985 0.01009349 0.00951347 0.01047456 0.012887   0.00928998
 0.00927313 0.00926399 0.00927443 0.00922348 0.01163393 0.00963717
 0.00936123 0.00928262 0.00949191 0.00935134 0.01003145 0.00981077
 0.00939087 0.00936396 0.09158553 0.37707486]
[0.10439727 0.09758542 0.09706054 0.0987342  0.0961544  0.0959596
 0.10183247 0.10046638 0.09753701 0.10234532 0.1135209  0.09638456
 0.0962971  0.09624962 0.09630384 0.09603894 0.10786068 0.09816908
 0.09675343 0.09634634 0.09742645 0.09670231 0.10015714 0.09904931
 0.09690648 0.09676757 0.30263101 0.61406421]
Weighted Results
Mean Absolute Weights
Mean Absolute Error: 

Least Squares Multivariate Regression

In [None]:
import pandas as pd
import numpy as np


# Testing #
# Load CSV file #
csv_file_path = '/content/drive/MyDrive/Student Performance/student performance.csv'  # Update with your CSV file path #
df = pd.read_csv(csv_file_path)

# We presume the last column holds the values we are predicting #
X_orig = df.select_dtypes(include='number').drop(columns=[df.columns[-1]])

# Design matrix: the numeric predictors plus a trailing column of ones, #
# which adds the $+ b$ (intercept) term to the analysis #
X = np.hstack((X_orig.to_numpy(dtype=float), np.ones((len(X_orig), 1))))

Y = df.iloc[:, -1].to_numpy(dtype=float)

# Solve the least-squares problem min ||X Beta - Y||. This gives the same Beta as #
# (X^T X)^-1 X^T Y when X has full column rank, but does not form or invert X^T X, #
# so it stays stable and still returns a solution when columns are collinear #
Beta, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)

# Beta[:-1] are the slopes and Beta[-1] is the intercept; X already carries the ones column #
prediction = X @ Beta

residuals = prediction - Y

# Mean Absolute Error #
mae = np.mean(np.abs(residuals))

# Mean Squared Error for Multivariate Analysis #
mse = np.mean(residuals ** 2)

# Root Mean Square Error #
rmse = np.sqrt(mse)

# R Squared is intentionally left disabled, as in the original analysis: #
# ss_res = np.sum(residuals ** 2)
# ss_tot = np.sum((Y - np.mean(Y)) ** 2)
# r_squared = 1 - (ss_res / ss_tot)

# Results #
print("Multivariate Results:")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Square Error", rmse)
# print("R Squared Error:", r_squared)

Multivariate Results:
Mean Absolute Error: 0.7699837670623104
Mean Squared Error: 1.497677410326572
Root Mean Square Error 1.223796310799543


This dummy codes the data.

In [None]:
import pandas as pd

def dummy_convert(csv_file_path):
    """
    Converts categorical columns to dummy variables in the given CSV file.

    Every text column is replaced by 0/1 indicator columns named
    "<column>_<value>", with the first category dropped as the reference
    level. Boolean columns are converted to 0/1 in place. The indicator
    columns are appended after the remaining original columns.

    Parameters:
    - csv_file_path: string, the path of the CSV file

    Output:
    - Rewrites CSV file to have dummy coded columns for categorical variables
    """

    df = pd.read_csv(csv_file_path)

    categorical_columns = []

    # Iterate through each column #
    for column in df.columns:
        values = df[column]
        # Text columns may use the classic object dtype or pandas' dedicated string dtype #
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            categorical_columns.append(column)
        elif pd.api.types.is_bool_dtype(values):
            # Convert boolean values to 1s and 0s #
            df[column] = values.astype(int)

    # dtype=int keeps the indicators numeric; boolean indicators would be written to the #
    # CSV as True/False and then be excluded by select_dtypes(include='number') later #
    dummy_frames = [
        pd.get_dummies(df[column], prefix=column, drop_first=True, dtype=int)
        for column in categorical_columns
    ]

    # Drop the original text columns and append all indicator columns in one step #
    df = pd.concat([df.drop(columns=categorical_columns), *dummy_frames], axis=1)

    # Write back to the CSV file (this overwrites the input) #
    df.to_csv(csv_file_path, index=False)



# Testing #
# Dummy codes the copy of the dataset at this path; the CSV is overwritten in place #

dummy_convert("/content/drive/MyDrive/Student Performance/dummy student performance (1).csv") # Replace this with your CSV file path #


Run this to encode any remaining dichotomous (two-valued) text columns as 0 and 1.

In [None]:
import pandas as pd

def convert_categorical_to_numeric(csv_file_path):
    """
    Recodes every dichotomous text column of a CSV file as 0/1, in place.

    A column is recoded when it holds text and has exactly two distinct
    non-missing values. The value that appears first in the file becomes 0
    and the other becomes 1. Missing entries stay missing instead of being
    counted as a category of their own.

    Parameters:
    - csv_file_path  string, the path of the CSV file

    Output:
    -* Rewrites CSV file to have dichotomous variables in 0,1 form

    """

    df = pd.read_csv(csv_file_path)

    # Iterate through each column #
    for column in df.columns:
        values = df[column]

        # Text columns may use the classic object dtype or pandas' dedicated string dtype #
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        if not is_text:
            continue

        # Drop NaN first so a missing value is never mistaken for a category #
        categories = values.dropna().unique()
        if len(categories) != 2:
            continue

        # Map the values to 0 and 1 in order of first appearance #
        mapping = {value: idx for idx, value in enumerate(categories)}
        df[column] = values.map(mapping)

    # Write back to the CSV file (this overwrites the input) #
    df.to_csv(csv_file_path, index=False)

# Recodes two-valued text columns of the dummy-coded copy as 0/1; the CSV is overwritten in place #
convert_categorical_to_numeric("/content/drive/MyDrive/Student Performance/dummy student performance (1).csv") # And replace this with your CSV file path #


Run this to move G3 to the back:

In [None]:
def move_column_to_last(csv_file_path, column_name="G3"):
    """
    Reorders the CSV file so that the given column becomes its final column.

    The file is read, the column is moved to the end, and the result is
    written back to the same path. If the column does not exist the file is
    left untouched. A status message is printed in either case.

    Parameters:
    - csv_file_path: string, the path of the CSV file
    - column_name: string, the name of the column to be moved (default is "G3")

    Output:
    - Rewrites CSV file with the specified column moved to the last column
    """

    df = pd.read_csv(csv_file_path)

    # Nothing to move: report it and leave the file as it is #
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in the DataFrame.")
        return

    # Pull the column out of its current slot and re-add it at the end #
    ordering = df.columns.tolist()
    ordering.append(ordering.pop(ordering.index(column_name)))

    # Write back to the CSV file #
    df[ordering].to_csv(csv_file_path, index=False)
    print(f"Column '{column_name}' moved to the last position successfully.")

# Moves the default column "G3" to the end so the later cells can treat the last column as the target #
move_column_to_last("/content/drive/MyDrive/Student Performance/dummy student performance (1).csv") # Replace this with your CSV file path #

Column 'G3' moved to the last position successfully.


In [None]:
import pandas as pd
import numpy as np


# Testing #
# Load CSV file #
csv_file_path = '/content/drive/MyDrive/Student Performance/dummy student performance (1).csv'  # Update with your CSV file path #
df = pd.read_csv(csv_file_path)

# We presume the last column holds the values we are predicting #
X_orig = df.select_dtypes(include='number').drop(columns=[df.columns[-1]])

# Design matrix: the numeric predictors plus a trailing column of ones, #
# which adds the $+ b$ (intercept) term to the analysis #
X = np.hstack((X_orig.to_numpy(dtype=float), np.ones((len(X_orig), 1))))

Y = df.iloc[:, -1].to_numpy(dtype=float)

# Solve the least-squares problem min ||X Beta - Y||. This gives the same Beta as #
# (X^T X)^-1 X^T Y when X has full column rank, but does not form or invert X^T X, #
# so it stays stable and still returns a solution when dummy columns are collinear #
Beta, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)

# Beta[:-1] are the slopes and Beta[-1] is the intercept; X already carries the ones column #
prediction = X @ Beta

residuals = prediction - Y

# Mean Absolute Error #
mae = np.mean(np.abs(residuals))

# Mean Squared Error for Multivariate Analysis #
mse = np.mean(residuals ** 2)

# Root Mean Square Error #
rmse = np.sqrt(mse)

# R Squared is intentionally left disabled, as in the original analysis: #
# ss_res = np.sum(residuals ** 2)
# ss_tot = np.sum((Y - np.mean(Y)) ** 2)
# r_squared = 1 - (ss_res / ss_tot)

# Results #
print("Multivariate Results:")
# NOTE(review): this label looks like a leftover from the weighted-prediction cell; no weights #
# are used here. It is kept so the printed output matches the recorded run - confirm and remove #
print("R Squared Weights")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Square Error", rmse)
# print("R Squared Error:", r_squared)

Multivariate Results:
R Squared Weights
Mean Absolute Error: 0.7615986061532586
Mean Squared Error: 1.4588533674625568
Root Mean Square Error 1.2078300242428803
