In [24]:
import numpy as np
import pandas as pd

# Read the CSV file
data = pd.read_csv('data.csv')
column_names = data.columns.tolist()

# Print column names
print(column_names)



['SAT', 'GPA']


In [25]:
data.shape

(84, 2)

In [26]:
# Extract the independent variable (X) and dependent variable (y)
X = data['SAT'].values
y = data['GPA'].values

In [27]:
# Normalize the independent variable
X = (X - np.mean(X)) / np.std(X)

# Add a column of ones to X for the intercept term
X = np.column_stack((np.ones(len(X)), X))

# Initialize the parameters
theta = np.zeros(X.shape[1])

# Define the cost function
def cost_function(X, y, theta):
    m = len(y)
    h = np.dot(X, theta)
    J = (1 / (2 * m)) * np.sum((h - y) ** 2)
    return J

# Define the gradient descent function
def gradient_descent(X, y, theta, learning_rate, num_iterations):
    m = len(y)
    J_history = []
    for i in range(num_iterations):
        h = np.dot(X, theta)
        gradient = (1 / m) * np.dot(X.T, (h - y))
        theta = theta - learning_rate * gradient
        J_history.append(cost_function(X, y, theta))
    return theta, J_history

# Set the learning rate and number of iterations
learning_rate = 0.001
num_iterations = 10000

# Run gradient descent
theta_optimal, J_history = gradient_descent(X, y, theta, learning_rate, num_iterations)

# Print the optimal parameters
print("Optimal theta:", theta_optimal)

# Print the final cost
final_cost = cost_function(X, y, theta_optimal)
print("Final cost:", final_cost)

# Calculate the accuracy (R-squared)
y_pred = np.dot(X, theta_optimal)
ss_total = np.sum((y - np.mean(y)) ** 2)
ss_residual = np.sum((y - y_pred) ** 2)

if ss_total != 0:  # Add a check to avoid division by zero
    accuracy = 1 - (ss_residual / ss_total)
else:
    accuracy = float('inf')  # Handle the case when ss_total is zero

print("Accuracy (R-squared):", accuracy)


Optimal theta: [3.33008766 0.17202913]
Final cost: 0.02165044467682986
Accuracy (R-squared): 0.40600360351084597


y = 3.3302381 + 0.1720369 * X : The first value, 3.3302381, represents the intercept term, and the second value, 0.1720369, represents the coefficient for the independent variable X.