# Linear Regression from scratch, without using scikit-learn, PyTorch or TensorFlow

In [23]:
import pandas as pd
import numpy as np

In [24]:
# Read the wine dataset and discard the 'Id' column in a single chained
# expression, so `df` is never bound to the frame that still carries it.
df = pd.read_csv('../data/WineQT.csv').drop(columns='Id')

## Train-Test Split

In [25]:
# Positional 75% / 25% split: the leading rows of `df` become the training set
# and the remaining rows the test set. Rows are taken in file order (no shuffle).
nrow = len(df)
train_test_index = int(0.75 * nrow)  # first row position that belongs to the test set

train_df, test_df = df.iloc[:train_test_index], df.iloc[train_test_index:]

In [26]:
# Every column except the last is used as a feature; the last column is the
# regression target. Slicing with -1: (rather than indexing with -1) keeps the
# target as a one-column DataFrame, so it converts to a 2-D array later on.
feature_columns = slice(None, -1)
target_column = slice(-1, None)

X_train, y_train = train_df.iloc[:, feature_columns], train_df.iloc[:, target_column]
X_test, y_test = test_df.iloc[:, feature_columns], test_df.iloc[:, target_column]

## Normalize the data

In [27]:
# Standardize each feature column with statistics computed on the training rows
# only; the same mean and scale are then applied to the test rows, so no
# test-set information enters the transform.
mean = X_train.mean(axis=0)
# A column that is constant across the training rows has std == 0, and dividing
# by it would fill that column with NaN/inf and poison the fit. Use a scale of 1
# for such columns so they are only centered.
std = X_train.std(axis=0).replace(0, 1.0)

X_train_norm = (X_train - mean) / std
X_test_norm = (X_test - mean) / std


## Train the model

In [22]:
# Design matrix: a leading column of ones (intercept term) followed by the
# standardized training features.
X_train_numpy = X_train_norm.to_numpy()
X_train_numpy = np.c_[np.ones((X_train_numpy.shape[0], 1)), X_train_numpy]

y_train_numpy = y_train.to_numpy()

# Ordinary least squares. Solving the least-squares problem directly gives the
# same coefficients as the normal-equation formula inv(X.T @ X) @ X.T @ y when X
# has full column rank, but avoids explicitly inverting X.T @ X, which amplifies
# rounding error and breaks down when features are (nearly) collinear.
# `my_model` has one row per design-matrix column (intercept first) and one
# column per target column.
my_model, *_ = np.linalg.lstsq(X_train_numpy, y_train_numpy, rcond=None)



## Predict on the test data

In [28]:
# Observed targets of the held-out rows, as a NumPy array.
y_test_numpy = y_test.to_numpy()

# Build the test design matrix the same way as the training one: a column of
# ones for the intercept, then the standardized features.
test_features = X_test_norm.to_numpy()
intercept_column = np.ones((test_features.shape[0], 1))
X_test_numpy = np.c_[intercept_column, test_features]

# Predictions are the design matrix times the fitted coefficients.
y_hat_numpy = X_test_numpy @ my_model

## Evaluate the model (R-squared and RMSE)

In [29]:
# Goodness of fit on the held-out rows.
y_test_mean = y_test_numpy.mean()

residuals = y_test_numpy - y_hat_numpy    # observed minus predicted
deviations = y_test_numpy - y_test_mean   # observed minus the test-set mean

SSR = (residuals**2).sum()    # sum of squared residuals
SST = (deviations**2).sum()   # total sum of squares around the mean

# R-squared: share of the test-set variance explained by the model.
R2 = 1 - (SSR/SST)
# RMSE: square root of the mean squared residual over the test rows.
RMSE = np.sqrt(SSR/len(y_test))

print(f"R-squared : {R2:.2f}")
print(f"RMSE : {RMSE:.2f}")


R-squared : 0.24
RMSE : 0.66
