# Overview  

This is the first notebook in a series where I'll be implementing machine learning algorithms using the **Scikit-Learn** library.  
I'll begin with **Linear Regression** — let's get started!  


# Imports

In [20]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import math

# Data Loading and Analysis

In [3]:
# Read both splits from CSV files located in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [7]:
# (rows, columns) of the training split followed by the testing split.
tuple(frame.shape for frame in (train_data, test_data))

((700, 2), (300, 2))

In [8]:
# Count the missing values per column, first for the training split and
# then for the testing split.
for frame in (train_data, test_data):
    print(frame.isna().sum())

x    0
y    1
dtype: int64
x    0
y    0
dtype: int64


In [9]:
# Remove the training rows that contain a missing value (the check above
# reported one missing `y`). The name is rebound instead of using
# inplace=True, so the frame object shown by earlier cells is not mutated
# behind the reader's back; downstream cells see the same result either way.
train_data = train_data.dropna()

In [11]:
# Re-run the per-column missing-value count on both splits after the drop.
for frame in (train_data, test_data):
    print(frame.isna().sum())

x    0
y    0
dtype: int64
x    0
y    0
dtype: int64


In [13]:
# Peek at the first five training rows.
train_data.head(n=5)

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


# Data Visualization

In [17]:
# Scatter plot of the training split to eyeball the relationship between
# the feature `x` and the target `y`. The frame and column names are passed
# explicitly, and a title is set so the figure is self-explanatory when the
# notebook is skimmed.
fig = px.scatter(
    train_data,
    x='x',
    y='y',
    title='Training data: y vs. x',
    template='seaborn',
)
fig.show(renderer='iframe')

# Data Preprocessing

In [18]:
# Pull the feature column `x` and the target column `y` out of each split
# as plain NumPy arrays.
X_train, y_train = train_data['x'].to_numpy(), train_data['y'].to_numpy()
X_test, y_test = test_data['x'].to_numpy(), test_data['y'].to_numpy()

In [23]:
""" Before we scale our data, we need to make sure it has the right shape."""
print(X_train.shape, X_test.shape)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
print(X_train.shape, X_test.shape)

(699,) (300,)
(699, 1) (300, 1)


In [24]:
""" Scaling the data, so that our model converges faster"""
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Sanity check on the standardized training feature: (standard deviation, mean).
X_train.std(), X_train.mean()

(np.float64(1.0), np.float64(-2.541283032046138e-17))

# Model Implementation

In [26]:
# Create the linear regression estimator, then fit it on the training
# feature matrix and training targets.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [27]:
# Testing the model
y_pred = lr.predict(X_test)

# Evaluation

In [28]:
""" We'll be using MSE, RMSE, R2_score and adjusted R2_score as our evaluation models."""
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - ((1-r2) * (y_test.shape[0] - 1)) / (y_test.shape[0] - 1 - X_test.shape[1])

In [29]:
# Print each metric on its own line as "<label>: <value>".
metric_report = {
    "Mean Squared Error (MSE)": mse,
    "Root Mean Squared Error (RMSE)": rmse,
    "R-squared (Coefficient of Determination)": r2,
    "Adjusted R-squared (Coefficient of Determination)": adj_r2,
}
for label, value in metric_report.items():
    print(f"{label}: {value}")

Mean Squared Error (MSE): 9.432922192039317
Root Mean Squared Error (RMSE): 3.0713062680298293
R-squared (Coefficient of Determination): 0.9888014444327563
Adjusted R-squared (Coefficient of Determination): 0.9887638653872286
