# ASSESSMENT 2

## Implement the Linear Regression algorithm from scratch in Python using NumPy and Pandas, with Matplotlib for visualization.

## The algorithm must be implemented as a function with two arguments, x_train (the features) and y_train (the output). Use of any library that already provides an implementation of the algorithm is forbidden.

## The code must be uploaded to the portal and GitHub and be in Python Notebook format (.ipynb file). The GitHub link is to be attached. The repository has to be open.

## Evaluation will be based on code formatting, the final accuracy results, and short explanations of any data preprocessing done (data preprocessing and visualization earn bonus points). The explanations must be included in the same Python notebook, which can be done using markdown cells.

### Reading the Dataset

In [1]:
# importing required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the medical-cost CSV and preview its first five rows.
DATA_FILE = "Medical Price Dataset.csv"
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1338, 7)

In [5]:
# Per-column dtype and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
# Count the missing values in each column.

df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

### Preprocessing the data now

In [8]:
# Encode the categorical `sex` column as integers (male -> 0, female -> 1).
# Labels missing from the table become NaN, which is how Series.map treats them.
sex_codes = {'male': 0, 'female': 1}
df['sex'] = df['sex'].map(sex_codes)

In [9]:
# Encode the `smoker` flag as integers (no -> 0, yes -> 1).
smoker_codes = {'no': 0, 'yes': 1}
df['smoker'] = df['smoker'].map(smoker_codes)

In [10]:
# Replace each region name with an integer label.
# NOTE: the codes 1-4 impose an arbitrary order on a nominal category.
region_codes = {
    'southwest': 1,
    'southeast': 2,
    'northwest': 3,
    'northeast': 4,
}
df['region'] = df['region'].map(region_codes)

In [11]:
# Preview one row to check that the categorical columns now hold numeric codes.

df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,1,16884.924


In [12]:
# Correlation of every column with the target `charges`, strongest first.
corr = df.corr()['charges']
ranked_corr = corr.sort_values(ascending=False)
print(ranked_corr)

charges     1.000000
smoker      0.787251
age         0.299008
bmi         0.198341
children    0.067998
region      0.006208
sex        -0.057292
Name: charges, dtype: float64


In [13]:
# Interaction terms: each base feature multiplied by the 0/1 smoker flag,
# so the product is zero for non-smokers and the raw value for smokers.
for base_col in ('age', 'bmi'):
    df[f'{base_col}_smoker'] = df[base_col] * df['smoker']

In [14]:
# Re-rank the correlations with `charges` now that the interaction columns exist.
corr = df.corr()['charges']
ranked_corr = corr.sort_values(ascending=False)
print(ranked_corr)

charges       1.000000
bmi_smoker    0.845120
age_smoker    0.789253
smoker        0.787251
age           0.299008
bmi           0.198341
children      0.067998
region        0.006208
sex          -0.057292
Name: charges, dtype: float64


### Designing the Linear Regression algorithm

In [15]:
from sklearn.model_selection import train_test_split

# Model inputs and target column.
feature_cols = ['bmi_smoker', 'smoker', 'age', 'bmi']
X = df[feature_cols]
y = df['charges']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# (rows, selected feature columns) of the feature matrix.
X.shape

(1338, 4)

In [17]:
# Length of the target vector.
y.shape

(1338,)

In [18]:
# Standardize features using statistics from the training split only,
# so nothing about the test rows leaks into the scaling.
X_train_mean, X_train_std = x_train.mean(), x_train.std()

X_train_scaled = x_train.sub(X_train_mean).div(X_train_std)
X_test_scaled = x_test.sub(X_train_mean).div(X_train_std)

In [19]:
def linear_regression(x_train, y_train):
    """Fit ordinary least squares with an intercept term.

    Args:
        x_train: Feature matrix (DataFrame or array-like) with one row per
            sample. A 1-D input is treated as a single feature column.
        y_train: Target values (Series or array-like), one per sample.

    Returns:
        1-D NumPy array ``theta`` of length ``n_features + 1``: the intercept
        first, followed by one weight per feature in column order.
    """
    X = np.asarray(x_train, dtype=float)
    y = np.asarray(y_train, dtype=float).reshape(-1, 1)
    X_b = np.c_[np.ones((len(X), 1)), X]  # Add intercept term
    # theta = pinv(X_b) @ y equals the normal-equation solution
    # inv(X_b.T @ X_b) @ X_b.T @ y whenever that inverse exists, but it stays
    # well defined (minimum-norm solution) when features are collinear, where
    # the explicit inverse raises LinAlgError or returns unstable values.
    theta = np.linalg.pinv(X_b).dot(y)
    return theta.flatten()

def predict(x, theta):
    """Evaluate a fitted linear model on new samples.

    Args:
        x: Feature matrix (DataFrame or array-like) with one row per sample
            and the same column order used when fitting.
        theta: Coefficients with the intercept first, followed by one weight
            per feature.

    Returns:
        NumPy array holding ``[1, x_row] @ theta`` for every row of ``x``.
    """
    # np.asarray accepts both DataFrames and plain arrays, whereas `.values`
    # exists only on pandas objects.
    X = np.asarray(x, dtype=float)
    X_b = np.c_[np.ones((len(X), 1)), X]  # Prepend the intercept column
    return X_b.dot(theta)

### Training the model

In [20]:
# Fit on the standardized training features; `infer` holds the returned coefficient vector.
infer = linear_regression(X_train_scaled, y_train)

In [21]:
# Predict charges for the held-out rows using the fitted coefficients.
y_pred = predict(X_test_scaled, infer)

### Performance Evaluation

In [22]:
# Put actual and predicted charges side by side and show the first ten rows.
comparison = {
    'Actual': y_test.to_numpy().flatten(),
    'Predicted': y_pred.flatten(),
}
ans = pd.DataFrame(comparison)
print(ans.head(10))

        Actual     Predicted
0   9095.06825   9933.222188
1   5272.17580   7581.345229
2  29330.98315  33390.341576
3   9301.89355  10204.248587
4  33750.29180  28782.441449
5   4536.25900   7162.079010
6   2117.33885   2993.032578
7  14210.53595  15099.363899
8   3732.62510   5341.581512
9  10264.44210  11028.143864


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the held-out predictions with each metric in turn.
scorers = (mean_absolute_error, mean_squared_error, r2_score)
mae, mse, r2 = (score(y_test, y_pred) for score in scorers)
rmse = np.sqrt(mse)  # square root of the MSE

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

MAE: 2852.29
MSE: 21897324.42
RMSE: 4679.46
R² Score: 0.86
