# Assignment 2 Machine Learning
## Multiple Linear Regression without using Scikit - Learn

# Libraries

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Dataset

In [70]:
data = pd.read_csv('home.txt', header = None, names =  ['size', 'bedroom', 'price'])
data.head()

Unnamed: 0,size,bedroom,price
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


# Assignment No 1

### Statistical summary of the data

In [71]:
data.describe()

Unnamed: 0,size,bedroom,price
count,47.0,47.0,47.0
mean,2000.680851,3.170213,340412.659574
std,794.702354,0.760982,125039.899586
min,852.0,1.0,169900.0
25%,1432.0,3.0,249900.0
50%,1888.0,3.0,299900.0
75%,2269.0,4.0,384450.0
max,4478.0,5.0,699900.0


### Data types

In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   size     47 non-null     int64
 1   bedroom  47 non-null     int64
 2   price    47 non-null     int64
dtypes: int64(3)
memory usage: 1.2 KB


## Modeling

### Splitting Independent and Dependent Variable

In [73]:
data = data.sample(frac = 1, random_state=0)
X = data.iloc[:, :-1].values
y = data.iloc[:,  -1].values

### Splitting test and training dataset

In [74]:
divider = int(len(data) * 0.8)
X_train = X[:divider]
X_test = X[divider:]
y_train = y[:divider]
y_test = y[divider:]

### Feature Scaling

In [75]:
mean = np.mean(X_train, axis=0)
std_dev = np.std(X_train, axis=0)

X_train = (X_train - mean) / std_dev
X_test = (X_test - mean) / std_dev

Feature scaling is necesseary because the independent variable has a different range in term of its value, so we will be using StandardScaler to have all the independent feature in the same range of scale. <br>
We use StandardScaler because its ensure all the features contribute equally to the analysis.

### Making The Model

In [76]:
class LinearRegression:
    def __init__(self):
        self.intercept_ = None
        self.coeff_ = None

    def fit(self, X, y):
        X = np.insert(X, 0, 1, axis = 1)
        betas = np.dot(np.dot(np.linalg.inv(np.dot(X.T, X)), X.T), y)
        self.intercept_ = betas[0]
        self.coeff_ = betas[1:]

    def predict(self, X):
        y_pred = np.dot(X, self.coeff_) + self.intercept_
        return y_pred

### Fitting

In [77]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Prediction

### Making Prediction

In [78]:
y_pred = regressor.predict(X_test)

## Evaluation

### Evaluation of the Model

R2 Score

In [79]:
def r2_score(y_true, y_pred):
    ss_residual = np.sum((y_true - y_pred) ** 2)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_residual / ss_total)
    return r2

Mean Absolute Error

In [80]:
def mean_absolute_error(y_true, y_pred):
    mae = np.mean(np.abs(y_true - y_pred))
    return mae

Root mean Squarred Error

In [81]:
def root_mean_squared_error(y_true, y_pred):
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return rmse

In [82]:
print('Model Evaluation with R2Score: ', r2_score(y_test, y_pred))
print('Model Evaluation with MAE: ', mean_absolute_error(y_test, y_pred)) 
print('Model Evaluation with RMSE: ', root_mean_squared_error(y_test, y_pred)) 

Model Evaluation with R2Score:  0.7023616105300725
Model Evaluation with MAE:  51991.62760647411
Model Evaluation with RMSE:  63045.642663314866


In [83]:
np.printoptions(precision=2)
y_pred_reshaped = y_pred.reshape(-1, 1)
y_test_reshaped = y_test.reshape(-1, 1)
result = np.concatenate((y_pred_reshaped, y_test_reshaped), axis=1)
print(result)

[[329086.44143211 259900.        ]
 [244896.87318392 229900.        ]
 [343616.58762908 255000.        ]
 [471601.03560062 599000.        ]
 [278892.40313057 242500.        ]
 [355749.60217459 287000.        ]
 [240097.50425028 239500.        ]
 [280691.52827685 232000.        ]
 [360215.04339511 399900.        ]
 [205501.41498308 179900.        ]]


# Assignment No 2

### Machine Learning model
<img src = 'overfitting_2.png'>
<img src ='model-complexity-vs-model-overfitting-vs-model-accuracy.png'>

### How to avoid Underfitting?<br>
Underfitting happens when a model <b> has not learned the patterns in the training data well and is unable to generalize well on the new data</b><br>
1. Underfit model has <b>poor training data performance </b> making it unreliable for prediction.<br>
2. Happens because <b>high bias and low variance</b>.

<b>There are several ways to avoid underfiting such as:</b><br>
1. Increase Model Complexity<br>
Using a more complex model that can capture more patterns in the data.<br>
For startes, if we are using Linear Regression to predict something, consider using a Polynomal Regression or more sophisticated algorithms like decisions tree, random forest, etc that can capture more pattern in the given data.
<br>

2. Regularization<br>
Regularization techniques like L1 and L2 regularization are used to prevent overfitting by penalizing large coefficients. <br>

3. Increase Training Data<br>
Insufficient training data can also lead to underfitting, especially if done by complex model. it would be good to collect more data with more variations to provide the model a good learning source. <br>

4. Cross - Validation<br>
Techniques like cross - validation will evaluate model's performance on multiple subsets of data, if there too many poor performance on different datasets, then the model is underfit, and needs to be fixed.<br>

5. Feature engineering<br>
Add more relevant feature to the model / data and remove unrelevant feature. By doing Feature Engineering, we can provide more information to the model.<br>

6. Reduce Noise <br>
Noisy features or outliers can hinder model's ability to learn meaningful patterns. We need to preprocess data to remove outliers, smooth noisy features.

source:
1. https://www.simplilearn.com/tutorials/machine-learning-tutorial/overfitting-and-underfitting<br>
2. https://www.javatpoint.com/overfitting-and-underfitting-in-machine-learning<br>
3. https://www.ibm.com/topics/underfitting#:~:text=Underfitting%20occurs%20when%20a%20model,poor%20performance%20of%20the%20model.<br>
4. https://algorit.ma/blog/data-science/overfitting-underfitting/<br>

### Explain two types of Regularization techniques

Regularizations are technique to calibrate machine learning models to <b>minimize adjusted loss function, and penalizing large coefficients</b> in order to avoid <b>underfitting or overfitting.</b>

Penalty 

<img src = '1_zMLv7EHYtjfr94JOBzjqTA.png'>

There are two types of regularization:
1. L1 Regularization also known as <b>Lasso Regression</b><br>
    a. L1 Regularization adds the penalty to the cost function, the absolute values of the model coefficients.<br>
    b. The penalty term is calculated as the sum of the absolute values of the coefficients multiplied by regularization parameter (lambda or alpha).<br>
    c. L1 Regularization encourages sparsity in the model, meaning it tends to force the coefficients of less importants feature to zero, effectively performing feature selection.<br>
    d. L1 Regularization is useful when dealing with high - dimensional datasets with many irrelevant features.

2. L2 Regression, also known as <b>Ridge Regression</b><br>
    a. L2 Regularization adds a penalty term to the cost function proportional to the squared values of model coefficients.<br>
    b. The penalty term is calculated of the squared values of the coefficients multiplied by regularization parameter (lambda or alpha).<br>
    c. L2 Regularization penalize the large coefficients, making it more smooth rather penalize the small coefficients.<br>
    d. L2 does not enforce sparcity in the model, rather it shrinks the coefficients of all features toward zero, but rarely to exaclty zero.<br>
    e. L2 Regularization is effective at reducing the impact of multicollinearity (correlation between features) by spreadubg the coefficients weight accross correlated features.


Source<br>
https://www.simplilearn.com/tutorials/machine-learning-tutorial/regularization-in-machine-learning#:~:text=Regularization%20refers%20to%20techniques%20that,on%20an%20over%2Dfitted%20model<br>
https://www.javatpoint.com/regularization-in-machine-learning<br>
https://towardsdatascience.com/regularization-in-machine-learning-76441ddcf99a

Calculate output k-fold Cross-Validation
<img src = '1_PdwlCactbJf8F8C7sP-3gw.png'>

To do k fold, there are several steps needs to do before doing k fold validation such as:
1. Randoming / shuffling sample data<br>
    We need to shuffle the datasets that is randomly distributed across folds to prvent bias in the cross - validation process.

2. Splitting Data into Training and Test Sets<br>
    Before performing k - fold cross - validation, we need to separate test set to evaluate the model's final performacne after hyperparameter tuning.

3. Define K<br>
    To perform how many K - fold cross validation 

Cross validation procedure:
1. Divide the training data into k folds, where each folds 
    acts as a validation set once, and the rest is used for training

2. Traing the model K times, where each fold helds different
    training and validation datasets.

3. Evaluate model's performance using evaluation metrics such as RMSE. <br>

4. Compute the average performance metric accross the folds


source:
1. https://medium.com/the-owl/k-fold-cross-validation-in-keras-3ec4a3a00538

In [84]:
import numpy as np

dataset = np.array([[10], [9], [8], [7], [6], [5], [4], [3], [2], [1]])
np.random.shuffle(dataset)


train_test_split_index = int(0.8 * len(dataset))  
train_set = dataset[:train_test_split_index]
test_set = dataset[train_test_split_index:]

k = 5
samples_per_fold = len(train_set) // k

for i in range(k):
    start_index_val = i * samples_per_fold
    end_index_val = (i + 1) * samples_per_fold

    validation_set = train_set[start_index_val:end_index_val]

    
    training_set = np.concatenate((train_set[:start_index_val], train_set[end_index_val:]))

    print(f'Fold {i+1}:')
    print("Training set:", training_set.flatten())
    print("Validation set:", validation_set.flatten())
    print()


print("Test set :", test_set.flatten())

Fold 1:
Training set: [ 7  3  2  8  1 10  6]
Validation set: [4]

Fold 2:
Training set: [ 4  3  2  8  1 10  6]
Validation set: [7]

Fold 3:
Training set: [ 4  7  2  8  1 10  6]
Validation set: [3]

Fold 4:
Training set: [ 4  7  3  8  1 10  6]
Validation set: [2]

Fold 5:
Training set: [ 4  7  3  2  1 10  6]
Validation set: [8]

Test set : [9 5]


In [85]:
import numpy as np
from sklearn.model_selection import KFold

dataset = np.array([[10], [9], [8], [7], [6], [5], [4], [3], [2], [1]])

k = 5
kf = KFold(n_splits=k)
validation_outputs = []


for train_index, test_index in kf.split(dataset):
    
    X_train, X_test = dataset[train_index], dataset[test_index]
    validation_output = X_test.mean()  
    validation_outputs.append(validation_output)


average_output = np.mean(validation_outputs)
print("Average output of 5-fold cross-validation:", average_output)


Average output of 5-fold cross-validation: 5.5


If we are using datasets from number 1:

In [101]:
from sklearn.model_selection import KFold
data = pd.read_csv('home.txt', names = ['size', 'bedroom', 'price'])
X = data[['size', 'bedroom']]
y = data['price']

k = 5
kf = KFold(n_splits=k)
validation_scores = []

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    score = r2_score(y_pred, y_test)
    validation_scores.append(score)


average_score = np.mean(validation_scores)
print("Average validation score of 5-fold cross-validation:", average_score)


Average validation score of 5-fold cross-validation: 0.5271896772950105
