In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [61]:
#Helper Functions
class Functions:
    
    ## function for random sampling of data
    def random_sampler(self, perc=0.1):
        prop = int(len(self.X)*perc) #proportion of the data to be used for each iteration, here we have set it to 10% of the data
        ind = np.random.choice(range(len(self.X)), prop, replace=False)
        test_ind = list(set(range(len(self.X))) - set(ind)) #for using the rest sample as test set
        return self.X[ind],self.y[ind], self.X[test_ind], self.y[test_ind]
    
    ## funtion for normalizing the features
    def normalize_features(self,X,append=True):
        X = (X - np.mean(X, 0)) / np.std(X, 0) #normalize the features
        if append:
            X = np.append(np.ones(X.shape[0]).reshape(-1,1),X,1) #append column of ones for intercept
        return X

In [69]:
import numpy as np

class Linear_Regression():
    def __init__(self, X, y, iterations=100, alpha=0.01):
        self.X = X
        self.y = y.reshape(-1, 1)
        self.iterations = iterations
        self.alpha = alpha
        self.theta = np.zeros((self.X.shape[1], 1))

    def fit(self):
        for _ in range(self.iterations):
            gradient = -(2/len(self.X)) * np.dot(self.X.T, (self.y - np.dot(self.X, self.theta)))
            self.theta = self.theta - self.alpha * gradient
        return self.theta

    def predict(self, X):
        y_hat = np.dot(X, self.theta)
        return y_hat

    # def loss(self):
    #     pred = np.dot(self.X, self.theta)
    #     return (np.square(self.y - pred)).mean()
    def loss(self,y,yhat):
       # y_pred_test = self.predict(X_test)
        loss_value = np.mean((y - yhat) ** 2)  # Mean Squared Error
        squared_diff = (y - yhat) ** 2
       # print(squared_diff)
        return loss_value


In [70]:
#Reading the data and splitting it into training and testing set
reg_data = pd.read_csv('regression.csv')
reg_data.head(3)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [34]:
reg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      1599 non-null   float64
 1   X2      1599 non-null   float64
 2   X3      1599 non-null   float64
 3   X4      1599 non-null   float64
 4   X5      1599 non-null   float64
 5   X6      1599 non-null   float64
 6   X7      1599 non-null   float64
 7   X8      1599 non-null   float64
 8   X9      1599 non-null   float64
 9   X10     1599 non-null   float64
 10  X11     1599 non-null   float64
 11  Y       1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [17]:
X,y = reg_data.iloc[:,:-1],reg_data.iloc[:,-1]
#let's split the data into a training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
#to check correlation of the features with target in training data
train_df = pd.concat([X_train,y_train],axis=1)
plt.figure(figsize=(12, 8))
corr = train_df.corr()
#to remove least 3 correlated features
least_corr = corr.iloc[:,-1].abs().nsmallest(3).index
X_train = X_train.drop(least_corr, axis=1)
X_test = X_test.drop(least_corr, axis=1)


#normalizing data 
X_train_normalized = Functions().normalize_features(X_train.values)
X_test_normalized = Functions().normalize_features(X_test.values)

<Figure size 1200x800 with 0 Axes>

In [72]:
#Fit the model
lin_reg_model=Linear_Regression(X_train_normalized,y_train.values)
lin_reg_model_params = lin_reg_model.fit()
linreg_y_hat=lin_reg_model.predict(X_test_normalized)

In [63]:
type(linreg_y_hat) ,"" ,type(y_test)

(numpy.ndarray, '', pandas.core.series.Series)

In [64]:
a=((linreg_y_hat.flatten() ))

In [65]:
b=((y_test.values))

In [74]:
lin_reg_model.loss(a, b)


1.052920456396039

In [67]:
#a

In [68]:
#b

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings("ignore")

class Optimization:
    def __init__(self, X, y, iterations=100, alpha=0.01):
        self.X = X
        self.y = y.reshape(-1, 1)
        self.iterations = iterations
        self.alpha = alpha
        self.theta = np.zeros((self.X.shape[1], 1))

    def gradient_descent(self):
        for _ in range(self.iterations):
            gradient = -(2/len(self.X)) * np.dot(self.X.T, (self.y - np.dot(self.X, self.theta)))
            self.theta = self.theta - self.alpha * gradient
        return self.theta


class Loss:
    @staticmethod
    def mean_squared_error(y_true, y_pred):
        return np.mean((y_true - y_pred) ** 2)


class LinearRegression:
    def __init__(self):
        self.optimization = None

    def fit(self, X_train, y_train, iterations=100, alpha=0.01):
        y_train = np.array(y_train)  # Convert y_train to a NumPy array
        self.optimization = Optimization(X_train, y_train, iterations, alpha)
        self.optimization.gradient_descent()

    def predict(self, X):
        if self.optimization is None:
            raise ValueError("Model not fitted yet. Call fit method before predict.")
        return np.dot(X, self.optimization.theta)

    def evaluate(self, y_true, y_pred):
        return Loss.mean_squared_error(y_true, y_pred)


# Reading the data
reg_data = pd.read_csv('regression.csv')
X, y = reg_data.iloc[:, :-1], reg_data.iloc[:, -1]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking correlation of the features with target in training data
train_df = pd.concat([X_train, y_train], axis=1)
plt.figure(figsize=(12, 8))
corr = train_df.corr()

# Removing least 3 correlated features
least_corr = corr.iloc[:, -1].abs().nsmallest(3).index
X_train = X_train.drop(least_corr, axis=1)
X_test = X_test.drop(least_corr, axis=1)
#normalizing data 
X_train_normalized = Functions().normalize_features(X_train.values)
X_test_normalized = Functions().normalize_features(X_test.values)
# Fitting the model
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train_normalized, y_train)
linreg_y_hat = lin_reg_model.predict(X_test_normalized)
loss_value = lin_reg_model.evaluate(y_test.values, linreg_y_hat.flatten())
print("Mean Squared Error:", loss_value)


Mean Squared Error: 1.046048563430105


<Figure size 1200x800 with 0 Axes>