In [1]:
# imports
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.data_path``)."""

    # CSV file loaded with pd.read_csv later in the notebook; the path is
    # relative, so it is resolved against the current working directory.
    data_path = Path('./machine.csv')

In [13]:
class HelperFunctions:
    """Hand-rolled descriptive statistics built on NumPy array operations.

    Every method is static; inputs are expected to expose ``.shape`` and
    ``.sum()`` the way NumPy arrays do.
    """

    @staticmethod
    def mean(x):
        """Return the sum of ``x`` divided by the length of its first axis."""
        return x.sum() / x.shape[0]

    @staticmethod
    def std(x):
        """Return the population standard deviation of ``x`` (divides by n, not n - 1)."""
        count = x.shape[0]
        centered = x - HelperFunctions.mean(x)
        return np.sqrt(1 / count * (centered ** 2).sum())

    @staticmethod
    def covariance(x, y):
        """Return the population covariance of ``x`` and ``y`` as E[xy] - E[x]E[y]."""
        count = x.shape[0]
        mean_of_product = np.multiply(x, y).sum() / count
        product_of_means = HelperFunctions.mean(x) * HelperFunctions.mean(y)
        return mean_of_product - product_of_means

    @staticmethod
    def correlation(x, y):
        """Return the covariance of ``x`` and ``y`` divided by both standard deviations."""
        cov = HelperFunctions.covariance(x, y)
        # Adding to 0 mirrors a zero-initialised accumulator: a -0.0 covariance
        # becomes +0.0 before the divisions.
        return (0 + cov) / HelperFunctions.std(x) / HelperFunctions.std(y)

    @staticmethod
    def correlation_matrix(x):
        """Return the pairwise column correlations of 2-D ``x`` as a nested list.

        Entry ``[r][c]`` is the correlation between column ``r`` and column ``c``.
        """
        width = x.shape[1]
        return [
            [HelperFunctions.correlation(x[:, row], x[:, col]) for col in range(width)]
            for row in range(width)
        ]

    @staticmethod
    def conv_sum(x, y):
        """Return the sum of the element-wise product of ``x`` and ``y``."""
        products = x * y
        return products.sum()

In [4]:
# Read the dataset from the CSV path configured in Config.data_path.
machine_df = pd.read_csv(Config.data_path)
# Bare last expression: the notebook renders the first rows as a quick check
# of the column names and values.
machine_df.head()

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
0,adviser,32/60,125,256,6000,256,16,128,198
1,amdahl,470v/7,29,8000,32000,32,8,32,269
2,amdahl,470v/7a,29,8000,32000,32,8,32,220
3,amdahl,470v/7b,29,8000,32000,32,8,32,172
4,amdahl,470v/7c,29,8000,16000,32,8,16,132


In [5]:
class MultipleLinearRegression:
    """Multiple linear regression trained with stochastic gradient descent.

    The model is ``y_hat = theta[0, 0] + sum_j theta[0, j] * x_j``. ``theta`` is
    a ``(1, feat_cnt + 1)`` row vector whose first entry is the intercept.
    """

    def __init__(self, X, y):
        """Store the training data and draw the initial parameters.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Target values, one per row of ``X``.
        """
        self.X = X
        self.y = y
        self.feat_cnt = X.shape[1]
        # Standard-normal initialisation from NumPy's global RNG; call
        # np.random.seed(...) beforehand for a reproducible start.
        self.theta = np.random.normal(0, 1, size=(1, self.feat_cnt+1))

    def _design(self, inp):
        """Return ``inp`` as a float 2-D array with a leading column of ones."""
        rows = np.atleast_2d(np.asarray(inp, dtype=float))
        # The intercept must multiply a constant 1; a zero column would cancel
        # theta[0, 0] out of every prediction.
        return np.hstack((np.ones((rows.shape[0], 1)), rows))

    def predict(self, inp):
        """Predict the target for one sample or a batch of samples.

        Args:
            inp: Features of shape ``(feat_cnt,)``, ``(1, feat_cnt)`` or
                ``(n, feat_cnt)``.

        Returns:
            A scalar for a single sample (1-D input or a one-row 2-D input),
            otherwise a 1-D array with one prediction per row.
        """
        single = np.ndim(inp) < 2 or np.shape(inp)[0] == 1
        preds = self._design(inp) @ self.theta.ravel()
        return preds[0] if single else preds

    def error(self, inp, targ):
        """Return half the sum of squared residuals of ``inp`` against ``targ``.

        Raises:
            ValueError: If ``targ`` does not hold one value per prediction.
        """
        pred = self.predict(inp)
        # Align the target with the prediction so a column-shaped target cannot
        # silently broadcast into an (n, n) residual matrix.
        targ = np.asarray(targ, dtype=float).reshape(np.shape(pred))
        return 1/2 * np.sum((pred - targ)**2)

    def grad(self, inp=None, targ=None):
        """Return the gradient of :meth:`error` with respect to ``theta``.

        Args:
            inp: Optional features; defaults to the stored training matrix.
            targ: Optional targets; defaults to the stored training targets.

        Returns:
            Array of shape ``(1, feat_cnt + 1)``, the same layout as ``theta``.

        Raises:
            ValueError: If only one of ``inp`` and ``targ`` is supplied.
        """
        if inp is None and targ is None:
            inp, targ = self.X, self.y
        elif inp is None or targ is None:
            raise ValueError('inp and targ must be given together')
        design = self._design(inp)
        residual = design @ self.theta.ravel() - np.asarray(targ, dtype=float).ravel()
        # d/dtheta of 1/2 * sum(residual**2) is residual^T @ design.
        return (residual @ design).reshape(1, -1)

    def sgd(self, lr=0.01, epochs=100):
        """Fit ``theta`` in place with per-sample stochastic gradient descent.

        Each epoch visits every training row once, in an order drawn from
        NumPy's global RNG. A fixed step size diverges on unscaled features,
        so standardise the inputs or choose a suitably small ``lr``.

        Args:
            lr: Step size applied to every per-sample gradient.
            epochs: Number of full passes over the training data.

        Returns:
            List with the training error after each epoch.

        Raises:
            FloatingPointError: If the training error stops being finite.
        """
        features = np.asarray(self.X, dtype=float)
        targets = np.asarray(self.y, dtype=float).ravel()
        history = []
        for _ in range(epochs):
            for idx in np.random.permutation(features.shape[0]):
                step = self.grad(features[idx], targets[idx])
                self.theta = self.theta - lr * step
            loss = self.error(features, targets)
            if not np.isfinite(loss):
                raise FloatingPointError(
                    'training error is no longer finite; scale the features or lower lr'
                )
            history.append(loss)
        return history

In [6]:
# Feature matrix: every column except the target 'PRP' and the two text
# columns 'vendor name' and 'Model Name'.
non_feature_cols = ['PRP', 'vendor name', 'Model Name']
X = machine_df.drop(columns=non_feature_cols).to_numpy()
# Target vector: the 'PRP' column.
y = machine_df['PRP'].to_numpy()

# Problem 1

In [8]:
# correlation matrix among feature variables
# Entry [i][j] is the correlation between columns i and j of X; the helper
# returns a nested Python list rather than an array.
cm = HelperFunctions.correlation_matrix(X)
# Bare last expression so the notebook displays the matrix.
cm

[[1.0,
  -0.335642195508513,
  -0.3785606178715616,
  -0.32099978976972116,
  -0.3010897113958365,
  -0.25050229252073214],
 [-0.335642195508513,
  1.0,
  0.7581573478037231,
  0.5347290904835281,
  0.5171892214181076,
  0.2669074311252619],
 [-0.3785606178715616,
  0.7581573478037232,
  1.0000000000000002,
  0.5379898185263052,
  0.5605134214806345,
  0.5272461816383357],
 [-0.32099978976972116,
  0.5347290904835281,
  0.5379898185263051,
  1.0000000000000004,
  0.5822454590800031,
  0.48784576802832064],
 [-0.3010897113958365,
  0.5171892214181076,
  0.5605134214806345,
  0.5822454590800031,
  1.0,
  0.5482812070286769],
 [-0.2505022925207322,
  0.2669074311252619,
  0.5272461816383357,
  0.4878457680283206,
  0.5482812070286769,
  1.0]]

In [10]:
# correlation with dependent variable
# The target array is identical on every iteration, so it is extracted once.
print(f'Correlation with PRP: ')
skipped_columns = ('PRP', 'vendor name', 'Model Name')
target_values = machine_df['PRP'].to_numpy()
for column in machine_df.columns:
    if column in skipped_columns:
        continue
    corr = HelperFunctions.correlation(machine_df[column].to_numpy(), target_values)
    print(f'{column}: {corr}')

Correlation with PRP: 
MYCT: -0.3070994470957124
MMIN: 0.7949313405266915
MMAX: 0.8630041243651341
CACH: 0.6626414266783193
CHMIN: 0.6089032834114069
CHMAX: 0.605209292812674


Using a threshold of 0.65 on the correlation with PRP, we select MMIN, MMAX, and CACH as our features.

# Problem 2

In [11]:
# data
# Drop the target, the two text columns, and the three features whose
# correlation with PRP fell below the 0.65 threshold (MYCT, CHMIN, CHMAX).
dropped_cols = ['PRP', 'vendor name', 'Model Name', 'MYCT', 'CHMIN', 'CHMAX']
X = machine_df.drop(columns=dropped_cols).to_numpy()
# Target vector: the 'PRP' column.
y = machine_df['PRP'].to_numpy()