In [1]:
# imports
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from scipy import stats

In [2]:
class Config:
    data_path = Path('./machine.csv')

In [3]:
class HelperFunctions:
    @staticmethod
    def mean(x):
        n = x.shape[0]
        return x.sum() / n
    
    @staticmethod
    def std(x):
        n = x.shape[0]
        x_mean = HelperFunctions.mean(x)
        return np.sqrt(1/n*(((x-x_mean)**2).sum()))
    
    @staticmethod
    def covariance(x, y):
        n = x.shape[0]
        xy_mean = np.multiply(x, y).sum() / n
        x_mean = HelperFunctions.mean(x)
        y_mean = HelperFunctions.mean(y)
        return xy_mean - x_mean*y_mean
    
    @staticmethod
    def correlation(x, y):
        corr = 0
        corr += HelperFunctions.covariance(x, y) 
        corr /= HelperFunctions.std(x)
        corr /= HelperFunctions.std(y)
        return corr
    
    @staticmethod
    def correlation_matrix(x):
        n = x.shape[1]
        cm = [[0 for i in range(n)] for i in range(n)]
        for i in range(n):
            for j in range(n):
                cm[i][j] = HelperFunctions.correlation(x[:, i], x[:, j]) 
        return cm
    
    @staticmethod
    def conv_sum(x, y):
        return (x*y).sum(axis=1)

In [4]:
machine_df = pd.read_csv(Config.data_path)
machine_df.head()

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
0,adviser,32/60,125,256,6000,256,16,128,198
1,amdahl,470v/7,29,8000,32000,32,8,32,269
2,amdahl,470v/7a,29,8000,32000,32,8,32,220
3,amdahl,470v/7b,29,8000,32000,32,8,32,172
4,amdahl,470v/7c,29,8000,16000,32,8,16,132


In [5]:
class MultipleLinearRegression:
    def __init__(self, X, y, lr=1e-3, iterations=1000):
        self.X = X
        self.y = y
        self.feat_cnt = X.shape[1]
        self.theta = np.random.randn(1, self.feat_cnt+1)
        self.lr = lr
        self.iterations = iterations
        self.normalize()
        
    def normalize(self):
        min_x = np.min(self.X, axis=1).reshape(self.X.shape[0], 1)
        max_x = np.max(self.X, axis=1).reshape(self.X.shape[0], 1)
        self.X = (self.X - min_x) / (max_x - min_x)
        
    def predict(self, inp):
        inp = np.hstack((np.ones((inp.shape[0], 1)), inp))
        return HelperFunctions.conv_sum(self.theta, inp)
    
    def normalize_predict(self, inp):
        mn = np.min(inp, axis=1).reshape(inp.shape[0], 1)
        mx = np.max(inp, axis=1).reshape(inp.shape[0], 1)
        inp = (inp-mn) / (mx-mn)
        return self.predict(inp)
        
    def sgd(self):
        itr = tqdm(range(self.iterations))
        for _ in itr:
            for idx, feat in enumerate(self.X):
                feat = np.expand_dims(feat, axis=0)
                error = self.predict(feat) - self.y[idx]
                grad = np.matmul(error,
                                np.hstack((np.ones((1, 1)), feat)))
                self.theta = self.theta - grad*self.lr
            itr.set_description(f'Error: {error}')

In [6]:
X = machine_df.drop(columns=['PRP', 'vendor name', 'Model Name']).to_numpy()
y = machine_df['PRP'].to_numpy()

# Problem 1

In [7]:
# correlation matrix among feature variables
cm = HelperFunctions.correlation_matrix(X)
cm

[[1.0,
  -0.335642195508513,
  -0.3785606178715616,
  -0.32099978976972116,
  -0.3010897113958365,
  -0.25050229252073214],
 [-0.335642195508513,
  1.0,
  0.7581573478037231,
  0.5347290904835281,
  0.5171892214181076,
  0.2669074311252619],
 [-0.3785606178715616,
  0.7581573478037232,
  1.0000000000000002,
  0.5379898185263052,
  0.5605134214806345,
  0.5272461816383357],
 [-0.32099978976972116,
  0.5347290904835281,
  0.5379898185263051,
  1.0000000000000004,
  0.5822454590800031,
  0.48784576802832064],
 [-0.3010897113958365,
  0.5171892214181076,
  0.5605134214806345,
  0.5822454590800031,
  1.0,
  0.5482812070286769],
 [-0.2505022925207322,
  0.2669074311252619,
  0.5272461816383357,
  0.4878457680283206,
  0.5482812070286769,
  1.0]]

In [8]:
# correlation with dependent variable
print(f'Correlation with PRP: ')
for i in machine_df.columns:
    if i not in ['PRP', 'vendor name', 'Model Name']:
        corr = HelperFunctions.correlation(machine_df[i].to_numpy(),
                                          machine_df['PRP'].to_numpy())
        print(f'{i}: {corr}')

Correlation with PRP: 
MYCT: -0.3070994470957124
MMIN: 0.7949313405266915
MMAX: 0.8630041243651341
CACH: 0.6626414266783193
CHMIN: 0.6089032834114069
CHMAX: 0.605209292812674


Considering threshold at 0.65 we pick MMIN, MMAX, CACH as our features

# Problem 2

In [9]:
train_df, test_df = train_test_split(machine_df, test_size=0.2, shuffle=True)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [10]:
# data
train_X = train_df.drop(columns=['PRP', 
                             'vendor name', 
                             'Model Name', 
                             'MYCT', 
                             'CHMIN', 
                             'CHMAX']).to_numpy()
train_y = train_df['PRP'].to_numpy()

In [11]:
mlr = MultipleLinearRegression(train_X, train_y, lr=1e-4, iterations=1000)

In [12]:
mlr.sgd()

Error: [64.08017039]: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:06<00:00, 162.88it/s]


# Problem 3

In [13]:
test_X = test_df.drop(columns=['PRP', 
                             'vendor name', 
                             'Model Name', 
                             'MYCT', 
                             'CHMIN', 
                             'CHMAX']).to_numpy()
test_y = test_df['PRP'].to_numpy()

In [14]:
preds = mlr.normalize_predict(test_X)
preds.shape

(42,)

In [15]:
def ttest(X,Y):
    for i in range(X.shape[1]):
        print(stats.ttest_ind(X[:, i], Y))

In [16]:
ttest(train_X, train_y)

Ttest_indResult(statistic=9.090526852064407, pvalue=9.164735772412934e-18)
Ttest_indResult(statistic=12.290909551211156, pvalue=7.15204732357762e-29)
Ttest_indResult(statistic=-6.092548194587914, pvalue=3.077211436671297e-09)


# Problem 4

In [17]:
def evaluate_r2(y, y_pred):
    y_mean = np.mean(y)

    sse = np.sum((y-y_pred)**2)
    ssr = np.sum((y_pred-y_mean)**2)

    sst = ssr + sse

    r2 = ssr/sst
    return r2

In [18]:
evaluate_r2(test_y, preds)

0.12785593327407194