In [1]:
# imports
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter
import copy

In [2]:
class Config:
    machine_data_path = Path('./machine.csv')
    titanic_data_path = Path('./Titanic.csv')

In [3]:
class HelperFunctions:
    @staticmethod
    def mean(x):
        n = x.shape[0]
        return x.sum() / n
    
    @staticmethod
    def std(x):
        n = x.shape[0]
        x_mean = HelperFunctions.mean(x)
        return np.sqrt(1/n*(((x-x_mean)**2).sum()))
    
    @staticmethod
    def covariance(x, y):
        n = x.shape[0]
        xy_mean = np.multiply(x, y).sum() / n
        x_mean = HelperFunctions.mean(x)
        y_mean = HelperFunctions.mean(y)
        return xy_mean - x_mean*y_mean
    
    @staticmethod
    def correlation(x, y):
        corr = 0
        corr += HelperFunctions.covariance(x, y) 
        corr /= HelperFunctions.std(x)
        corr /= HelperFunctions.std(y)
        return corr
    
    @staticmethod
    def correlation_matrix(x):
        n = x.shape[1]
        cm = [[0 for i in range(n)] for i in range(n)]
        for i in range(n):
            for j in range(n):
                cm[i][j] = HelperFunctions.correlation(x[:, i], x[:, j]) 
        return cm
    
    @staticmethod
    def conv_sum(x, y):
        return (x*y).sum(axis=1)

# Problem 1

In [4]:
class MultipleLinearRegression:
    def __init__(self, X, y, lr=1e-3, iterations=1000):
        self.X = X
        self.y = y
        self.feat_cnt = X.shape[1]
        self.theta = np.random.randn(1, self.feat_cnt+1)
        self.lr = lr
        self.iterations = iterations
        self.normalize()
        
    def normalize(self):
        min_x = np.min(self.X, axis=1).reshape(self.X.shape[0], 1)
        max_x = np.max(self.X, axis=1).reshape(self.X.shape[0], 1)
        self.X = (self.X - min_x) / (max_x - min_x)
        
    def predict(self, inp):
        inp = np.hstack((np.ones((inp.shape[0], 1)), inp))
        return HelperFunctions.conv_sum(self.theta, inp)
    
    def normalize_predict(self, inp):
        mn = np.min(inp, axis=1).reshape(inp.shape[0], 1)
        mx = np.max(inp, axis=1).reshape(inp.shape[0], 1)
        inp = (inp-mn) / (mx-mn)
        return self.predict(inp)
        
    def sgd(self):
        itr = tqdm(range(self.iterations))
        for _ in itr:
            for idx, feat in enumerate(self.X):
                feat = np.expand_dims(feat, axis=0)
                error = self.predict(feat) - self.y[idx]
                grad = np.matmul(error,
                                np.hstack((np.ones((1, 1)), feat)))
                self.theta = self.theta - grad*self.lr
            itr.set_description(f'Error: {error}')

In [5]:
machine_df = pd.read_csv(Config.machine_data_path)
machine_df.head()

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
0,adviser,32/60,125,256,6000,256,16,128,198
1,amdahl,470v/7,29,8000,32000,32,8,32,269
2,amdahl,470v/7a,29,8000,32000,32,8,32,220
3,amdahl,470v/7b,29,8000,32000,32,8,32,172
4,amdahl,470v/7c,29,8000,16000,32,8,16,132


In [6]:
train_df, test_df = train_test_split(machine_df, test_size=0.2, shuffle=True)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [7]:
train_X = train_df.drop(columns=['PRP', 
                             'vendor name', 
                             'Model Name', 
                             'MYCT', 
                             'CHMIN', 
                             'CHMAX']).to_numpy()
train_y = train_df['PRP'].to_numpy()
test_X = test_df.drop(columns=['PRP', 
                             'vendor name', 
                             'Model Name', 
                             'MYCT', 
                             'CHMIN', 
                             'CHMAX']).to_numpy()
test_y = test_df['PRP'].to_numpy()

In [8]:
mlr = MultipleLinearRegression(train_X, train_y, lr=1e-3, iterations=200)
mlr.sgd()

Error: [30.42059547]: 100%|█████████████████████████████████████████████| 200/200 [00:02<00:00, 93.71it/s]


In [9]:
preds = mlr.normalize_predict(test_X)

In [10]:
def ftest(pred, targ, p):
    targ_mean = targ.mean(axis=0)
    n = pred.shape[0]
    
    msm = (pred-targ_mean).sum() / (p-1)
    mse = ((targ-pred)**2).sum() / (n-p)

    return msm / mse

In [11]:
f_test = ftest(preds, test_y, mlr.feat_cnt+1)
print(f'f-test: {f_test}')

f-test: -0.0069339569321297196


# Problem 2

In [12]:
tennis_datadict = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

tennis_df = pd.DataFrame(tennis_datadict)
tennis_df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [13]:
def label_encoding(df):
    df_copy = copy.deepcopy(df)
    les = dict()
    for col in df_copy.columns:
        if df_copy.dtypes[col] == 'object':
            rng = df_copy[col].unique()
            le = dict()
            for idx, val in enumerate(rng):
                le[val] = idx
            df_copy[col] = [le[i] for i in df_copy[col]]
            les[col] = le
    return df_copy, les

In [14]:
class DecisionTree:
    def __init__(self, df, targ_column):
        self.df = df
        self.le = label_encoding(self.df)
        self.feat_cnt = len(df.columns) - 1
        self.targ_column = targ_column

    def eval_entropy(self, d):
        cntr = Counter(d)
        total = 0
        for i in cntr.values():
            total += i
        entropy = 0
        for i in cntr.values():
            p = i/total
            entropy += (-1*p*np.log2(p))
        return entropy
    
    class DecisionTreeNode:
        def __init__(self, curr_column, nxt, leaf_node, pred):
            self.curr_column = curr_column
            self.nxt = nxt
            self.leaf_node = leaf_node
            self.pred = pred
    
    def build(self):
        considered = dict()
        for i in self.df.columns:
            if i != self.targ_column:
                considered[i] = False
        root = self.dfs(self.df, considered)
        return root
        
    def dfs(self, df, considered):
        mx_gain = 0.0
        mx_col = None
        # entropy of the passed df
        s = self.eval_entropy(df[self.targ_column])
        cnt = len(df)
        for col in df.columns:
            if col!=self.targ_column and not considered[col]:
                entropy_col = 0
                for val in df[col].unique():
                    val_df = df[df[col]==val]
                    p =  len(val_df) / cnt
                    entropy_col += p*self.eval_entropy(val_df[self.targ_column])
                # entropy gain
                g = s - entropy_col
                if g >= mx_gain:
                    mx_gain = g
                    mx_col = col
        tn = None
        nxt = dict()
        if mx_col is None or mx_gain == 0.0:
            mx_occr_targ = df[self.targ_column].mode()[0]
            tn = self.DecisionTreeNode(None, nxt, True, mx_occr_targ)
        else:
            considered[mx_col] = True
            for i in df[mx_col].unique():
                nxt[i] = self.dfs(df[df[mx_col]==i], considered)
            tn = self.DecisionTreeNode(mx_col, nxt, False, None)
            considered[mx_col] = False
        return tn

In [15]:
dt = DecisionTree(tennis_df, 'PlayTennis')
root = dt.build()

In [16]:
def display_tree(root, depth=0):
    s = '\t'*depth
    if root.leaf_node:
        print(f'{s}Prediction: {root.pred}')
        print()
        return
    for i in root.nxt.items():
        print(f'{s}{root.curr_column}', end=' ')
        print(f'= {i[0]}')
        display_tree(i[1], depth+1)

In [17]:
display_tree(root)

Outlook = Sunny
	Humidity = High
		Prediction: No

	Humidity = Normal
		Prediction: Yes

Outlook = Overcast
	Prediction: Yes

Outlook = Rain
	Wind = Weak
		Prediction: Yes

	Wind = Strong
		Prediction: No



# Problem 3

In [18]:
titanic_df = pd.read_csv(Config.titanic_data_path)
titanic_df.head()

Unnamed: 0,Class,Age,Sex,Survived
0,First,Adult,Male,Yes
1,First,Adult,Male,Yes
2,First,Adult,Male,Yes
3,First,Adult,Male,Yes
4,First,Adult,Male,Yes


In [19]:
dt = DecisionTree(titanic_df, 'Survived')
root = dt.build()

In [20]:
display_tree(root)

Sex = Male
	Class = First
		Age = Adult
			Prediction: No

		Age = Child
			Prediction: Yes

	Class = Second
		Age = Adult
			Prediction: No

		Age = Child
			Prediction: Yes

	Class = Third
		Age = Adult
			Prediction: No

		Age = Child
			Prediction: No

	Class = Crew
		Prediction: No

Sex = Female
	Class = First
		Age = Adult
			Prediction: Yes

		Age = Child
			Prediction: Yes

	Class = Second
		Age = Adult
			Prediction: Yes

		Age = Child
			Prediction: Yes

	Class = Third
		Age = Adult
			Prediction: No

		Age = Child
			Prediction: No

	Class = Crew
		Prediction: Yes

