<a href="https://colab.research.google.com/github/MEnisSen/ElasticNet-50-Startups/blob/main/50_Startups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Muhammed Enis Şen**

# **Import Data from Kaggle**

In [None]:
# Install the Kaggle CLI. %pip (rather than `! pip`) installs into the
# environment of the running kernel.
%pip install -q kaggle

In [None]:
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression: the returned mapping holds the uploaded files' contents (here
# the Kaggle API token), and a bare expression would be echoed into the
# notebook's saved output.
uploaded = files.upload()

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle and restrict it to the owner.
# -p keeps the cell re-runnable: plain mkdir reports an error once the
# directory already exists.
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
# Listing datasets exercises the CLI with the token that was just installed.
! kaggle datasets list

mkdir: cannot create directory ‘/root/.kaggle’: File exists
ref                                                            title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
-------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
meirnizri/covid19-dataset                                      COVID-19 Dataset                                      5MB  2022-11-13 15:47:17           8400        252  1.0              
mattop/alcohol-consumption-per-capita-2016                     Alcohol Consumption Per Capita 2016                   4KB  2022-12-09 00:03:11            726         27  1.0              
thedevastator/jobs-dataset-from-glassdoor                      Salary Prediction                                     3MB  2022-11-16 13:52:31           5370        117  1.0              
swapt

In [None]:
# Download the dataset archive (startup-logistic-regression.zip) with the Kaggle CLI.
! kaggle datasets download -d karthickveerakumar/startup-logistic-regression

startup-logistic-regression.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# -o overwrites existing files without asking, so a re-run does not stop at
# the interactive "replace 50_Startups.csv?" prompt.
! unzip -o startup-logistic-regression.zip

Archive:  startup-logistic-regression.zip
replace 50_Startups.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.utils import shuffle

In [None]:
# Load the extracted CSV into a DataFrame.
df = pd.read_csv('50_Startups.csv')

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Remove the categorical State column so only numeric columns remain, then
# shuffle the rows; the fixed seed keeps the resulting row order reproducible.
numeric_df = df.drop(columns='State')
df = shuffle(numeric_df, random_state=1)

In [None]:
# Work on a plain array: the first three columns are the model inputs and the
# fourth column is the target, kept 2-D as a single column.
data = df.to_numpy()
features, labels = data[:, :3], data[:, 3, np.newaxis]

# Positional hold-out split: the first 30 rows train, the remaining rows test.
x_train, x_test = features[:30], features[30:]
y_train, y_test = labels[:30], labels[30:]

# **Elastic-Net Regression**

In [None]:
def eNet(A, b, x, l1, l2):
  """Elastic-net objective: 0.5*||Ax - b||^2 + l1*||x||_1 + (l2/2)*||x||^2.

  All three terms are computed element-wise (sum of squares / sum of absolute
  values). `np.linalg.norm(..., ord=1)` and `ord=2` on a 2-D array are matrix
  norms (max column sum, largest singular value); they only coincide with the
  intended penalties for a single column, so they are not used here.

  Args:
    A: design matrix, shape (n_samples, n_features).
    b: targets, broadcast-compatible with `A.dot(x)`.
    x: weights, 1-D or 2-D.
    l1: weight of the L1 penalty.
    l2: weight of the squared-L2 penalty.

  Returns:
    The scalar objective value.
  """
  weights = np.asarray(x)
  residual = np.dot(A, weights) - b

  data_term = 0.5 * np.sum(np.square(residual))
  l1_term = l1 * np.sum(np.abs(weights))
  l2_term = 0.5 * l2 * np.sum(np.square(weights))
  return data_term + l1_term + l2_term

def eNet_grad(A, b, x, l1, l2):
  """(Sub)gradient of 0.5*||Ax - b||^2 + l1*||x||_1 + (l2/2)*||x||^2 w.r.t. x.

  The squared-L2 penalty carries a factor 1/2 in the objective, so its
  gradient is `l2 * x` (not `2 * l2 * x`). `np.sign(x)` is used as the
  subgradient of the L1 term; it is 0 where a weight is exactly 0.

  Args:
    A: design matrix, shape (n_samples, n_features).
    b: targets with the same shape as `A.dot(x)`.
    x: current weights.
    l1: weight of the L1 penalty.
    l2: weight of the squared-L2 penalty.

  Returns:
    Array with the same shape as `x`.
  """
  residual = A.dot(x) - b
  return A.T.dot(residual) + l1 * np.sign(x) + l2 * x

def train_fnc(x_train, y_train, w, l1, l2, max_iter, lr=None):
  """Run `max_iter` steps of (sub)gradient descent using `eNet_grad`.

  Args:
    x_train: design matrix, shape (n_samples, n_features).
    y_train: targets, shape (n_samples, 1).
    w: initial weights, shape (n_features, 1). It is copied, so the caller's
      array is not modified.
    l1: weight of the L1 penalty.
    l2: weight of the squared-L2 penalty.
    max_iter: number of gradient steps.
    lr: step size. When None, the module-level `learning_rate` is used, which
      is what existing callers rely on.

  Returns:
    A new float array holding the weights after the last step.
  """
  step = learning_rate if lr is None else lr
  # Float copy: keeps the caller's array intact and lets an integer start
  # vector work (an in-place `-=` on an int array cannot hold float updates).
  weights = np.array(w, dtype=float)
  for _ in range(max_iter):
    weights = weights - step * eNet_grad(x_train, y_train, weights, l1, l2)
  return weights

def MSE(A, b, x):
  """Mean squared error between the predictions `A.dot(x)` and the targets `b`.

  Args:
    A: design matrix, shape (n_samples, n_features).
    b: targets; may be 1-D (n_samples,) or a column (n_samples, 1).
    x: weights.

  Returns:
    The mean of the squared prediction errors.
  """
  predictions = np.dot(A, x)
  targets = np.asarray(b)
  # A 1-D target against column-shaped predictions would broadcast to an
  # (n, n) difference matrix and silently give a wrong score. When both hold
  # the same number of values, line the targets up with the predictions.
  if targets.shape != predictions.shape and targets.size == predictions.size:
    targets = targets.reshape(predictions.shape)
  return np.mean((predictions - targets) ** 2)

In [None]:
# Grid of penalty weights (0.1 ... 1e-5) searched by the cross-validation below.
l1s = [10**(-i) for i in range(1,6)]
l2s = [10**(-i) for i in range(1,6)]
mse_results = []
max_iter = 10
kfold_size = 5

# Step size for both gradient-descent fits. The gradients are un-averaged sums
# over the raw (unscaled) features, and a fixed step of 0.1 made the recorded
# run diverge (reported test MSE around 1.7e236). The smooth part of the
# objective has a gradient Lipschitz constant of at most
# sigma_max(A)^2 + 2*l2, so its inverse is a stable step for every (l1, l2) in
# the grid. Cross-validation folds are row subsets of the first 30 rows, so the
# bound computed on those rows covers the folds as well.
# NOTE(review): with only max_iter = 10 steps on unscaled features the fit is
# unlikely to have converged - consider standardising the features and raising
# max_iter.
learning_rate = 1.0 / (np.linalg.norm(features[:30], ord=2) ** 2 + 2 * max(l2s))

In [None]:
# K-fold cross-validation over every (l1, l2) pair, using the first 30 rows.
# Features and target are joined into one array so each fold is a row slice.
train_data = np.concatenate((features[:30], labels[:30]), axis=1)
number_of_train_samples = train_data.shape[0]

# Start from an empty list so re-running this cell does not append a second
# set of scores (which would break the later reshape to len(l1s) x len(l2s)).
mse_results = []

for l1 in l1s:
  for l2 in l2s:
    mse_sub = []

    for i in range(kfold_size):
      # Fresh zero weights for every fold.
      w = np.zeros((features.shape[1],1))

      start_ind = int(i * number_of_train_samples / kfold_size)
      stop_ind = int((i+1) * number_of_train_samples / kfold_size)

      # Rows [start_ind, stop_ind) are held out; the rest are used for fitting.
      train_data_temp = np.concatenate((train_data[:start_ind], train_data[stop_ind:]), axis=0)
      val_data_temp = train_data[start_ind:stop_ind]

      # Fold-local names: the global x_train / y_train split is left untouched.
      x_fold_train = train_data_temp[:,:-1]
      y_fold_train = train_data_temp[:,-1].reshape(-1,1)

      x_val = val_data_temp[:,:-1]
      # Keep the validation targets as a column, like the predictions; a 1-D
      # y_val would broadcast against the (n, 1) predictions into an (n, n)
      # matrix inside MSE.
      y_val = val_data_temp[:,-1].reshape(-1,1)

      w = train_fnc(x_fold_train, y_fold_train, w, l1, l2, max_iter)

      mse_sub.append(MSE(x_val, y_val, w))
    # One averaged score per (l1, l2), appended with l2 varying fastest.
    mse_results.append(np.array(mse_sub).mean())

In [None]:
def mse_score_manage(mse_results, num_l1, num_l2):
  """Locate the lowest score in a flat list of grid-search results.

  Args:
    mse_results: flat sequence of num_l1 * num_l2 scores, ordered with the l2
      index varying fastest.
    num_l1: number of l1 candidates (rows of the grid).
    num_l2: number of l2 candidates (columns of the grid).

  Returns:
    {'l1': row index, 'l2': column index} of the smallest score. These are
    positions in the candidate lists, not the penalty values themselves. On
    ties the first position in row-major order wins.

  Raises:
    ValueError: if the scores cannot be reshaped to (num_l1, num_l2) or if
      every score is NaN.
  """
  grid = np.asarray(mse_results, dtype=float).reshape(num_l1, num_l2)
  # nanargmin skips NaN scores (e.g. from a diverged fit). Comparing against
  # grid.min() matches nothing once a NaN is present and fails with IndexError.
  best_l1, best_l2 = np.unravel_index(np.nanargmin(grid), grid.shape)
  return {'l1': int(best_l1), 'l2': int(best_l2)}

# Pick the grid position with the lowest cross-validated score and translate
# it back into the actual penalty values.
best_results = mse_score_manage(mse_results, len(l1s), len(l2s))
best_l1 = l1s[best_results['l1']]
best_l2 = l2s[best_results['l2']]

# Refit from zero weights on the first 30 rows, then score on the held-out rows.
w = np.zeros((x_train.shape[1],1))
best_w = train_fnc(features[:30], labels[:30], w, best_l1, best_l2, max_iter)
eNet_mse = MSE(x_test, y_test, best_w)

print("Best MSE score for Elastic-Net using the best l1 and l2 parameters :", eNet_mse,
      "\nBest l1 parameter was", best_l1, "and best l2 parameter was", best_l2)

Best MSE score for Elastic-Net using the best l1 and l2 parameters : 1.724305430061938e+236 
Best l1 parameter was 0.1 and best l2 parameter was 1e-05


# **Ordinary Multiple Linear Regression**

In [None]:
def ordinary_mult_lin_reg(A, b, x):
  """Least-squares objective 0.5 * ||Ax - b||^2 (sum of squared residuals).

  The sum is taken element-wise. `np.linalg.norm(..., ord=2)` on a 2-D
  residual is the largest singular value, which equals the intended value
  only when the residual has a single column.

  Args:
    A: design matrix, shape (n_samples, n_features).
    b: targets, broadcast-compatible with `A.dot(x)`.
    x: weights.

  Returns:
    The scalar objective value.
  """
  residual = np.dot(A, x) - b
  return 0.5 * np.sum(np.square(residual))

def ordinary_mult_lin_reg_grad(A, b, x):
  """Gradient of 0.5 * ||Ax - b||^2 with respect to x, i.e. A^T (Ax - b).

  Args:
    A: design matrix, shape (n_samples, n_features).
    b: targets with the same shape as `A.dot(x)`.
    x: current weights.

  Returns:
    Array with the same shape as `x`.
  """
  residual = np.dot(A, x) - b
  return np.dot(A.T, residual)

# Baseline without penalties: gradient descent on the plain least-squares
# objective, fitted on the first 30 rows starting from zero weights.
x_train, y_train = features[:30], labels[:30]
w_new = np.zeros((x_train.shape[1],1))

for _ in range(max_iter):
  update = learning_rate * ordinary_mult_lin_reg_grad(x_train, y_train, w_new)
  w_new -= update

In [None]:
# Score the ordinary-regression weights on the held-out rows (x_test / y_test).
ord_mse = MSE(x_test, y_test, w_new)
print("MSE for ordinary multi-linear regression :", ord_mse)

MSE for ordinary multi-linear regression : 1.724305430061938e+236
