# Python Project Template
## 1. Prepare Problem
### a) Load libraries
### b) Load dataset
## 2. Summarize Data
### a) Descriptive statistics
### b) Data visualizations
## 3. Prepare Data
### a) Data Cleaning
### b) Feature Selection
### c) Data Transforms
## 4. Evaluate Algorithms
### a) Split-out validation dataset
### b) Test options and evaluation metric
### c) Spot Check Algorithms
### d) Compare Algorithms
## 5. Improve Accuracy
### a) Algorithm Tuning
### b) Ensembles
## 6. Finalize Model
### a) Predictions on validation dataset
### b) Create standalone model on entire training dataset
### c) Save model for later use

# Understand the problem

* https://archive.ics.uci.edu/dataset/29/computer+hardware

# Process zip file

In [17]:
%%bash 
pwd
cd ./Data
mkdir ComputerHardware
cd ComputerHardware
wget https://archive.ics.uci.edu/static/public/29/computer+hardware.zip && echo "Download complete"

/makeenv2_repo/repo_v1/Day02


mkdir: cannot create directory ‘ComputerHardware’: File exists


--2025-05-01 14:30:55--  https://archive.ics.uci.edu/static/public/29/computer+hardware.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252, 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘computer+hardware.zip’

     0K ....                                                   44.6M=0s

2025-05-01 14:30:57 (44.6 MB/s) - ‘computer+hardware.zip’ saved [4482]



Download complete


In [18]:
import zipfile

In [25]:
zipfile.ZipFile("./Data/ComputerHardware/computer+hardware.zip", "r").extractall("./Data/ComputerHardware/")

In [27]:
! ls -l ./Data/ComputerHardware/

total 28
-rw-r--r-- 1 root root  122 May  1 14:31 Index
-rw-r--r-- 1 root root 4482 May  1 14:30 computer+hardware.zip
-rw-r--r-- 1 root root 8726 May  1 14:31 machine.data
-rw-r--r-- 1 root root 2903 May  1 14:31 machine.names


# Load libraries

In [28]:
# 
import numpy as np
import pandas as pd

# Visulization 
from matplotlib import pyplot

# data modeling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# pipeline setup
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# metrics 
from sklearn.metrics import mean_squared_error

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# Load The Data

In [29]:
!pwd

/makeenv2_repo/repo_v1/Day02


In [30]:
# Load dataset
filename = "./Data/ComputerHardware/machine.data"
names = "./Data/ComputerHardware/machine.names"
names = ["vendor name", "Model Name", "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP", "ERP"]
dataset = pd.read_csv(filename, names=names)

In [31]:
dataset.head(2)

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253


# Explore the data (EDA)

In [32]:
dataset.head()

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


In [33]:
dataset.shape

(209, 10)

In [34]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   vendor name  209 non-null    object
 1   Model Name   209 non-null    object
 2   MYCT         209 non-null    int64 
 3   MMIN         209 non-null    int64 
 4   MMAX         209 non-null    int64 
 5   CACH         209 non-null    int64 
 6   CHMIN        209 non-null    int64 
 7   CHMAX        209 non-null    int64 
 8   PRP          209 non-null    int64 
 9   ERP          209 non-null    int64 
dtypes: int64(8), object(2)
memory usage: 16.5+ KB


In [35]:
dataset.describe()

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
count,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0
mean,203.822967,2867.980861,11796.15311,25.205742,4.698565,18.267943,105.62201,99.330144
std,260.262926,3878.742758,11726.564377,40.628722,6.816274,25.997318,160.830733,154.757102
min,17.0,64.0,64.0,0.0,0.0,0.0,6.0,15.0
25%,50.0,768.0,4000.0,0.0,1.0,5.0,27.0,28.0
50%,110.0,2000.0,8000.0,8.0,2.0,8.0,50.0,45.0
75%,225.0,4000.0,16000.0,32.0,6.0,24.0,113.0,101.0
max,1500.0,32000.0,64000.0,256.0,52.0,176.0,1150.0,1238.0


In [36]:
dataset.dtypes

vendor name    object
Model Name     object
MYCT            int64
MMIN            int64
MMAX            int64
CACH            int64
CHMIN           int64
CHMAX           int64
PRP             int64
ERP             int64
dtype: object

In [37]:
# define dependant and independant columns 
input_col = ["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX"]
target_col = ["PRP"]

### <font color='green'> Exercise: do more EDA </font>

# Experiment (1):

* Try to drop obj columns
* Without pipeline
* Assume linear problem

In [38]:
dataset_1 = dataset.copy()

In [39]:
dataset_1 = dataset_1.drop(["vendor name", "Model Name"], axis=1)

In [40]:
dataset_1 = dataset_1.astype(float)
dataset_1.dtypes

MYCT     float64
MMIN     float64
MMAX     float64
CACH     float64
CHMIN    float64
CHMAX    float64
PRP      float64
ERP      float64
dtype: object

In [41]:
dataset_1.head()

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,125.0,256.0,6000.0,256.0,16.0,128.0,198.0,199.0
1,29.0,8000.0,32000.0,32.0,8.0,32.0,269.0,253.0
2,29.0,8000.0,32000.0,32.0,8.0,32.0,220.0,253.0
3,29.0,8000.0,32000.0,32.0,8.0,32.0,172.0,253.0
4,29.0,8000.0,16000.0,32.0,8.0,16.0,132.0,132.0


In [42]:
# split train/test data
dataset_1 = dataset_1.sample(frac=1).reset_index(drop=True)
X_train = dataset_1.iloc[:178][input_col]
y_train = dataset_1.iloc[:178][target_col]
X_test = dataset_1.iloc[178:][input_col]
y_test = dataset_1.iloc[178:][target_col]

In [43]:
assert (X_train.shape[0] + X_test.shape[0]) == dataset_1.shape[0]

In [44]:
ct = ColumnTransformer([
    ("preprocess_minmax", MinMaxScaler(), ["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX"]),
], 
remainder="passthrough")

In [45]:
ct.fit(X_train)

In [46]:
X_train_trans = ct.transform(X_train)
X_test_trans = ct.transform(X_test)

In [47]:
# post transformation
pd.DataFrame(X_train_trans, columns=X_train.columns)

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX
0,0.082940,0.060621,0.499499,0.125000,0.019231,0.306818
1,0.022252,0.029309,0.124124,0.031250,0.057692,0.028409
2,0.190829,0.004008,0.011011,0.023438,0.115385,0.136364
3,0.006069,0.498998,0.499499,0.250000,0.153846,0.136364
4,0.066082,0.014028,0.092843,0.062500,0.019231,0.034091
...,...,...,...,...,...,...
173,0.006069,0.248497,0.499499,0.250000,0.230769,0.090909
174,0.113284,0.060621,0.249249,0.062500,0.019231,0.034091
175,0.022252,0.060621,0.124124,0.031250,0.057692,0.034091
176,0.527984,0.006012,0.124124,0.000000,0.019231,0.022727


### use Linear Regression (OLS closed form solution)

In [48]:
LR_model = LinearRegression()

In [49]:
LR_model.fit(X_train, y_train)

In [50]:
LR_model.coef_, LR_model.intercept_

(array([[ 0.04976509,  0.01485933,  0.00606101,  0.60494655, -0.34230951,
          1.50020832]]),
 array([-58.46110115]))

In [51]:
LR_model.score(X_train, y_train)

0.8654651566521094

In [52]:
LR_model.score(X_test, y_test)

0.8428785764945429

In [53]:
mean_squared_error(LR_model.predict(X_train), y_train)

3802.927974260984

In [54]:
y_hat = LR_model.predict(X_test)

In [55]:
mean_squared_error(y_hat, y_test)

1735.9857224578639

### Use Gradient based learning

In [56]:
SGD_model = SGDRegressor(max_iter=1000000)

In [57]:
SGD_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [58]:
score = SGD_model.score(X_train, y_train)
score

-5.324470294221993e+28

In [59]:
f"{score:0.1f}"

'-53244702942219932270457782272.0'

In [60]:
mse = mean_squared_error(SGD_model.predict(X_train), y_train)

In [61]:
f"{mse:0.1f}"

'1505080507482964501088302090158080.0'

# Experiment 2

## Model selection: train/test split

In [62]:
train_set, test_set = train_test_split(dataset, test_size=0.2)

In [63]:
train_set.shape, test_set.shape

((167, 10), (42, 10))

## build pipeline

### Create custome transformer for dropping columns

In [64]:
class ColumnsDroper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop: list):
        self.columns_to_drop = columns_to_drop
    
    def drop_columns(self, X):
        X = X.drop(self.columns_to_drop, axis=1)
        return(X)
        

    def fit(self, X:pd.DataFrame, y = None):
        return self

    def transform(self, X, y = None):
        X = self.drop_columns(X)
        return(X)

In [65]:
# test custome transformer for dorpping columns
cust = ColumnsDroper(["vendor name", "Model Name"])
cust.fit_transform(dataset)

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,125,256,6000,256,16,128,198,199
1,29,8000,32000,32,8,32,269,253
2,29,8000,32000,32,8,32,220,253
3,29,8000,32000,32,8,32,172,253
4,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...
204,124,1000,8000,0,1,8,42,37
205,98,1000,8000,32,2,8,46,50
206,125,2000,8000,0,2,14,52,41
207,480,512,8000,32,0,0,67,47


In [66]:
pipe = Pipeline([("cust", ColumnsDroper(["vendor name", "Model Name"]))])
pipe.fit_transform(dataset)

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,125,256,6000,256,16,128,198,199
1,29,8000,32000,32,8,32,269,253
2,29,8000,32000,32,8,32,220,253
3,29,8000,32000,32,8,32,172,253
4,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...
204,124,1000,8000,0,1,8,42,37
205,98,1000,8000,32,2,8,46,50
206,125,2000,8000,0,2,14,52,41
207,480,512,8000,32,0,0,67,47


### Build pipeline

In [67]:
preprocess = Pipeline([
    ("DropColumns", ColumnsDroper(["vendor name", "Model Name"])),
    ("Standarzation", StandardScaler()),
    ("Add-Nonelinearity", PolynomialFeatures(degree=3))
])

## ML models 

In [68]:
models_pipeline = Pipeline([
    ("model_LR" ,LinearRegression())
])

## Execute the pipeline 

In [69]:
X_train = train_set[[c for c in dataset.columns if c not in ["PRP", "ERP"]]]
y_train = train_set[target_col]

X_test = test_set[[c for c in dataset.columns if c not in ["PRP", "ERP"]]]
y_test = test_set[target_col]

In [70]:
full_pipeline = Pipeline([
    ("Preprocess", preprocess),
    ("model", models_pipeline)
])

In [71]:
full_pipeline.fit(X_train, y_train)

## Model evaluation

In [72]:
y_hat_train = full_pipeline.predict(X_train)
y_hat_test = full_pipeline.predict(X_test)

In [73]:
full_pipeline.score(X_train, y_train)

0.989192896808148

In [74]:
# train set MSE
mean_squared_error(y_hat_train, y_train)

272.9201055304982

In [75]:
# test set MSE
mean_squared_error(y_hat_test, y_test)

138255.70181170222

# Experiment 3

In [76]:
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))

## Model selection: bootstrapping

In [77]:
# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

In [78]:
results = []
names = []
for name, model in models:
    model_pipeline = Pipeline([
        ("Preprocess", preprocess),
        ("model", model)
    ])
    
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: -7651678.289527 (22416773.347439)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


LASSO: -48026.822621 (82020.095078)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


EN: -26054.689731 (40085.011704)
KNN: -9858.596809 (15243.625615)
CART: -8780.578676 (12835.208837)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVR: -28364.387542 (26489.709689)


  y = column_or_1d(y, warn=True)
