In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import tree
from dtreeviz.trees import dtreeviz
from sklearn.model_selection import train_test_split, cross_validate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('./datasets/machine.data', header=None)

In [3]:
df.columns = ["vendor_name", "model_name", "cycle_time", "MMin", "MMax", "cache", "ChMin", "ChMax", "pub_performance", "est_performance"]
df

Unnamed: 0,vendor_name,model_name,cycle_time,MMin,MMax,cache,ChMin,ChMax,pub_performance,est_performance
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,sperry,80/8,124,1000,8000,0,1,8,42,37
205,sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
206,sratus,32,125,2000,8000,0,2,14,52,41
207,wang,vs-100,480,512,8000,32,0,0,67,47


In [4]:
with open("./datasets/machine.names") as f:
    print(f.read())

1. Title: Relative CPU Performance Data 

2. Source Information
   -- Creators: Phillip Ein-Dor and Jacob Feldmesser
     -- Ein-Dor: Faculty of Management; Tel Aviv University; Ramat-Aviv; 
        Tel Aviv, 69978; Israel
   -- Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779   
   -- Date: October, 1987
 
3. Past Usage:
    1. Ein-Dor and Feldmesser (CACM 4/87, pp 308-317)
       -- Results: 
          -- linear regression prediction of relative cpu performance
          -- Recorded 34% average deviation from actual values 
    2. Kibler,D. & Aha,D. (1988).  Instance-Based Prediction of
       Real-Valued Attributes.  In Proceedings of the CSCSI (Canadian
       AI) Conference.
       -- Results:
          -- instance-based prediction of relative cpu performance
          -- similar results; no transformations required
    - Predicted attribute: cpu relative performance (numeric)

4. Relevant Information:
   -- The estimated relative performance values were estimated by the autho

In [5]:
X = df.drop(["vendor_name", "model_name", "est_performance", "pub_performance"], axis=1)
Y = df["pub_performance"]

In [6]:
x = X.values
y = Y.values

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=42)
print(X_train[:10])
print(Y_train[:10])

[[   40  8000 16000    32     8    16]
 [   26 16000 32000    64     8    24]
 [   30  8000 64000   128    12   176]
 [  105  1000  4000     0     3    24]
 [   30 16000 32000   256    16    24]
 [  140  2000  4000     0     4     8]
 [  203  1000  2000     0     1     5]
 [   50  2000  8000     8     1     5]
 [  320   128  6000     0     1    12]
 [  240   512  2000     8     1     5]]
[ 214  465 1150   32  510   40   24   71   23   11]


In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_train

array([[-0.62496685,  1.55135764,  0.41961313,  0.17612962,  0.65310303,
        -0.05573806],
       [-0.67735929,  3.93007968,  1.8983401 ,  0.94826802,  0.65310303,
         0.28584934],
       [-0.66239002,  1.55135764,  4.85579404,  2.4925448 ,  1.34891296,
         6.77601003],
       ...,
       [-0.25073521, -0.23268388, -0.6894321 , -0.40297417, -0.56456434,
         0.11505564],
       [-0.17588888, -0.53002414, -0.31975036, -0.20993957, -0.56456434,
        -0.14113492],
       [ 3.34188863, -0.67512618, -0.92048319, -0.59600877, -0.56456434,
        -0.69621445]])

In [9]:
clf = RandomForestClassifier(n_estimators=1000, random_state=77)
clf.fit(X_train, Y_train)

RandomForestClassifier(n_estimators=1000, random_state=77)

In [10]:
clf.predict(X_test)

array([ 368,   24,   16, 1150,   20,  173,   72,   24,  636,  144,   54,
        113,   42,   24,   50,   50,   32,   32,  138,   42,   50,   50,
        138,   20,   11,   24,   16,   29,   32,   92,  133,   22,   50,
         32,   38,   32,   71,  173,   50,   12,  212,   42])

In [11]:
print(clf.score(X_train, Y_train))
print(clf.score(X_test, Y_test))

0.9101796407185628
0.07142857142857142
