# Video 1: Decision Trees

In [5]:
import pandas as pd

from ucimlrepo import fetch_ucirepo

# fetch dataset
superconductivty_data = fetch_ucirepo(id=464)

# data (as pandas dataframes)
X = superconductivty_data.data.features
y = superconductivty_data.data.targets

df = X.join(y)

df.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [7]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=16)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8781264106508513

In [8]:
from sklearn import tree
model.tree_.max_depth

16

# Video 2: Different models

In [9]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7358731290835475

In [12]:
from sklearn.model_selection import cross_val_score

def regressor(model):
    scores = cross_val_score(
        model,
        X = X,
        y = y,
        cv = 5,
        #scoring = 'accuracy'
    )
    return scores

In [13]:
regressor(LinearRegression())

array([ 0.42607132,  0.46183832,  0.67252785, -0.52033573,  0.49608833])

In [14]:
from sklearn.linear_model import Ridge

regressor(Ridge(alpha=0.1))
#seems to always be at least slightly worse

array([ 0.42592947,  0.46156587,  0.67302201, -0.51593852,  0.49599502])

In [17]:
from sklearn.neighbors import KNeighborsRegressor

regressor(KNeighborsRegressor(n_neighbors=5))

array([0.54919364, 0.43611803, 0.7615264 , 0.51428525, 0.20291475])

In [18]:
from sklearn.ensemble import AdaBoostRegressor

regressor(AdaBoostRegressor())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([ 0.34859557,  0.44471352,  0.67704355, -1.13352023,  0.42245036])

In [19]:
from sklearn.neural_network import MLPRegressor

regressor(MLPRegressor())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([ 0.44119681,  0.03813839, -1.48806336, -5.43818926, -0.5091175 ])

# Video 4: K-Nearest Neighbors

In [67]:
df.shape

(21263, 82)

In [20]:
from sklearn.neighbors import KNeighborsRegressor

for i in range(1, 10):
    print(i)
    print(regressor(KNeighborsRegressor(n_neighbors=i)))

1
[0.44211141 0.33468979 0.70177628 0.45252865 0.22765713]
2
[0.50756569 0.40460698 0.75195253 0.50620581 0.25181932]
3
[0.5350799  0.41456241 0.76782863 0.53077941 0.2390144 ]
4
[0.54445477 0.42566801 0.76192052 0.52091782 0.22602579]
5
[0.54919364 0.43611803 0.7615264  0.51428525 0.20291475]
6
[0.55386218 0.4443672  0.7588462  0.49371632 0.21307639]
7
[0.55222127 0.44584236 0.75391346 0.48124111 0.19860788]
8
[0.54970999 0.44942864 0.74895679 0.46820716 0.20517849]
9
[0.54996714 0.45105612 0.74530651 0.46273567 0.23478998]


# Video 5: Normalize Data

In [21]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,mean_Valence,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence
0,0.375,0.405879,0.25395,0.29972,0.164988,0.595724,0.542536,0.590973,0.154652,0.514443,...,0.208333,0.209524,0.202227,0.203297,0.639097,0.546853,0.166667,0.155275,0.144338,0.145686
1,0.5,0.424611,0.257187,0.332968,0.166341,0.730573,0.540166,0.590973,0.175894,0.466193,...,0.166667,0.209524,0.148029,0.20178,0.726956,0.537109,0.333333,0.161404,0.210819,0.156202
2,0.375,0.405879,0.254061,0.29972,0.165017,0.595724,0.498406,0.590973,0.173847,0.514443,...,0.208333,0.211905,0.202227,0.205446,0.639097,0.527853,0.166667,0.159361,0.144338,0.148232
3,0.375,0.405879,0.254005,0.29972,0.165002,0.595724,0.522056,0.590973,0.164249,0.514443,...,0.208333,0.210714,0.202227,0.20437,0.639097,0.537936,0.166667,0.157318,0.144338,0.146984
4,0.375,0.405879,0.253838,0.29972,0.16496,0.595724,0.576663,0.590973,0.135458,0.514443,...,0.208333,0.207143,0.202227,0.20116,0.639097,0.562153,0.166667,0.151189,0.144338,0.142936


In [70]:
X_normalized_train, X_normalized_test, y_train, y_test = train_test_split(X_normalized, y, random_state=0)

In [24]:
def regressor_normalized(model):
    scores = cross_val_score(
        model,
        X = X_normalized,
        y = y,
        cv = 5,
        #scoring = 'accuracy'
    )
    return scores

In [None]:
from sklearn.neighbors import KNeighborsRegressor

for i in range(1, 10):
    print(i)
    print(regressor_normalized(KNeighborsRegressor(n_neighbors=i)))

1
[0.48037994 0.48807606 0.78512508 0.58347153 0.52494235]
2
[0.57855427 0.53606598 0.80615067 0.72776658 0.56351835]
3
[0.60802831 0.54638592 0.81953325 0.74170111 0.57851012]
4
[0.60896089 0.54919983 0.82253708 0.75374855 0.58654457]
5
[0.61446786 0.55585615 0.82473489 0.74527929 0.58161496]
6
[0.61544311 0.560469   0.82822111 0.74365796 0.58942603]
7
[0.62078059 0.56527137 0.82931398 0.74509768 0.59209838]
8
[0.62050665 0.56539275 0.82770488 0.74623238 0.59021465]
9
[0.62047589 0.5688474  0.82972568 0.74515813 0.5884083 ]


cannot do video 6 because heart disease data is no longer available