# Video 1: Decision Trees

In [56]:
import pandas as pd

from ucimlrepo import fetch_ucirepo

# fetch dataset
superconductivty_data = fetch_ucirepo(id=464)

# data (as pandas dataframes)
X = superconductivty_data.data.features
y = superconductivty_data.data.targets

df = X.join(y)

df.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


In [57]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [58]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=16)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8780018621733764

In [59]:
from sklearn import tree
model.tree_.max_depth

16

# Video 2: Different models

In [60]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7358731290835475

In [61]:
def regressor(model):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [62]:
regressor(LinearRegression())

0.7358731290835475

In [63]:
from sklearn.linear_model import Ridge

regressor(Ridge(alpha=0.1))
#seems to always be at least slightly worse

0.7358297673050473

In [64]:
from sklearn.neighbors import KNeighborsRegressor

regressor(KNeighborsRegressor(n_neighbors=3))

0.8718600204439305

In [65]:
from sklearn.ensemble import AdaBoostRegressor

regressor(AdaBoostRegressor())

  y = column_or_1d(y, warn=True)


0.7094385738456129

In [66]:
from sklearn.neural_network import MLPRegressor

regressor(MLPRegressor())

  y = column_or_1d(y, warn=True)


0.6580607133764059

# Video 4: K-Nearest Neighbors

In [67]:
df.shape

(21263, 82)

In [68]:
from sklearn.neighbors import KNeighborsRegressor

for i in range(1, 10):
    print(i)
    print(regressor(KNeighborsRegressor(n_neighbors=i)))

1
0.8522630183087718
2
0.8710002099681426
3
0.8718600204439305
4
0.8675920178163921
5
0.8674644231739941
6
0.8634217110833342
7
0.8595389996540952
8
0.8560720527068761
9
0.8516235533676488


# Video 5: Normalize Data

In [69]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_normalized.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,mean_Valence,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence
0,0.375,0.405879,0.25395,0.29972,0.164988,0.595724,0.542536,0.590973,0.154652,0.514443,...,0.208333,0.209524,0.202227,0.203297,0.639097,0.546853,0.166667,0.155275,0.144338,0.145686
1,0.5,0.424611,0.257187,0.332968,0.166341,0.730573,0.540166,0.590973,0.175894,0.466193,...,0.166667,0.209524,0.148029,0.20178,0.726956,0.537109,0.333333,0.161404,0.210819,0.156202
2,0.375,0.405879,0.254061,0.29972,0.165017,0.595724,0.498406,0.590973,0.173847,0.514443,...,0.208333,0.211905,0.202227,0.205446,0.639097,0.527853,0.166667,0.159361,0.144338,0.148232
3,0.375,0.405879,0.254005,0.29972,0.165002,0.595724,0.522056,0.590973,0.164249,0.514443,...,0.208333,0.210714,0.202227,0.20437,0.639097,0.537936,0.166667,0.157318,0.144338,0.146984
4,0.375,0.405879,0.253838,0.29972,0.16496,0.595724,0.576663,0.590973,0.135458,0.514443,...,0.208333,0.207143,0.202227,0.20116,0.639097,0.562153,0.166667,0.151189,0.144338,0.142936


In [70]:
X_normalized_train, X_normalized_test, y_train, y_test = train_test_split(X_normalized, y, random_state=0)

In [71]:
def regressor_normalized(model):
    model.fit(X_normalized_train, y_train)
    return model.score(X_normalized_test, y_test)

In [72]:
from sklearn.neighbors import KNeighborsRegressor

for i in range(1, 10):
    print(i)
    print(regressor_normalized(KNeighborsRegressor(n_neighbors=i)))

1
0.8789715644143082
2
0.8972208746453952
3
0.901006427440756
4
0.898508889344558
5
0.8966707963314108
6
0.892628405870653
7
0.8906867201843782
8
0.8884216958580968
9
0.8836484782727907


cannot do video 6 because heart disease data is no longer available