# Video 1: First Decision Tree Model

In [5]:
import pandas as pd

from ucimlrepo import fetch_ucirepo

# fetch dataset
superconductivty_data = fetch_ucirepo(id=464)

# data (as pandas dataframes)
X = superconductivty_data.data.features
y = superconductivty_data.data.targets

df = X.join(y)

df.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


In [None]:
# make df discrete:

df_discrete = df

for i in df.columns:
    o = df[i].values
    new_column = (o > df[i].median()).astype(int)
    df_discrete[i] = new_column

X_discrete = df_discrete.iloc[:, :-1]
y_discrete = df_discrete['critical_temp']

df_discrete.head()

In [181]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

model = DecisionTreeRegressor()

model.fit(X, y)

In [13]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_discrete, y_discrete)

model.score(X_discrete, y_discrete)

0.9552273903024032

# Video 3: Train Test Split

In [19]:
from sklearn.model_selection import train_test_split

X_discrete_train, X_discrete_test, y_discrete_train, y_discrete_test = train_test_split(X_discrete, y_discrete)

model = DecisionTreeClassifier()

model.fit(X_discrete_train, y_discrete_train)

model.score(X_discrete_test, y_discrete_test)

0.9302106847253574

# Video 4: Adjusting Parameters

In [20]:
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [21]:
from sklearn import tree
model.tree_.max_depth

24

In [None]:
model = DecisionTreeClassifier(max_depth=16)
# tried a few different max depths, 16 seems to give the best scores

model.fit(X_discrete_train, y_discrete_train)

model.score(X_discrete_test, y_discrete_test)

0.9326561324303988

In [None]:
model = DecisionTreeClassifier(max_depth=16, min_samples_leaf=5)
# score seems to go down with min samples leaf, but consistency is maybe increasing?
# variation in score seems to be lower between runs

model.fit(X_discrete_train, y_discrete_train)

model.score(X_discrete_test, y_discrete_test)

0.9243792325056434

In [None]:
from sklearn.model_selection import train_test_split

X_discrete_train, X_discrete_test, y_discrete_train, y_discrete_test = train_test_split(X_discrete, y_discrete, test_size=0.1)

model = DecisionTreeClassifier(max_depth=16)

model.fit(X_discrete_train, y_discrete_train)

model.score(X_discrete_test, y_discrete_test)

0.9445228020686413

# Video 5: Model Predictions

In [220]:
sample = df.sample(1)
sample

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
16308,0,0,1,1,1,0,0,0,0,0,...,1,1,1,0,0,0,1,0,0,0


In [222]:
sampleX = sample.iloc[:,:-1]
model.predict(sampleX)

array([0])

In [223]:
sampleX

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,mean_Valence,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence
16308,0,0,1,1,1,0,0,0,0,0,...,1,1,1,1,0,0,0,1,0,0


In [224]:
sampleX['wtd_std_ThermalConductivity']

16308    0
Name: wtd_std_ThermalConductivity, dtype: int64

In [225]:
sampleX['wtd_std_ThermalConductivity'] = 1

In [226]:
model.predict(sampleX)

array([0])

In [228]:
sampleX['entropy_atomic_mass'] = 1

model.predict(sampleX)

array([0])