In [3]:
from matplotlib import pyplot as plt
%matplotlib inline

In [35]:
from __future__ import print_function

import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

In [6]:
file = '20160301_TE_survey_cleaned.csv'

In [7]:
df = pd.read_csv(file)

In [8]:
df['Seebeck(uV/K)'].describe()

count    1098.000000
mean      -40.155270
std       192.691410
min      -752.196000
25%      -163.359500
50%       -67.600000
75%        99.358325
max      1235.430000
Name: Seebeck(uV/K), dtype: float64

### For the decision tree we need to bin the target values, so that we create a mock classification problem.

In [10]:
df['bin'] = pd.cut(df['Seebeck(uV/K)'], 50) #Breaking the Seebeck column into 50
                                            #bins, this number is arbitraty.

In [11]:
df = df.sort_values(['bin'], ascending = True) #sort the dataframe by ascending order
                                                #of the bins.

In [12]:
df['bin'].head()

1098                     NaN
1031    (-754.184, -712.443]
328     (-672.691, -632.938]
1032    (-632.938, -593.186]
124     (-632.938, -593.186]
Name: bin, dtype: category
Categories (50, interval[float64]): [(-754.184, -712.443] < (-712.443, -672.691] < (-672.691, -632.938] < (-632.938, -593.186] ... (1076.42, 1116.172] < (1116.172, 1155.925] < (1155.925, 1195.677] < (1195.677, 1235.43]]

In [19]:
#Pulling the columns we're interested in using for our predictors.
reduced_df = df[['Resist(Ohm.cm)', 'T(K)', 'Seebeck(uV/K)', 'average_atomic_volume', 'space_group', 'bin']]

In [20]:
reduced_df.head()

Unnamed: 0,Resist(Ohm.cm),T(K),Seebeck(uV/K),average_atomic_volume,space_group,bin
1098,,,,,,
1031,2.91889,300.0,-752.196,13.52,139.0,"(-754.184, -712.443]"
328,0.302535,300.0,-650.91,10.3485,62.0,"(-672.691, -632.938]"
1032,2.16164,400.0,-618.425,13.52,139.0,"(-632.938, -593.186]"
124,0.460937,400.0,-600.164,10.3485,62.0,"(-632.938, -593.186]"


### I need to assign each bin a number.  I created a new column with these bin numbers.

In [22]:
def encode_target(df, target_column):
    """Add column to data that assigns integers to the binned Seebeck data"""
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod['Target'] = df_mod[target_column].replace(map_to_int)
    
    return (df_mod, targets)

In [23]:
df2, targets = encode_target(reduced_df, 'bin')

In [24]:
df2.head()

Unnamed: 0,Resist(Ohm.cm),T(K),Seebeck(uV/K),average_atomic_volume,space_group,bin,Target
1098,,,,,,,0
1031,2.91889,300.0,-752.196,13.52,139.0,"(-754.184, -712.443]",1
328,0.302535,300.0,-650.91,10.3485,62.0,"(-672.691, -632.938]",2
1032,2.16164,400.0,-618.425,13.52,139.0,"(-632.938, -593.186]",3
124,0.460937,400.0,-600.164,10.3485,62.0,"(-632.938, -593.186]",3


In [27]:
#Clean up the NA values
df3 = df2.dropna()

In [43]:
#Break data into x and y
df3_x = pd.DataFrame(df3[['Resist(Ohm.cm)', 'T(K)', 'average_atomic_volume', 'space_group']])
df3_y = pd.DataFrame(df3['Target'])

In [59]:
#this just makes it easy to call all x parameters later on.
features = list(df3_x.columns[:4])

In [39]:
print (features)

['Resist(Ohm.cm)', 'T(K)', 'average_atomic_volume', 'space_group']


In [44]:
#Split into test/train
x_train, x_test = train_test_split(df3_x, test_size=0.2)
y_train, y_test = train_test_split(df3_y, test_size=0.2)

In [50]:
#Fit the decision tree,
y_fit = y_train['Target']
X_fit = x_train[features]
dt = DecisionTreeRegressor(min_samples_split = 20, random_state = 99)
dt_fit = dt.fit(X_fit, y_fit)

In [58]:
#Check the predictor.
y_pred = y_test['Target']
X_pred = x_test[features]
dt_pred = dt_fit.score(X_pred, y_pred)
print (dt_pred)

-0.33272193012680695


In [60]:
#I was following along with a demo and the author mentioned a way to visalize
#the tree.  I haven't been able to get this to work yet.
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")

In [48]:
visualize_tree(dt, features)