In [1]:
from matplotlib import pyplot as plt
%matplotlib inline

In [168]:
from __future__ import print_function

import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
file = '20160301_TE_survey_cleaned.csv'

In [4]:
df = pd.read_csv(file)

In [36]:
df.columns.values

array(['ID', 'T(K)', 'Z*10^-4 reported', 'Resist(Ohm.cm)',
       'Seebeck(uV/K)', 'kappaZT', 'Unnamed: 6', 'Resist(400K)',
       'Seebeck(400K)', 'Pf(W/K^2/m)', 'ZT', 'kappa(W/mK)', 'x',
       'Formula', 'series', 'T_Max', 'family', 'Unnamed: 17',
       'Unnamed: 18', 'Conduct(S/cm)', 'Power_Factor*T_(W/mK)',
       'preparative_route', 'final_form', 'cell_volume(A^3)',
       'formula_units_per_cell', 'atoms_per_formula_unit',
       'total_atoms_per_unit_cell', 'average_atomic_volume',
       'ICSD_of_structure', 'temp of ICSD (K)', 'S^2', 'ke/ktotal',
       'space_group', 'number_symmetry_elements', 'Unnamed: 34',
       'Unnamed: 35', 'bin'], dtype=object)

In [5]:
df['Seebeck(uV/K)'].describe()

count    1098.000000
mean      -40.155270
std       192.691410
min      -752.196000
25%      -163.359500
50%       -67.600000
75%        99.358325
max      1235.430000
Name: Seebeck(uV/K), dtype: float64

### For the decision tree we need to bin the target values, so that we create a mock classification problem.

In [6]:
df['bin'] = pd.cut(df['Seebeck(uV/K)'], 50) #Breaking the Seebeck column into 50
                                            #bins, this number is arbitraty.

In [7]:
df = df.sort_values(['bin'], ascending = True) #sort the dataframe by ascending order
                                                #of the bins.

In [8]:
df['bin'].head()

1098                     NaN
1031    (-754.184, -712.443]
328     (-672.691, -632.938]
1032    (-632.938, -593.186]
124     (-632.938, -593.186]
Name: bin, dtype: category
Categories (50, interval[float64]): [(-754.184, -712.443] < (-712.443, -672.691] < (-672.691, -632.938] < (-632.938, -593.186] ... (1076.42, 1116.172] < (1116.172, 1155.925] < (1155.925, 1195.677] < (1195.677, 1235.43]]

In [9]:
#Pulling the columns we're interested in using for our predictors.
reduced_df = df[['Resist(Ohm.cm)', 'T(K)', 'Seebeck(uV/K)', 'average_atomic_volume', 'space_group', 'bin']]

In [10]:
reduced_df.head()

Unnamed: 0,Resist(Ohm.cm),T(K),Seebeck(uV/K),average_atomic_volume,space_group,bin
1098,,,,,,
1031,2.91889,300.0,-752.196,13.52,139.0,"(-754.184, -712.443]"
328,0.302535,300.0,-650.91,10.3485,62.0,"(-672.691, -632.938]"
1032,2.16164,400.0,-618.425,13.52,139.0,"(-632.938, -593.186]"
124,0.460937,400.0,-600.164,10.3485,62.0,"(-632.938, -593.186]"


### I need to assign each bin a number.  I created a new column with these bin numbers.

In [11]:
def encode_target(df, target_column):
    """Add column to data that assigns integers to the binned Seebeck data"""
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod['Target'] = df_mod[target_column].replace(map_to_int)
    
    return (df_mod, targets)

In [12]:
df2, targets = encode_target(reduced_df, 'bin')

In [13]:
df2.head()

Unnamed: 0,Resist(Ohm.cm),T(K),Seebeck(uV/K),average_atomic_volume,space_group,bin,Target
1098,,,,,,,0
1031,2.91889,300.0,-752.196,13.52,139.0,"(-754.184, -712.443]",1
328,0.302535,300.0,-650.91,10.3485,62.0,"(-672.691, -632.938]",2
1032,2.16164,400.0,-618.425,13.52,139.0,"(-632.938, -593.186]",3
124,0.460937,400.0,-600.164,10.3485,62.0,"(-632.938, -593.186]",3


In [14]:
#Clean up the NA values
df3 = df2.dropna()

In [124]:
#Break data into x and y
df3_x = pd.DataFrame(df3[['Resist(Ohm.cm)', 'T(K)', 'average_atomic_volume', 'space_group']])
df3_y = pd.DataFrame(df3['Target'])
df3_reg_y = pd.DataFrame(df3['Seebeck(uV/K)'])

In [16]:
#this just makes it easy to call all x parameters later on.
features = list(df3_x.columns[:4])

In [83]:
print (features)

['Resist(Ohm.cm)', 'T(K)', 'average_atomic_volume', 'space_group']


In [84]:
#Split into test/train
x_train, x_test = train_test_split(df3_x, test_size=0.2)
y_train, y_test = train_test_split(df3_y, test_size=0.2)

In [121]:
#Fit the decision tree,
#1st iteration dt = DecisionTreeRegressor(min_samples_split = 20, random_state = 99)
#Predictor gets better when max depth is decreased, but you loose too much info after a certain point.
y_fit_cla = y_train['Target']
X_fit_cla = x_train[features]
dt_cla = DecisionTreeClassifier(max_depth = 10, random_state = 99)
dt_fit_cla = dt_cla.fit(X_fit_cla, y_fit_cla)

In [122]:
dt_cla_pred = dt_cla.predict(x_test[features])
score = accuracy_score(y_test, dt_pred)
print(score)

0.07009345794392523


In [125]:
#Split into test/train
x_train_reg, x_test_reg = train_test_split(df3_x, test_size=0.2)
y_train_reg, y_test_reg = train_test_split(df3_reg_y, test_size=0.2)

In [170]:
#Fit the decision tree,
#1st iteration dt = DecisionTreeRegressor(min_samples_split = 20, random_state = 99)
#Predictor gets better when max depth is decreased, but you loose too much info after a certain point.
y_fit = y_train_reg['Seebeck(uV/K)']
X_fit = x_train[features]
dt = DecisionTreeRegressor(max_depth = 10, random_state = 99)
dt_fit = dt.fit(x_train_reg, y_train_reg)

In [171]:
#Check the predictor.
dt_pred = dt.predict(x_test[features])
score = dt.score(x_test_reg, y_test_reg)
print (score)

-0.6167586431370182


In [134]:
#Random Forest is a different call, lets  try this out.
rdt = RandomForestRegressor(random_state=1010, n_estimators=100, max_depth = 10)
rdt_fit = rdt.fit(x_train_reg[features], y_train_reg['Seebeck(uV/K)'])
score = rdt.score(x_test_reg, y_test_reg)
print (score)

-0.14706159007854258


In [169]:
#Random Forest is a different call, lets  try this out.
rdt = RandomForestClassifier(random_state=1010, n_estimators=100, max_depth = 10)
rdt_fit = rdt.fit(x_train[features], y_train['Target'])
score = rdt.score(x_test, y_test)
print (score)

0.0514018691588785


In [20]:
#I was following along with a demo and the author mentioned a way to visalize
#the tree.  I haven't been able to get this to work yet.
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")

In [21]:
visualize_tree(dt, features)

In [51]:
def encode_columns(df, df_mod2, target_column):
    """Change unique column values to integers"""
    targets = df_mod2[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod2[target_column] = df_mod2[target_column].replace(map_to_int)
    
    return

In [138]:
df_mod2 = df.copy()
encode_columns(df, df_mod2, 'preparative_route')

In [139]:
df_mod2['preparative_route'].head()

1098    0
1031    1
328     2
1032    3
124     2
Name: preparative_route, dtype: int64

In [140]:
reduced_df2 = df_mod2[['T(K)', 'Resist(Ohm.cm)', 'Seebeck(uV/K)', 'Resist(400K)', 'Conduct(S/cm)', 'preparative_route', 
                    'cell_volume(A^3)', 'average_atomic_volume', 'space_group', 'number_symmetry_elements', 'bin']]

In [141]:
reduced_df3 = reduced_df2.dropna()

In [142]:
df4, targets = encode_target(reduced_df3, 'bin')

In [159]:
df4_x_1 = pd.DataFrame(df4[['T(K)', 'Resist(Ohm.cm)', 'Resist(400K)', 'Conduct(S/cm)', 'preparative_route', 
                    'cell_volume(A^3)', 'average_atomic_volume', 'space_group', 'number_symmetry_elements']])
df4_y_1 = pd.DataFrame(df4['Target'])
df4_reg_y_1 = pd.DataFrame(df4['Seebeck(uV/K)'])

In [162]:
x_train_1, x_test_1 = train_test_split(df4_x_1, test_size=0.2)
y_train_1, y_test_1 = train_test_split(df4_y_1, test_size=0.2)
y_train_reg_1, y_train_reg_2 = train_test_split(df4_reg_y_1, test_size=0.2)

In [152]:
features_1 = list(df4_x_1.columns[:9])

In [153]:
print (features_1)

['T(K)', 'Resist(Ohm.cm)', 'Resist(400K)', 'Conduct(S/cm)', 'preparative_route', 'cell_volume(A^3)', 'average_atomic_volume', 'space_group', 'number_symmetry_elements']


In [154]:
dt_1 = DecisionTreeClassifier(max_depth = 10, random_state = 99)
dt_fit_1 = dt_1.fit(x_train_1[features_1], y_train_1['Target'],)

In [156]:
dt_pred_1 = dt_fit_1.score(x_test_1, y_test_1)
print (dt_pred_1)

0.014705882352941176


In [165]:
rdt_1 = RandomForestRegressor(random_state=1010, n_estimators=100, max_depth = 10)
rdt_fit_1 = rdt_1.fit(x_train_1[features_1], y_train_reg_1['Seebeck(uV/K)'])

In [166]:
rdt_pred_1 = rdt_fit_1.score(x_test_1, y_train_reg_2)
print (rdt_pred)

-0.1205386828809325
