# MAE on Conventional Methods

In [1]:
from collections import Counter
import re, os
import math
import operator
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, r2_score

## Preparation for Pre-processing 

In [2]:
# Vocabulary of the 86 element symbols that can appear in a formula.
# The position of a symbol in this list is the column it occupies in the
# feature vectors built below (looked up with `elements.index`), so the order
# must not change. He, Ne and Ar are left out, but note that the noble gases
# Kr and Xe ARE present in the list.
elements = ['H','Li','Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl',
            'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe','Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge',
            'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd',
            'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
            'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er','Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 
            'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu']

In [3]:
# Tokenizer for chemical formulas: an element symbol (capital letter plus an
# optional lowercase letter), a run of digits, or a single parenthesis.
# Written as a raw string so "\d" is a regex class, not a string escape.
token = re.compile(r'[A-Z][a-z]?|\d+|[()]')


def count_elements(formula):
    """Count the atoms of each element in a (possibly nested) formula.

    Parameters
    ----------
    formula : str
        Formula such as ``"Ca3(Co(CO3)3)2"``. Characters the tokenizer does
        not recognise are skipped.

    Returns
    -------
    dict
        Element symbol -> integer atom count, keys in order of first
        appearance in the formula.

    Raises
    ------
    ValueError
        If a multiplier has nothing to apply to (at the start of the formula
        or directly after ``(``), or if the parentheses are unbalanced.
    """
    # One list of atoms per open parenthesis level; stack[0] is the top level.
    stack = [[]]
    # Atoms the next multiplier applies to: the previous symbol or the
    # previous closed group. None means "nothing to multiply yet".
    last = None
    for t in token.findall(formula):
        if t.isalpha():
            last = [t]
            stack[-1].append(t)
        elif t.isdigit():
            if last is None:
                raise ValueError(
                    "multiplier %r has nothing to apply to in %r" % (t, formula))
            # One copy is already on the stack, so add the remaining n - 1.
            stack[-1].extend(last * (int(t) - 1))
        elif t == '(':
            stack.append([])
            # A digit right after '(' must not multiply the atom before it.
            last = None
        elif t == ')':
            if len(stack) == 1:
                raise ValueError("unmatched ')' in %r" % (formula,))
            last = stack.pop()
            stack[-1].extend(last)
    if len(stack) != 1:
        # Returning stack[-1] here would silently drop every outer atom.
        raise ValueError("unmatched '(' in %r" % (formula,))
    return dict(Counter(stack[0]))

In [4]:
# Rescale the counts of a composition so that they sum to one
def normalize_elements(dictionary):
    """Turn the values of ``dictionary`` into fractions of their total.

    The dictionary is modified in place and the same object is returned.
    An empty dictionary (total of zero) raises ``ZeroDivisionError``.
    """
    total = sum(dictionary.values())
    scale = 1.0 / total
    # Only existing keys are reassigned, so the key order is untouched.
    dictionary.update({name: amount * scale for name, amount in dictionary.items()})
    return dictionary

In [5]:
# Optional helper: view a composition as a two-column array
def vectorize_elements(dictionary):
    """Return the items of ``dictionary`` as an array with one row per key.

    Column 0 holds the keys and column 1 the values. Both columns share one
    NumPy dtype, so with string keys the values come back as strings too.
    """
    symbols = np.array([name for name in dictionary])
    amounts = np.array([dictionary[name] for name in dictionary])
    # Stacking along axis 1 gives the (n_keys, 2) layout directly.
    return np.stack((symbols, amounts), axis=1)

## Test of the pre-processing functions

In [6]:
#Check Making of the dictionary
elem = count_elements("HF(Mg(H2F3)2As(H2F3)3(Cl((H2F3)3(Br(H2F3)4I)2)3Na)4Cs(H2F3)2)5")
print(elem)

{'H': 1391, 'F': 2086, 'Mg': 5, 'As': 5, 'Cl': 20, 'Br': 120, 'I': 120, 'Na': 20, 'Cs': 5}


In [7]:
#Check the normalization of the values
# normalize_elements rewrites its argument in place, so pass a copy: `elem`
# keeps the raw counts shown by the previous cell and this cell can be re-run.
norm_elem = normalize_elements(dict(elem))
print(norm_elem)

{'H': 0.36876988335100747, 'F': 0.5530222693531284, 'Mg': 0.001325556733828208, 'As': 0.001325556733828208, 'Cl': 0.005302226935312832, 'Br': 0.03181336161187699, 'I': 0.03181336161187699, 'Na': 0.005302226935312832, 'Cs': 0.001325556733828208}


In [8]:
# List of compound formulas used as input
formulas = ['HF(Mg(H2F3)2As(H2F3)3(Cl((H2F3)3(Br(H2F3)4I)2)3Na)4Cs(H2F3)2)5','Ca3(Co(CO3)3)2']

# Parse every formula into element counts and normalize the counts to
# fractions. `compounds` only ever holds the normalized dictionaries, so the
# name no longer changes meaning between statements.
compounds = [normalize_elements(count_elements(formula)) for formula in formulas]

In [9]:
compounds

[{'As': 0.001325556733828208,
  'Br': 0.03181336161187699,
  'Cl': 0.005302226935312832,
  'Cs': 0.001325556733828208,
  'F': 0.5530222693531284,
  'H': 0.36876988335100747,
  'I': 0.03181336161187699,
  'Mg': 0.001325556733828208,
  'Na': 0.005302226935312832},
 {'C': 0.20689655172413793,
  'Ca': 0.10344827586206896,
  'Co': 0.06896551724137931,
  'O': 0.6206896551724138}]

In [10]:
# Turn the list of composition dictionaries into a dense matrix:
# one row per compound, one column per symbol in `elements`.
in_elements = np.zeros(shape=(len(compounds), len(elements)))

for comp_no, compound in enumerate(compounds):
    for key, fraction in compound.items():
        # The column is the position of the symbol in `elements`; a symbol
        # missing from that list raises ValueError here.
        in_elements[comp_no][elements.index(key)] = fraction

data = in_elements

In [11]:
data

array([[0.36876988, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.55302227, 0.00530223, 0.00132556,
        0.        , 0.        , 0.        , 0.        , 0.00530223,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00132556,
        0.        , 0.03181336, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.03181336,
        0.        , 0.00132556, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

## Getting the Train and Test data ready 

In [12]:
# Read a CSV data set and split it into inputs and target
def load_data(csvname):
    """Load ``csvname`` with pandas and split it column-wise.

    Returns a tuple ``(X, y)``: ``X`` holds every column except the last,
    ``y`` holds the last column shaped as a single column ``(n_rows, 1)``.
    """
    table = pd.read_csv(csvname)
    values = np.asarray(table)

    features = values[:, :-1]
    # Adding a trailing unit axis to the 1-D column gives the (n_rows, 1) shape.
    target = values[:, -1].reshape(len(values), 1)

    return features, target

In [13]:
def convert(lst):
    """Pair up a flat ``[key, value, key, value, ...]`` sequence into a dict.

    Items at even positions become keys and the item that follows each one
    becomes its value. A repeated key keeps its last value. A sequence of odd
    length raises ``IndexError`` because the final key has no value.
    """
    res_dct = {}
    for pos in range(0, len(lst), 2):
        res_dct[lst[pos]] = lst[pos + 1]
    return res_dct

In [14]:
# Splits a composition string into element symbols and numeric amounts.
# The amount pattern accepts integers and any number of decimals; the earlier
# pattern `\d+\.\d` kept only the first decimal digit ("0.25" -> "0.2") and
# did not match integer amounts at all.
separate = re.compile(r'[A-Z][a-z]?|\d+(?:\.\d+)?')


def correction(x_train):
    """Convert a column of composition strings into a dense feature matrix.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, 1)
        Each row holds one composition string in its first column.
        Assumed to be alternating symbol/amount pairs such as
        ``"H2.0O1.0"`` -- TODO confirm against the CSV files.

    Returns
    -------
    numpy.ndarray of shape (n_samples, len(elements))
        Row ``i`` holds the amounts of sample ``i``; the column of a symbol
        is its position in the module-level ``elements`` list.

    Raises
    ------
    ValueError
        If a symbol is not in ``elements`` or an amount is not numeric.
    """
    column_of = {symbol: idx for idx, symbol in enumerate(elements)}
    data = np.zeros(shape=(len(x_train), len(elements)))

    # Tokens are paired row by row. No intermediate NumPy array of token
    # lists is built, because rows with different token counts are ragged
    # and cannot be stored in a regular array.
    for comp_no, row in enumerate(x_train):
        composition = convert(separate.findall(row[0]))
        for key, amount in composition.items():
            if key not in column_of:
                raise ValueError("unknown element symbol %r in %r" % (key, row[0]))
            data[comp_no][column_of[key]] = float(amount)

    return data

In [15]:
x_train, y_train = load_data('train_set.csv')
x_test, y_test = load_data('test_set.csv')

In [16]:
new_x_train = correction(x_train)
new_x_test = correction(x_test)

In [17]:
new_y_train = y_train
new_y_test = y_test

In [18]:
# Flatten the targets to 1-D for scikit-learn. reshape() returns a new array
# object, so y_train / y_test (which new_y_* aliased until now) keep their
# (n, 1) shape instead of being reshaped behind the scenes.
new_y_train = new_y_train.reshape(len(new_y_train))
new_y_test = new_y_test.reshape(len(new_y_test))

## Linear Regression

In [68]:
from sklearn.linear_model import LinearRegression

In [69]:
lr = linear_model.LinearRegression().fit(new_x_train, new_y_train)

In [70]:
y_pred_lr = lr.predict(new_x_test)

In [71]:
MAE_LR = mean_absolute_error(new_y_test, y_pred_lr)

In [72]:
MAE_LR

0.3232836368103759

## SGD Regression

In [73]:
from sklearn.linear_model import SGDClassifier

In [74]:
# SGD shuffles the training data between epochs; a fixed seed makes the fit,
# and therefore the reported MAE, reproducible across runs.
sgdr = linear_model.SGDRegressor(max_iter=1000, tol=1e-5, random_state=86).fit(new_x_train, new_y_train)

In [75]:
y_pred_sgdr = sgdr.predict(new_x_test)

In [76]:
MAE_SGDR = mean_absolute_error(new_y_test, y_pred_sgdr)

In [77]:
MAE_SGDR

0.32392170895962263

## Elastic Net

In [19]:
from sklearn.linear_model import ElasticNet

In [20]:
en = linear_model.ElasticNet(random_state=86).fit(new_x_train, new_y_train)

In [21]:
y_pred_en = en.predict(new_x_test)

In [22]:
MAE_EN = mean_absolute_error(new_y_test, y_pred_en)

In [23]:
MAE_EN

0.5500259191955399

## AdaBoost

In [83]:
from sklearn.ensemble import AdaBoostRegressor

In [84]:
ad = AdaBoostRegressor(random_state=0, n_estimators=86).fit(new_x_train, new_y_train) 

In [85]:
y_pred_ad = ad.predict(new_x_test)

In [86]:
MAE_AD = mean_absolute_error(new_y_test, y_pred_ad)

In [87]:
MAE_AD

0.47110451318681695

## Ridge

In [27]:
from sklearn.linear_model import Ridge

In [28]:
r = Ridge(alpha=1.0, random_state = 86).fit(new_x_train, new_y_train)

In [29]:
y_pred_r = r.predict(new_x_test)

In [30]:
MAE_R = mean_absolute_error(new_y_test, y_pred_r)

In [31]:
MAE_R

0.3232600209864702

## RBFSVM

In [19]:
from sklearn.svm import SVR

In [None]:
rbfsvm = SVR(kernel='rbf', C=1.0, gamma='scale', epsilon=.1).fit(new_x_train, new_y_train)

In [None]:
y_pred_rbfsvm = rbfsvm.predict(new_x_test)

In [None]:
MAE_RBFSVM = mean_absolute_error(new_y_test, y_pred_rbfsvm)

In [None]:
MAE_RBFSVM

## DecisionTree

In [93]:
from sklearn.tree import DecisionTreeRegressor

In [127]:
# Fix the seed so that the tree, and the MAE reported below, can be
# reproduced from a fresh kernel.
dt = DecisionTreeRegressor(max_depth=86, random_state=86).fit(new_x_train, new_y_train)

In [128]:
y_pred_dt = dt.predict(new_x_test)

In [129]:
MAE_DT = mean_absolute_error(new_y_test, y_pred_dt)

In [130]:
MAE_DT

0.18524110887321077

## ExtraTree

In [19]:
from sklearn.tree import ExtraTreeRegressor

In [20]:
# Extra trees pick their splits at random, so the fit changes on every run
# without a seed; fix it to make the MAE reported below reproducible.
et = ExtraTreeRegressor(max_depth=86, random_state=86).fit(new_x_train, new_y_train)

In [21]:
y_pred_et = et.predict(new_x_test)

In [22]:
MAE_ET = mean_absolute_error(new_y_test, y_pred_et)

In [23]:
MAE_ET

0.1877740160059495

## Bagging

In [104]:
from sklearn.ensemble import BaggingRegressor

In [106]:
b = BaggingRegressor(random_state=86).fit(new_x_train, new_y_train)

In [107]:
y_pred_b = b.predict(new_x_test)

In [108]:
MAE_B = mean_absolute_error(new_y_test, y_pred_b)

In [109]:
MAE_B

0.16006913317172708

## Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
rf = RandomForestRegressor(max_depth=86, random_state=0, n_estimators=86).fit(new_x_train, new_y_train)

In [21]:
y_pred_rf = rf.predict(new_x_test)

In [22]:
MAE_RF = mean_absolute_error(new_y_test, y_pred_rf)

In [23]:
MAE_RF

0.1578311209238195