### Extract year from one file

In [2]:
import re

def get_element_info(filename):
    filename = filename.lower()
    #liquid->0, solid->1
    if "cu(no3)2" in filename:
        return 0, 'Copper'
    if "zn(no3)2" in filename:
        return 0, 'Zinc'
    if "ni(no3)2" in filename:
        return 0, 'Nickel'
    if "copper" in filename:
        return 1, 'Copper'
    if "unknownnail" in filename:
        return 1, 'Iron'
    if "nickelspatula" in filename:
        return 1, 'Nickel'
    if "zinc" in filename:
        return 1, 'Zinc'
    else:
        return NaN
    

filenames = ["0.5MCu(NO3)2.Standard.Lamp.E10.CV._HRD10591_16-07-00-754.txt",
           "copper_HRD10591_16-49-14-800.txt",
           "copper.mounting.energy7_HRD10591_16-51-02-544.txt",
           "FlinnAP5935BZINC.07152019.LAMPEdge.EN5.5scans.P1_HRD10591_14-59-59-580.txt",
           "FlinnAP9092KCOPPER.07152019.LAMP.EN7.P1_HRD10591_14-44-41-613.txt",
           "Ni(No3)2Standard.Agarose3_.GreenCap.LampEn10.200Scan.07232019.P1_HRD10591_13-18-38-664.txt",
            "NickelSpatula_HRD10591_16-33-32-126.txt",
            "Zn(NO3)2Standard.Agarose3_.GreenCap.LampEnergy5.5.E.O_HRD10591_15-02-32-768.txt",
            "UnknownNail75mm.07152019.QswitchSynchronous.EN10.5scans.P1_HRD10591_15-21-15-324.txt"]

for file in filenames:
    print(get_element_info(file))

(0, 'Copper')
(1, 'Copper')
(1, 'Copper')
(1, 'Zinc')
(1, 'Copper')
(0, 'Nickel')
(1, 'Nickel')
(0, 'Zinc')
(1, 'Iron')


### Extract data from one file

In [3]:
import pandas as pd

def get_data(filename):
    df = pd.read_csv(filename, skiprows=13, sep='\t', names=['freq', 'intensity'])
    df = df.set_index('freq')
    return df

#for file in filenames:
    #print(get_data("All Data/" + file).head())

### Process all files

In [4]:
from pathlib import Path

intensities = []
states_of_matter = []
elements = []

for filename in Path('All Data').glob('**/*.txt'):
    element_info = get_element_info(filename.name)
    states_of_matter.append(element_info[0])
    elements.append(element_info[1])
    df = get_data(filename)
    intensities.append(df['intensity'])

In [5]:
df = pd.concat(intensities, axis=1).transpose()
#df["State of Matter"] = states_of_matter
df.shape, len(elements), len(states_of_matter)

((1714, 2048), 1714, 1714)

### Train and test models 

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

classifiers = [
    KNeighborsClassifier(3),
    #SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel="linear", C=0.025),
    GradientBoostingClassifier()
]

X = df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = elements
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = np.mean(cross_val_score(clf, X_test, y_test))
    name = clf.__class__.__name__
    print(name, score)

KNeighborsClassifier 0.8352911263666413
DecisionTreeClassifier 0.8557843885075007
RandomForestClassifier 0.9708619374523265
MLPClassifier 0.883447749809306
AdaBoostClassifier 0.781121281464531
GaussianNB 0.26526824307144675
SVC 0.8426264937706586
GradientBoostingClassifier 0.9709127892194255


### Multi-Label Classifier

#### Creating 4 seperate label lists in order to train 4 binary classifiers

In [14]:
print(elements)

['Nickel', 'Zinc', 'Nickel', 'Zinc', 'Copper', 'Nickel', 'Zinc', 'Nickel', 'Copper', 'Nickel', 'Zinc', 'Copper', 'Nickel', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Copper', 'Zinc', 'Nickel', 'Zinc', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Copper', 'Zinc', 'Copper', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Nickel', 'Zinc', 'Copper', 'Nickel', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Nickel', 'Zinc', 'Nickel', 'Copper', 'Nickel', 'Nickel', 'Copper', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Zinc', 'Copper', 'Nickel', 'Zinc', 'Copper', 'Copper', 'Nickel', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Nickel', 'Copper', 'Zinc', 'Zinc', 'Copper', 'Zinc', 'Nickel', 'Zinc', 'Copper', 'Nickel', 'Zinc', 'Nickel', 'Nickel', 'Nickel', 'Zinc', 'Nickel', 'Nickel', 'Copper', 'Ni

In [32]:
copper_elements = []
iron_elements = []
zinc_elements = []
nickel_elements = []

for element in elements:
    element = element.lower()
    if element == "copper":
        copper_elements.append(1)
        iron_elements.append(0)
        zinc_elements.append(0)
        nickel_elements.append(0)
    if element == "iron":
        iron_elements.append(1)
        copper_elements.append(0)
        zinc_elements.append(0)
        nickel_elements.append(0)
    if element == "zinc":
        zinc_elements.append(1)
        copper_elements.append(0)
        iron_elements.append(0)
        nickel_elements.append(0)
    if element == "nickel":
        nickel_elements.append(1)
        zinc_elements.append(0)
        copper_elements.append(0)
        iron_elements.append(0)

len(elements), len(copper_elements), len(iron_elements), len(zinc_elements), len(nickel_elements)

(1714, 1714, 1714, 1714, 1714)

In [None]:
#create one binary classifier for each element using the sets just created
#change the y values

In [None]:
classifiers = [
    KNeighborsClassifier(3),
    #SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel="linear", C=0.025),
    GradientBoostingClassifier()
]

X = df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = iron_elements
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = np.mean(cross_val_score(clf, X_test, y_test))
    name = clf.__class__.__name__
    print(name, score)

### Testing Models

In [11]:
filenames = ["NickelSpatula_HRD10591_16-33-32-126.txt",
            "Zn(NO3)2Standard.Agarose3_.GreenCap.LampEnergy5.5.E.O_HRD10591_15-02-32-768.txt",
            "UnknownNail75mm.07152019.QswitchSynchronous.EN10.5scans.P1_HRD10591_15-21-15-324.txt"]


test_states_of_matter = []
test_dfs = []
test_elements = []

for file in filenames:
    test_states_of_matter.append(get_element_info("All Data/" + file)[0])
    test_elements.append(get_element_info("All Data/" + file)[1])
    test_dfs.append(get_data("All Data/" + file))
    #print(get_data("All Data/" + file).head(1).values)

test_df = pd.concat(test_dfs, axis=1).transpose()
#test_df["State of Matter"] = states_of_matter

test_data = test_df.values
scaler = StandardScaler()
test_data = scaler.fit_transform(test_data)

for clf in classifiers:
    name = clf.__class__.__name__
    print(name, clf.score(test_data, test_elements))

KNeighborsClassifier 0.6666666666666666
DecisionTreeClassifier 1.0
RandomForestClassifier 0.6666666666666666
MLPClassifier 0.3333333333333333
AdaBoostClassifier 0.6666666666666666
GaussianNB 0.3333333333333333
SVC 0.6666666666666666
GradientBoostingClassifier 1.0


### Fine tune RandomForestClassifier