### Extract year from one file

In [1]:
import re

def get_year(filename):
    year = re.findall('(\d{4}).*', filename)[0]
    year = int(year)
    return year

filename = 'PENNIES/1960_s/1960PennyLincolnUp.LampE5.5.200Scan.07172019.P1.ChangedAngle_HRD10591_13-10-22-820.txt'
get_year(filename)    

1960

### Extract data from one file

In [2]:
import pandas as pd

def get_data(filename):
    df = pd.read_csv(filename, skiprows=13, sep='\t', names=['freq', 'intensity'])
    df = df.set_index('freq')
    return df

df = get_data(filename)
df.head()

Unnamed: 0_level_0,intensity
freq,Unnamed: 1_level_1
223.165,-9
223.4,-9
223.635,-9
223.869,-9
224.104,-7


### Process all files

In [3]:
from pathlib import Path

intensities = []
years = []

for filename in Path('PENNIES').glob('**/*.txt'):
    year = get_year(filename.name)
    years.append(year)
    df = get_data(filename)
    intensities.append(df['intensity'])

In [4]:
df = pd.concat(intensities, axis=1).transpose()
#df.shape, len(years)
df

freq,223.165,223.4,223.635,223.86900000000003,224.104,224.338,224.57299999999998,224.808,225.042,225.27700000000002,...,671.07,671.2719999999999,671.475,671.677,671.88,672.082,672.284,672.487,672.689,672.8919999999999
intensity,-15.62,-15.62,-15.62,-5.62,-12.62,-32.62,-20.62,-21.62,-11.62,-17.62,...,74.38,89.38,77.38,83.38,78.38,75.38,78.38,88.38,77.38,72.38
intensity,-3.25,-3.25,-3.25,-11.25,-34.25,-10.25,-22.25,-8.25,-20.25,-3.25,...,83.75,92.75,82.75,86.75,79.75,72.75,96.75,92.75,69.75,67.75
intensity,-21.62,-21.62,-21.62,-19.62,-20.62,-29.62,-14.62,-4.62,2.38,0.38,...,90.38,84.38,74.38,77.38,75.38,77.38,86.38,75.38,91.38,81.38
intensity,-19.62,-19.62,-19.62,-17.62,-20.62,-25.62,-16.62,-13.62,-16.62,-1.62,...,85.38,85.38,84.38,86.38,76.38,83.38,89.38,91.38,87.38,84.38
intensity,-3.69,-3.69,-3.69,-12.69,-17.69,-12.69,-13.69,-16.69,-7.69,-10.69,...,88.31,84.31,79.31,84.31,80.31,83.31,87.31,79.31,78.31,85.31
intensity,-20.75,-20.75,-20.75,-30.75,-11.75,-16.75,-15.75,-9.75,-1.75,-0.75,...,89.25,81.25,86.25,85.25,84.25,83.25,84.25,91.25,86.25,81.25
intensity,-6.56,-6.56,-6.56,-19.56,-18.56,-11.56,-22.56,-20.56,-7.56,-13.56,...,80.44,78.44,81.44,74.44,65.44,80.44,78.44,74.44,83.44,74.44
intensity,-27.94,-27.94,-27.94,-8.94,-21.94,-2.94,-1.94,-23.94,-16.94,3.06,...,100.06,80.06,95.06,83.06,91.06,80.06,80.06,83.06,87.06,90.06
intensity,-25.56,-25.56,-25.56,-20.56,-16.56,-20.56,-7.56,-3.56,-8.56,-14.56,...,84.44,84.44,86.44,78.44,72.44,72.44,70.44,79.44,85.44,72.44
intensity,-12.62,-12.62,-12.62,-6.62,-17.62,-17.62,-14.62,-23.62,-14.62,4.38,...,80.38,97.38,88.38,74.38,75.38,73.38,80.38,71.38,76.38,84.38


### Train and test models 

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

classifiers = [
    KNeighborsClassifier(3),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel="linear", C=0.025),
]

X = df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = years
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    name = clf.__class__.__name__
    print(name, score)

KNeighborsClassifier 0.9146537842190016
SVC 0.9859098228663447
SVC 0.06521739130434782
DecisionTreeClassifier 0.23752012882447665
RandomForestClassifier 0.9432367149758454
MLPClassifier 0.9710144927536232
AdaBoostClassifier 0.0966183574879227
GaussianNB 0.5668276972624798


### Test a new file

In [140]:
filename = 'PENNIES/1980_s/1986.PennyLincolnUp.Lamp.En5.5.200Scan.071819.P1_HRD10591_17-08-12-973.txt'
intensities = get_data(filename).transpose()
intensities = scaler.transform(intensities)
year = clf.predict(intensities)[0]
year

1986