In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
#creates all the file names
file_name_root = 'Pennies/All_Penny_Data/penny_data_'
all_file_names = []
penny_id = []
number_of_pennies = 6210
for i in range(1,1+number_of_pennies):
    all_file_names.append(file_name_root + str(i) + ".txt")
    penny_id.append(i)
    
len(all_file_names), len(penny_id)

(6210, 6210)

In [3]:
penny_index_names = ['Penny_Year', 'Date_Sample_Taken', "User", "Spectrometer", "Trigger_Mode",
               "Integration_Time_seconds","Scans_To_Average", "Electric_Dark_Correction_Enabled", 
               "Nonlinearity_Correction_enabled","Boxcar_Width", "XAxis_Mode", "Number_Of_Pixels_In_Spectrum"]
def create_penny_series(file_name, get_attributes):
    #creates list with penny data and splits it into 2 lists: attributes and numbers
    penny_lines = [line.strip() for line in open(file_name, 'r') if len(line)>1]
    penny_lines_attributes = penny_lines[:12]
    penny_lines_numbers = penny_lines[13:]
    
    #takes the penny info and creates a pandas series with detailed penny info
    penny_attributes_list = []
    penny_attributes_list.append(re.search(r"\d{4}.*", penny_lines_attributes[0])[0])
    for element in penny_lines_attributes[1:]:
        penny_attributes_list.append("".join(element.split(":")[1:]).strip())
    penny_attributes_series = pd.Series(penny_attributes_list, index=penny_index_names)
    
    #takes the penny number pairs and creates a series of coordinates (x->index, y->values)
    penny_coordinates_x = []
    penny_coordinates_y = []
    penny_coordinates = []
    for element in penny_lines_numbers:
        coordinates = element.split('\t')
        penny_coordinates_x.append(coordinates[0])
        penny_coordinates_y.append(coordinates[1])
        penny_coordinates.append((coordinates[0], coordinates[1]))
    penny_coordinates_series = pd.Series(penny_coordinates)
                            
    
    if get_attributes:
        return penny_attributes_series
    else:
        return penny_coordinates_series

In [4]:
#creates a series for each penny (attributes) and stores it in a list
all_penny_attributes_series = []
for penny in all_file_names:
    all_penny_attributes_series.append(create_penny_series(penny, True))

#creates a dataframe to store all the penny attributes series that were just created (the index is the penny id)
penny_attributes_df = (pd.DataFrame(all_penny_attributes_series, index=penny_id).reset_index()
                       .rename(columns={"index":"Penny_ID"}).set_index("Penny_ID"))

#creates a binary copper column for all the pennys because we know they have copper
penny_attributes_df["Copper?"] = True

In [6]:
#creates all the column names for later ("coordinate 1, coordinate 2, etc")
all_column_names_coordinates = []
column_names_coordinates_root = "Coordinate_"
for i in range(1,2049):
    all_column_names_coordinates.append(column_names_coordinates_root + str(i))

#create a series for each penny (coordinates) and stores it in a list    
all_penny_coordinates_series = []
for penny in all_file_names:
    all_penny_coordinates_series.append(create_penny_series(penny, False))

#creates a dataframe to store all the penny coordinates series that were just created (the index is the penny id) 
penny_coordinates_df = (pd.DataFrame(all_penny_coordinates_series, index=penny_id)
                        .reset_index().rename(columns={"index":"Penny_ID"}).set_index("Penny_ID"))
penny_coordinates_df.columns = all_column_names_coordinates

In [7]:
#merge the two dataframes into 1 so that all the penny data is in one dataframe
all_penny_data_df = penny_attributes_df.merge(penny_coordinates_df, 
                                            how="outer", right_index=True, 
                                            left_index=True)

In [8]:
print(all_penny_data_df.shape, penny_attributes_df.shape, penny_coordinates_df.shape)
all_penny_data_df.head()

(6210, 2061) (6210, 13) (6210, 2048)


Unnamed: 0_level_0,Penny_Year,Date_Sample_Taken,User,Spectrometer,Triger_Mode,Integration_Time_seconds,Scans_To_Average,Electric_Dark_Correction_Enabled,Nonlinearity_Correction_enabled,Boxcar_Width,...,Coordinate_2039,Coordinate_2040,Coordinate_2041,Coordinate_2042,Coordinate_2043,Coordinate_2044,Coordinate_2045,Coordinate_2046,Coordinate_2047,Coordinate_2048
Penny_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1960,Thu Jul 18 131022 EDT 2019,cochatt,HR+D1059,1,0.1,1,True,False,0,...,"(671.07, 103)","(671.272, 81)","(671.475, 81)","(671.677, 93)","(671.88, 87)","(672.082, 90)","(672.284, 83)","(672.487, 92)","(672.689, 94)","(672.892, 91)"
2,1960,Thu Jul 18 131406 EDT 2019,cochatt,HR+D1059,1,0.1,1,True,False,0,...,"(671.07, 83.38)","(671.272, 84.38)","(671.475, 87.38)","(671.677, 90.38)","(671.88, 68.38)","(672.082, 83.38)","(672.284, 87.38)","(672.487, 88.38)","(672.689, 89.38)","(672.892, 86.38)"
3,1960,Thu Jul 18 131407 EDT 2019,cochatt,HR+D1059,1,0.1,1,True,False,0,...,"(671.07, 94.5)","(671.272, 81.5)","(671.475, 75.5)","(671.677, 73.5)","(671.88, 83.5)","(672.082, 84.5)","(672.284, 88.5)","(672.487, 100.5)","(672.689, 77.5)","(672.892, 78.5)"
4,1960,Thu Jul 18 131408 EDT 2019,cochatt,HR+D1059,1,0.1,1,True,False,0,...,"(671.07, 90.31)","(671.272, 92.31)","(671.475, 79.31)","(671.677, 86.31)","(671.88, 71.31)","(672.082, 77.31)","(672.284, 81.31)","(672.487, 86.31)","(672.689, 78.31)","(672.892, 78.31)"
5,1960,Thu Jul 18 131409 EDT 2019,cochatt,HR+D1059,1,0.1,1,True,False,0,...,"(671.07, 93.81)","(671.272, 88.81)","(671.475, 87.81)","(671.677, 78.81)","(671.88, 81.81)","(672.082, 97.81)","(672.284, 85.81)","(672.487, 87.81)","(672.689, 69.81)","(672.892, 72.81)"


In [9]:
all_penny_data_df.to_csv("Penny Data (1960s, 1970s, 1980s).csv")