# Notebook to clean SEER incidence data

In [1]:
!pwd

/Users/patxiestomba/canceritos/notebooks


In [2]:
import pandas as pd

In [3]:
# Path (relative to the notebook folder) of the CSV describing the
# fixed-width record layout: one row per field with its name, positions and length.
columns_file = "../data/raw/seer/incidence/tabula-seerdic-7-9.csv"

In [4]:
# Load the field-layout table; the file is semicolon-separated.
columns_locations = pd.read_csv(columns_file, sep=";")
# Display the table to check the parsed 'Item Name', 'Positions' and 'Length' columns.
columns_locations

Unnamed: 0,Item Name,Positions,Length
0,Patient ID number,1-8,8
1,Registry ID,9-18,10
2,Marital Status at DX,19-19,1
3,Race/Ethnicity,20-21,2
4,Spanish/Hispanic Origin,22-22,1
5,NHIA Derived Hispanic Origin,23-23,1
6,Sex,24-24,1
7,Age at diagnosis,25-27,3
8,Year of Birth,28-31,4
9,Birth Place,32-34,3


In [5]:
import re

In [6]:
# Single raw incidence file used below to develop and check the parsing steps.
data_file = '../data/raw/seer/incidence/yr1973_2008.seer9/BREAST.TXT'

In [7]:
# Read the whole raw file into memory as one string (platform default encoding).
with open(data_file) as fh:
    data_raw = fh.read()

In [8]:
# Split the raw text into a list of lines.
# NOTE(review): `lines` is not used by any later cell visible here; the parsing
# below works on `data_raw` directly — confirm whether this cell is still needed.
lines = data_raw.split('\n')

In [9]:
# Width of every field, in layout-table order, as a NumPy array.
field_lengths = columns_locations['Length'].values

In [10]:
field_lengths

array([ 8, 10,  1,  2,  1,  1,  1,  3,  4,  3,  2,  2,  4,  4,  1,  4,  1,
        4,  1,  1,  1,  1,  3,  2,  2,  1,  2,  2, 13,  2,  4,  1,  1,  1,
        1,  3,  3,  3,  2,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  1,
        1,  1,  1,  1,  6,  6,  6,  2,  1,  1,  2,  1,  1,  1,  1,  1,  2,
        2,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  2,  5,  5,  4,  4,  3,  3,  3,  3,  1,  1,  2,  2,  3,  1,  1,
        1,  1,  2,  2,  1,  1,  2,  1,  5,  4,  5,  5,  1,  1,  1,  2,  2,
        1,  1,  1,  1,  1])

In [11]:
# Build the record pattern: one capture group of exactly `width` characters
# per field, in layout order, followed by the newline that ends the record.
regex = ''.join('(.{{{}}})'.format(width) for width in field_lengths) + '\\n'

In [12]:
# Extract every record that fits the layout; with several capture groups
# re.findall returns one tuple of field strings per match.
# Text that does not fit the pattern is skipped without any error.
all_data = re.findall(regex, data_raw)

In [13]:
# Field names, in the same order as `field_lengths`, used as DataFrame columns.
column_names = columns_locations['Item Name'].values

In [14]:
# One row per parsed record, one column per field; all values are still strings.
df = pd.DataFrame(all_data, columns=column_names)

In [15]:
df.head()

Unnamed: 0,Patient ID number,Registry ID,Marital Status at DX,Race/Ethnicity,Spanish/Hispanic Origin,NHIA Derived Hispanic Origin,Sex,Age at diagnosis,Year of Birth,Birth Place,...,Vital Status recode,IHS Link,Summary stage 2000 (1998+),AYA site recode,Lymphoma subtype recode,SEER Cause-Specific Death Classification,SEER Other Cause of Death Classification,CS Tumor Size/Ext Eval,CS Lymph Nodes Eval,CS Mets Eval
0,7000004,1502,2,1,0,0,2,60,1932,6,...,1,0.0,,36,99,9,9,,,
1,7000046,1502,5,1,0,0,2,76,1920,7,...,1,0.0,,36,99,9,9,,,
2,7000057,1502,5,1,0,0,2,70,1924,5,...,4,0.0,,36,99,9,9,,,
3,7000072,1502,2,1,0,0,2,59,1917,7,...,4,,,36,99,9,9,,,
4,7000096,1502,5,2,0,0,2,61,1924,23,...,4,,,99,99,9,9,,,


In [16]:
def preprocess_file(input_file, output_file, pattern=None, names=None):
    """Convert one fixed-width incidence file into a CSV file.

    The input is read fully into memory, split into fields with a regular
    expression that has one capture group per field, and written with
    ``DataFrame.to_csv`` (default settings, so the integer row index is
    written as the first column).

    Parameters
    ----------
    input_file : str
        Path of the fixed-width text file to read.
    output_file : str
        Path of the CSV file to write.
    pattern : str, optional
        Record pattern with one capture group per field. Defaults to the
        notebook-level ``regex``.
    names : sequence of str, optional
        Column names, one per capture group. Defaults to the notebook-level
        ``column_names``.

    Warns
    -----
    UserWarning
        If the number of parsed records differs from the number of non-empty
        lines in the input, i.e. some lines did not fit the layout.
    """
    import warnings

    # Fall back to the layout built earlier in the notebook when the caller
    # does not pass one explicitly.
    if pattern is None:
        pattern = regex
    if names is None:
        names = column_names

    with open(input_file) as fh:
        data_raw = fh.read()

    # The notebook-level pattern ends with a newline, so a final record that
    # is not newline-terminated would be dropped silently without this.
    if data_raw and not data_raw.endswith('\n'):
        data_raw += '\n'

    all_data = re.findall(pattern, data_raw)

    # re.findall skips whatever does not match; make that visible instead of
    # silently producing a shorter table.
    n_lines = sum(1 for line in data_raw.split('\n') if line)
    if len(all_data) != n_lines:
        warnings.warn(
            '{}: parsed {} records from {} non-empty lines'.format(
                input_file, len(all_data), n_lines))

    df = pd.DataFrame(all_data, columns=names)
    df.to_csv(output_file)

In [17]:
!ls ../data

[34mexternal[m[m  [34minterim[m[m   [34mprocessed[m[m [34mraw[m[m


In [18]:
# Where the raw fixed-width files live and where the converted CSVs are written.
root = "../data/raw/seer/incidence"
destiny = "../data/processed/seer/incidence"

# File names processed inside every registry folder.
files = [
    'BREAST.TXT',
    'COLRECT.TXT',
    'DIGOTHR.TXT',
    'FEMGEN.TXT',
    'LYMYLEUK.TXT',
    'MALEGEN.TXT',
    'OTHER.TXT',
    'RESPIR.TXT',
    'URINARY.TXT',
]

# Registry folders under `root`; each one is expected to hold all of `files`.
folders = [
    'yr1992_2008.sj_la_rg_ak',
    'yr2005.lo_2nd_half',
    'yr1973_2008.seer9',
    'yr2000_2008.ca_ky_lo_nj',
]

In [19]:
import os

In [20]:
!mkdir -p $destiny

In [101]:
# Convert every file of every registry folder, mirroring the raw folder
# layout under `destiny`.
for folder in folders:
    dest_folder = os.path.join(destiny, folder)
    # makedirs(exist_ok=True) keeps this cell re-runnable: a plain os.mkdir
    # raises FileExistsError once the folder exists from a previous run, and
    # also fails when the parent `destiny` folder has not been created yet.
    os.makedirs(dest_folder, exist_ok=True)

    for fname in files:
        origin_file = os.path.join(root, folder, fname)
        dest_file = os.path.join(dest_folder, fname)
        # end='' keeps the cursor on the same line so 'Done :)' is appended
        # once the conversion has finished.
        print('Transforming {}... '.format(origin_file), end='')
        preprocess_file(origin_file, dest_file)
        print('Done :)')

Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/BREAST.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/COLRECT.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/DIGOTHR.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/FEMGEN.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/LYMYLEUK.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/MALEGEN.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/OTHER.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/RESPIR.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr1992_2008.sj_la_rg_ak/URINARY.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr2005.lo_2nd_half/BREAST.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr2005.lo_2nd_half/COLRECT.TXT... Done :)
Transforming ../data/raw/seer/incidence/yr2005.lo_2nd_half