In [None]:
import re
from collections import Counter
import datetime as dt
import pandas as pd
import pyarrow as pa

def bad_row_handler(row):
    id = row.text.split('\t', 2)[0]
    print(f'Bad row: id={id}')
    return 'skip'

parse_opts = pa.csv.ParseOptions(delimiter='\t', invalid_row_handler=bad_row_handler)
dfs0 = [
    pd.read_csv(f'data/neiss{yr}.tsv', delimiter='\t', dtype='str', engine='python'
                )
    for yr in range(2020, 2024)
]
df0 = pd.concat(dfs0)
df0['Sex'].value_counts()

In [None]:
df0[df0['Sex'].isin(['4.8516', '15.4438'])]

In [None]:

dfs = [   
    pd.read_csv(f'data/neiss{yr}.tsv', delimiter='\t', dtype_backend='pyarrow', engine='pyarrow'
    )
    for yr in range(2020, 2024)
]
df = pd.concat(dfs)
df

In [None]:
df.dtypes

In [None]:
df['Other_Race'].value_counts()

In [None]:
df['CPSC_Case_Number'] = pd.to_numeric(df['CPSC_Case_Number'], errors='coerce')

In [None]:
df[df['CPSC_Case_Number'].isna()] 

In [None]:
df['Treatment_Date'] = pd.to_datetime(df['Treatment_Date'], errors='coerce')
df[df['Treatment_Date'].isna()]

In [None]:
df = df.dropna(subset=['Treatment_Date'])

In [None]:
df.shape

In [None]:
cn_td = ['CPSC_Case_Number', 'Treatment_Date']
cn_td_a = cn_td + ['Age']
monthf = lambda x: dt.datetime.strftime(x[1], '%Y-%m')
df1 = (
    df.loc[:, cn_td_a] .set_index(cn_td).groupby(monthf).count()
).rename(columns={'Age': 'Count'}).sort_values('Count', ascending=False)
df1.iloc[[0, -1]]
                

In [None]:
df = df[df['Age'] < 150]
df.hist('Age')

In [None]:
df['Age'].max()

In [None]:
df.columns

In [None]:
avocado_related = df[df['Narrative_1'].str.contains('AVOCADO')]
avocado_related.loc[:, ['PSU', 'Sex']].groupby('Sex').count()

In [None]:
df['Sex']

In [None]:
df['Sex'].value_counts()

In [None]:
print(avocado_related.shape)
print(avocado_related.query('Age < 18').shape)
print(avocado_related.query('Age >= 18').shape)

In [None]:
month_counts = (
    avocado_related.loc[:, cn_td_a].set_index(cn_td).groupby(monthf).count()
                   .rename(columns={'Age': 'Count'})
)
month_counts.sort_values('Count', ascending=False)

In [None]:
month_counts.index

In [None]:
month_counts.reset_index()

In [None]:
avocado_related_by_month = avocado_related.loc[:, cn_td_a].assign(
    month=avocado_related['Treatment_Date'].dt.month,
    year=avocado_related['Treatment_Date'].dt.year
).drop(['CPSC_Case_Number', 'Treatment_Date'],axis=1).groupby(['month', 'year']).count().unstack(level=1)
avocado_related_by_month.columns = list(range(2020,  2024))
avocado_related_by_month

In [None]:
_ = avocado_related_by_month.plot(kind='bar', stacked=True)

In [None]:
print(avocado_related['Narrative_1'])

In [None]:
avocado_related.columns

In [None]:
bdypt_vc = avocado_related.loc[:, 'Body_Part'].value_counts()


In [None]:
aux = pd.read_excel('data/neiss2023.xlsx', sheet_name='NEISS_FMT')
aux = (
    aux.loc[aux['Format name'] == 'BDYPT']['Format value label'].str.split(' - ', expand=True)
)
aux.iloc[:, 0] = pd.to_numeric(aux.iloc[:, 0])
bdypt_dict = aux.set_index(0).to_dict()[1]
avocado_related['Body_Part'].map(bdypt_dict).value_counts()

In [178]:
wordlists = avocado_related['Narrative_1'].str.split(r'\W+', regex=True)
Counter([x for wl in wordlists for x in wl])


Counter({'AVOCADO': 605,
         'DX': 595,
         'FINGER': 581,
         'KNIFE': 534,
         'LACERATION': 522,
         'HAND': 520,
         'CUTTING': 442,
         'AN': 399,
         'WITH': 368,
         'CUT': 361,
         'OF': 355,
         'LEFT': 347,
         'TO': 345,
         'A': 324,
         'AND': 311,
         'THE': 277,
         'LAC': 269,
         'WAS': 258,
         'YOF': 183,
         'HER': 172,
         'WHEN': 157,
         'WHILE': 150,
         'SLIPPED': 149,
         'INDEX': 122,
         'PIT': 110,
         'LT': 107,
         '': 105,
         'SHE': 91,
         'PT': 88,
         'PALM': 88,
         'TRYING': 87,
         'L': 79,
         'WITHOUT': 78,
         'ACCIDENTALLY': 77,
         'ON': 73,
         'PRESENTS': 69,
         'THUMB': 65,
         'AT': 64,
         'HOME': 60,
         'BODY': 59,
         'OUT': 58,
         'HIS': 58,
         'MIDDLE': 57,
         'FOREIGN': 56,
         'O': 55,
         'W': 54,
       