# Preprocessing US mining dataset

* Clean the dataset.
* Subset the dataset to the meaningful columns.
* Decide which injury codes count as Major or Minor for the classification task.
* Split the dataset into training, validation and test sets.

In [None]:
import pandas as pd

# Load the raw accident records from the CSV file into a pandas DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv('us_data_2000.csv')


In [None]:
# Show every column name of the raw frame as a plain Python list.
print(df.columns.tolist())


['MINE_ID', 'CONTROLLER_ID', 'CONTROLLER_NAME', 'OPERATOR_ID', 'OPERATOR_NAME', 'CONTRACTOR_ID', 'DOCUMENT_NO', 'SUBUNIT_CD', 'SUBUNIT', 'ACCIDENT_DT', 'CAL_YR', 'CAL_QTR', 'FISCAL_YR', 'FISCAL_QTR', 'ACCIDENT_TIME', 'DEGREE_INJURY_CD', 'DEGREE_INJURY', 'FIPS_STATE_CD', 'UG_LOCATION_CD', 'UG_LOCATION', 'UG_MINING_METHOD_CD', 'UG_MINING_METHOD', 'MINING_EQUIP_CD', 'MINING_EQUIP', 'EQUIP_MFR_CD', 'EQUIP_MFR_NAME', 'EQUIP_MODEL_NO', 'SHIFT_BEGIN_TIME', 'CLASSIFICATION_CD', 'CLASSIFICATION', 'ACCIDENT_TYPE_CD', 'ACCIDENT_TYPE', 'NO_INJURIES', 'TOT_EXPER', 'MINE_EXPER', 'JOB_EXPER', 'OCCUPATION_CD', 'OCCUPATION', 'ACTIVITY_CD', 'ACTIVITY', 'INJURY_SOURCE_CD', 'INJURY_SOURCE', 'NATURE_INJURY_CD', 'NATURE_INJURY', 'INJ_BODY_PART_CD', 'INJ_BODY_PART', 'SCHEDULE_CHARGE', 'DAYS_RESTRICT', 'DAYS_LOST', 'TRANS_TERM', 'RETURN_TO_WORK_DT', 'IMMED_NOTIFY_CD', 'IMMED_NOTIFY', 'INVEST_BEGIN_DT', 'NARRATIVE', 'CLOSED_DOC_NO', 'COAL_METAL_IND']


### Subsetting the data with the meaningful columns (Narrative, DEGREE_INJURY_CD, DEGREE_INJURY)

In [None]:
# Keep only the free-text narrative and the two injury-severity columns.
selected_columns = ["NARRATIVE", "DEGREE_INJURY_CD", "DEGREE_INJURY"]
data = df[selected_columns]

In [None]:
# Preview the first rows of the three-column subset.
data.head()

Unnamed: 0,NARRATIVE,DEGREE_INJURY_CD,DEGREE_INJURY
0,Employee was cleaning up at the Primary Crushe...,5,DAYS RESTRICTED ACTIVITY ONLY
1,Handle of sledgehammer broke and head of hamme...,6,"NO DYS AWY FRM WRK,NO RSTR ACT"
2,EMPLOYEE WAS CLIMBING DOWN A LADDER AND WHEN H...,3,DAYS AWAY FROM WORK ONLY
3,HE PULLED A BACK MUSCLE WHILE STACKING BAGS OF...,5,DAYS RESTRICTED ACTIVITY ONLY
4,EE hands began to break out in a rash after he...,5,DAYS RESTRICTED ACTIVITY ONLY


In [None]:
# Count how often each (code, description) pair occurs.
data.value_counts(subset=['DEGREE_INJURY_CD', 'DEGREE_INJURY'])


DEGREE_INJURY_CD  DEGREE_INJURY                 
3                 DAYS AWAY FROM WORK ONLY          595
6                 NO DYS AWY FRM WRK,NO RSTR ACT    552
5                 DAYS RESTRICTED ACTIVITY ONLY     359
0                 ACCIDENT ONLY                     219
4                 DYS AWY FRM WRK & RESTRCTD ACT    145
7                 OCCUPATNAL ILLNESS NOT DEG 1-6     57
10                ALL OTHER CASES (INCL 1ST AID)     20
2                 PERM TOT OR PERM PRTL DISABLTY     18
1                 FATALITY                           11
?                 NO VALUE FOUND                     11
8                 INJURIES DUE TO NATURAL CAUSES     10
9                 INJURIES INVOLVNG NONEMPLOYEES      3
dtype: int64

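**The codes and descriptions are consistent: each DEGREE_INJURY_CD value appears with exactly one DEGREE_INJURY description, so the code alone can be used to define the seriousness of an injury.**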

### There are 11 observations with an unknown injury degree (code `?`) in the data frame, so those rows are deleted.

In [None]:
# Drop the rows whose injury code is the '?' placeholder ("NO VALUE FOUND").
# .copy() makes the result an independent frame rather than a slice of `data`,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
# NOTE: this rebinds `df`, replacing the raw frame loaded from the CSV.
df = data[~data['DEGREE_INJURY_CD'].isin(['?'])].copy()

In [None]:
# Re-count the (code, description) pairs to confirm the '?' rows are gone.
df.value_counts(subset=['DEGREE_INJURY_CD', 'DEGREE_INJURY'])

DEGREE_INJURY_CD  DEGREE_INJURY                 
3                 DAYS AWAY FROM WORK ONLY          595
6                 NO DYS AWY FRM WRK,NO RSTR ACT    552
5                 DAYS RESTRICTED ACTIVITY ONLY     359
0                 ACCIDENT ONLY                     219
4                 DYS AWY FRM WRK & RESTRCTD ACT    145
7                 OCCUPATNAL ILLNESS NOT DEG 1-6     57
10                ALL OTHER CASES (INCL 1ST AID)     20
2                 PERM TOT OR PERM PRTL DISABLTY     18
1                 FATALITY                           11
8                 INJURIES DUE TO NATURAL CAUSES     10
9                 INJURIES INVOLVNG NONEMPLOYEES      3
dtype: int64

**From the table, I chose codes 1, 2, 3, 4 and 5 in DEGREE_INJURY_CD to be Major injuries. The remaining codes (0, 6, 7, 8, 9 and 10) are treated as Minor injuries.**

In [None]:
# Inspect the storage type of the injury-code column before converting it.
code_dtype = df['DEGREE_INJURY_CD'].dtype
print(code_dtype)

object


In [None]:
# Convert DEGREE_INJURY_CD from object (string) to a numeric dtype.
# assign() returns a new DataFrame instead of writing into a slice of `data`,
# which avoids the SettingWithCopyWarning raised by in-place column assignment.
df = df.assign(DEGREE_INJURY_CD=pd.to_numeric(df['DEGREE_INJURY_CD']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DEGREE_INJURY_CD'] = pd.to_numeric(df['DEGREE_INJURY_CD'])


In [None]:

# Add a new column 'injury_level' derived from DEGREE_INJURY_CD:
# codes 1-5 are labelled 'Major', codes 0 and 6-10 are labelled 'Minor'.
major_codes = [1, 2, 3, 4, 5]
minor_codes = [0, 6, 7, 8, 9, 10]
level_by_code = {
    **dict.fromkeys(major_codes, 'Major'),
    **dict.fromkeys(minor_codes, 'Minor'),
}

# map() leaves any code missing from the lookup as NaN, and assign() returns a
# new frame, so nothing is written into a slice (no SettingWithCopyWarning).
df = df.assign(injury_level=df['DEGREE_INJURY_CD'].map(level_by_code))

# Preview the resulting dataframe
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['DEGREE_INJURY_CD'].isin([1, 2, 3, 4, 5]), 'injury_level'] = 'Major'


Unnamed: 0,NARRATIVE,DEGREE_INJURY_CD,DEGREE_INJURY,injury_level
0,Employee was cleaning up at the Primary Crushe...,5,DAYS RESTRICTED ACTIVITY ONLY,Major
1,Handle of sledgehammer broke and head of hamme...,6,"NO DYS AWY FRM WRK,NO RSTR ACT",Minor
2,EMPLOYEE WAS CLIMBING DOWN A LADDER AND WHEN H...,3,DAYS AWAY FROM WORK ONLY,Major
3,HE PULLED A BACK MUSCLE WHILE STACKING BAGS OF...,5,DAYS RESTRICTED ACTIVITY ONLY,Major
4,EE hands began to break out in a rash after he...,5,DAYS RESTRICTED ACTIVITY ONLY,Major


## Refining the dataframe to 2 columns (NARRATIVE and injury_level)

In [None]:
# Keep only the model input (NARRATIVE) and the target label (injury_level).
# .copy() detaches the result from the wider frame so that the later
# replacement of the NARRATIVE column operates on an independent DataFrame.
df = df[['NARRATIVE', 'injury_level']].copy()
df.head()

Unnamed: 0,NARRATIVE,injury_level
0,Employee was cleaning up at the Primary Crushe...,Major
1,Handle of sledgehammer broke and head of hamme...,Minor
2,EMPLOYEE WAS CLIMBING DOWN A LADDER AND WHEN H...,Major
3,HE PULLED A BACK MUSCLE WHILE STACKING BAGS OF...,Major
4,EE hands began to break out in a rash after he...,Major


# Preprocessing unstructured texts

In [None]:
# Importing libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages used in this section.
# Presumably 'punkt' backs word_tokenize, 'stopwords' backs stopwords.words and
# 'wordnet' backs WordNetLemmatizer -- TODO confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('stopwords')
# The return value of this last call is what the cell displays as its output.
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Lazily filled cache so the stop-word set and the lemmatizer are built once
# instead of on every call to text_transform.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def text_transform(text):
    """Normalise one narrative into a space-separated string of lemmas.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric (punctuation etc.) or that are English stop words are
    dropped; the remaining tokens are lemmatized and joined with spaces.

    Parameters
    ----------
    text : str
        Raw narrative text. Any non-string value (for example the NaN that
        pandas uses for an empty cell) is treated as a missing narrative.

    Returns
    -------
    str
        The cleaned text, or an empty string when ``text`` is not a string.
    """
    # A missing narrative has no .lower(); return an empty document instead of crashing.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_text_tools()

    # Tokenize the lower-cased text into words
    tokens = word_tokenize(text.lower())

    # Drop punctuation and stop words, then lemmatize what is left
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    # Convert the list of words back into a single string
    return ' '.join(lemmas)

In [None]:
# Clean every narrative, preserving row order, and collect the results in a list.
list_new_text = [text_transform(narrative) for narrative in df['NARRATIVE']]


In [None]:
# Overwrite the raw narratives with their cleaned versions (same row order).
df['NARRATIVE']=list_new_text

In [None]:
# Preview the frame after the narratives have been replaced by the cleaned text.
df.head()

Unnamed: 0,NARRATIVE,injury_level
0,employee cleaning primary crusher dingo skid s...,Major
1,handle sledgehammer broke head hammer hit empl...,Minor
2,employee climbing ladder stepped ground slippe...,Major
3,pulled back muscle stacking bag material,Major
4,ee hand began break rash handled material coat...,Major


In [None]:
# After preprocessing, the narratives are tidier and more streamlined.

# Splitting the dataset into training, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Carve 20% of the remaining rows out of the training set for validation.
train, validation = train_test_split(train, test_size=0.2, random_state=42)

# Label the rows of each split. assign() returns a new frame, so the frames
# handed back by train_test_split are never modified in place (in-place column
# assignment on them can raise SettingWithCopyWarning).
train = train.assign(split="train")
validation = validation.assign(split="validation")
test = test.assign(split="test")

# Concatenate the labelled splits back into one DataFrame
split_df = pd.concat([train, validation, test])


train         1272
test           398
validation     319
Name: split, dtype: int64

In [None]:
# Double-check the result of the split: number of rows per split label.
split_df['split'].value_counts()

Saving the dataset for the next task

In [None]:
# Write the labelled splits to one CSV file; index=False leaves the row index out of the file.
split_df.to_csv('binary_injury_data.csv', index=False)
