# Preprocess Combined Data
Using the combined QBJ as input, perform data preprocessing including:

1. Expand Contractions, Tokenize, and Convert to Lowercase
2. Remove Punctuation
3. Remove Stop Words
4. Remove Words Starting with a Digit
5. Parts of Speech (POS) Tagging
6. Lemmatize
7. Stemming
8. Create Bag of Words (BOW)
9. Calculate Term Frequency
10. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)
11. Sentencize
     1. Lemmatize Sentences
     1. Stem Sentences


In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
# Readthe combined data into a dataframe
data_file = "./data/stress_urinary_incontinence.csv"

# Read the data into a pandas dataframe
df = pd.read_csv(
    data_file,  # The data file being read, from the variable assignment above
    on_bad_lines="warn",  # This tells Pandas to only warn on bad lines vs causing an error
    dtype="str",
)  # This tells Pandas to treat all numbers as words

df.fillna("", inplace=True)

In [3]:
df.shape

(9605, 37)

In [4]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,MDR_REPORT_KEY,MDR_TEXT_KEY,TEXT_TYPE_CODE,PATIENT_SEQUENCE_NUMBER,DATE_REPORT,FOI_TEXT,DEVICE_EVENT_KEY,IMPLANT_FLAG,DATE_REMOVED_FLAG,...,LOT_NUMBER,OTHER_ID_NUMBER,DEVICE_AVAILABILITY,DATE_RETURNED_TO_MANUFACTURER,DEVICE_REPORT_PRODUCT_CODE,DEVICE_AGE_TEXT,DEVICE_EVALUATED_BY_MANUFACTUR,COMBINATION_PRODUCT_FLAG,UDI-DI,UDI-PUBLIC
0,106741,6383024,106903842,N,1,,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,,,,...,,,N,,OTN,DA,N,N,,
1,106742,6383024,106903843,D,1,,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,,,,...,,,N,,OTN,DA,N,N,,


## Assign a Row ID for Verification
Assign a value to a variable that identifies a row from the dataset.  

This will allow the same row to be used for verification of each preprocessing step.

In [7]:
verification_row = 9

## Load the Natural Language Toolkit (NLTK) and Preprocessing Libraries

In [5]:
# Import the NLTK library
import nltk  # If this step fails, rerun 07-Install-NLTK.ipynb
import string
import contractions

## 1. Expand Contractions, Tokenize, and Convert to Lowercase

In [6]:
# This approach takes the FOI_TEXT as a string and creates a new column with tokens
# It removes contractions _and_ tokenizes at the same time
# No additional function is needed, x.split tokenizes the string (FOI text) at every space
# A call to lower() converts the word to lowercase

df["TOKENIZED_TEXT"] = df["FOI_TEXT"].apply(
    lambda x: [contractions.fix(word).lower() for word in x.split()]
)
df["TOKENIZED_TEXT"].head()

0    [based, on, additional, information, received,...
1    [based, on, additional, information, received,...
2    [if, information, is, provided, in, the, futur...
3    [manufacturer, reference, number:, (b)(4)., in...
4    [the, patient's, attorney, alleged, a, deficie...
Name: TOKENIZED_TEXT, dtype: object

In [8]:
df["TOKENIZED_TEXT"][verification_row]

['manufacturer',
 'reference',
 'number:',
 '(b)(4).',
 'incident',
 'date',
 'was',
 'not',
 'provided.',
 'lot',
 'number',
 'not',
 'provided.',
 'udi',
 'not',
 'provided',
 '.',
 're-processing',
 'information',
 'not',
 'provided.',
 'since',
 'the',
 'lot',
 'number',
 'was',
 'not',
 'provided,',
 'this',
 'information',
 'cannot',
 'be',
 'determined.']

## 2. Remove Punctuation

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Define a function to remove punctuation in the data
def remove_punctuation(text):
    text = "".join(
        [character for character in text if character not in string.punctuation]
    )
    return text


df["NOPUNCT_TEXT"] = df["TOKENIZED_TEXT"].apply(
    lambda x: [remove_punctuation(word) for word in x]
)
df["NOPUNCT_TEXT"].head()

0    [based, on, additional, information, received,...
1    [based, on, additional, information, received,...
2    [if, information, is, provided, in, the, futur...
3    [manufacturer, reference, number, b4, incident...
4    [the, patients, attorney, alleged, a, deficien...
Name: NOPUNCT_TEXT, dtype: object

In [11]:
df["NOPUNCT_TEXT"][verification_row]

['manufacturer',
 'reference',
 'number',
 'b4',
 'incident',
 'date',
 'was',
 'not',
 'provided',
 'lot',
 'number',
 'not',
 'provided',
 'udi',
 'not',
 'provided',
 '',
 'reprocessing',
 'information',
 'not',
 'provided',
 'since',
 'the',
 'lot',
 'number',
 'was',
 'not',
 'provided',
 'this',
 'information',
 'cannot',
 'be',
 'determined']

## 3. Remove Stop Words

In [12]:
stopwords = nltk.corpus.stopwords.words("english")


# Define a function to convert to lowercase and remove stopwords
def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word.lower() not in stopwords]
    return text


df["NOSTOPWORDS_TEXT"] = df["NOPUNCT_TEXT"].apply(lambda x: remove_stopwords(x))
df["NOSTOPWORDS_TEXT"].head()

0    [based, additional, information, received, com...
1    [based, additional, information, received, com...
2    [information, provided, future, supplemental, ...
3    [manufacturer, reference, number, b4, incident...
4    [patients, attorney, alleged, deficiency, devi...
Name: NOSTOPWORDS_TEXT, dtype: object

In [13]:
df["NOSTOPWORDS_TEXT"][verification_row]

['manufacturer',
 'reference',
 'number',
 'b4',
 'incident',
 'date',
 'provided',
 'lot',
 'number',
 'provided',
 'udi',
 'provided',
 '',
 'reprocessing',
 'information',
 'provided',
 'since',
 'lot',
 'number',
 'provided',
 'information',
 'cannot',
 'determined']

## 4. Remove Words Starting with a Digit

In [14]:
import re

# create a regular expression pattern to match words that start with numbers
pattern = re.compile(r"^\d+")


# Define a function to convert to lowercase and remove stopwords
def remove_stopwords(tokens):
    text = [word for word in tokens if not pattern.match(word)]
    return text


df["NODIGITS_TEXT"] = df["NOSTOPWORDS_TEXT"].apply(lambda x: remove_stopwords(x))
df["NODIGITS_TEXT"].head()

0    [based, additional, information, received, com...
1    [based, additional, information, received, com...
2    [information, provided, future, supplemental, ...
3    [manufacturer, reference, number, b4, incident...
4    [patients, attorney, alleged, deficiency, devi...
Name: NODIGITS_TEXT, dtype: object

In [15]:
df["NODIGITS_TEXT"][verification_row]

['manufacturer',
 'reference',
 'number',
 'b4',
 'incident',
 'date',
 'provided',
 'lot',
 'number',
 'provided',
 'udi',
 'provided',
 '',
 'reprocessing',
 'information',
 'provided',
 'since',
 'lot',
 'number',
 'provided',
 'information',
 'cannot',
 'determined']

## X. Word Frequency Table

In [16]:
# Explode the lists into separate rows
exploded_df = df.explode("NODIGITS_TEXT")
word_freq = exploded_df["NODIGITS_TEXT"].value_counts()

# Create a DataFrame from the word frequency data
freq_df = pd.DataFrame({"Word": word_freq.index, "Frequency": word_freq.values})
freq_df

Unnamed: 0,Word,Frequency
0,patient,30407
1,pain,19251
2,b6,17037
3,device,16064
4,reported,12668
...,...,...
13330,payers,1
13331,submucosally,1
13332,maneuvered,1
13333,deviating,1


## 5. Parts of Speech (POS) Tagging

In [17]:
# Apply the nltk.pos_tag() function to each row of the TOKENIZED_TEXT column
# pos_tag returns a Tuple for each word consisting of the word and its classification
# TODO: List classifications and their abbreviations
df["POS_TEXT"] = df["NODIGITS_TEXT"].apply(nltk.pos_tag)
df["POS_TEXT"].head()

0    [(based, VBN), (additional, JJ), (information,...
1    [(based, VBN), (additional, JJ), (information,...
2    [(information, NN), (provided, VBD), (future, ...
3    [(manufacturer, NN), (reference, NN), (number,...
4    [(patients, NNS), (attorney, NN), (alleged, VB...
Name: POS_TEXT, dtype: object

In [19]:
df["POS_TEXT"][10]

[('patients', 'NNS'),
 ('attorney', 'NN'),
 ('alleged', 'VBN'),
 ('deficiency', 'NN'),
 ('device', 'NN'),
 ('resulting', 'VBG'),
 ('unspecified', 'JJ'),
 ('adverse', 'JJ'),
 ('outcome', 'NN'),
 ('product', 'NN'),
 ('used', 'VBN'),
 ('therapeutic', 'JJ'),
 ('treatment', 'NN'),
 ('preoperative', 'JJ'),
 ('postoperative', 'JJ'),
 ('diagnosis', 'NN'),
 ('stress', 'NN'),
 ('urinary', 'JJ'),
 ('incontinence', 'NN'),
 ('procedure', 'NN'),
 ('performed', 'VBD'),
 ('transvaginal', 'JJ'),
 ('sling', 'NN'),
 ('placement', 'NN'),
 ('patient', 'NN'),
 ('returned', 'VBD'),
 ('office', 'NN'),
 ('visit', 'NN'),
 ('b6', 'NN'),
 ('patient', 'NN'),
 ('complained', 'VBD'),
 ('incontinence', 'NN'),
 ('stream', 'NN'),
 ('started', 'VBD'),
 ('strong', 'JJ'),
 ('began', 'VBD'),
 ('weaken', 'JJ'),
 ('stated', 'VBN'),
 ('would', 'MD'),
 ('stop', 'VB'),
 ('start', 'NN'),
 ('stood', 'VBD'),
 ('voiding', 'VBG'),
 ('began', 'VBD'),
 ('leak', 'JJ'),
 ('immediately', 'RB'),
 ('constantly', 'RB'),
 ('wear', 'JJ'),
 ('

## 6. Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer


# define a function to lemmatize each word in a text list based on its POS tag
def lemmatize_text(pos_tagged_text):
    # initialize WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # map NLTK's POS tags to WordNet's POS tags
    # TODO: list the abbreviations for WordNet's parts of speech
    pos_map = {"N": "n", "V": "v", "R": "r", "J": "a"}

    # lemmatize each word in the text list based on its POS tag
    lemmatized_text = []

    for word, pos in pos_tagged_text:
        # get the first character of the POS tag to use as the WordNet POS tag
        #
        # Set the WordNetLemmatizer default to Nouns ('n') or Verbs ('v')
        #
        wn_pos = pos_map.get(pos[0], "n")

        # lemmatize the word and append it to the lemmatized text list
        lemmatized_word = lemmatizer.lemmatize(word, pos=wn_pos)
        lemmatized_text.append(lemmatized_word)

    # return the lemmatized text list
    return lemmatized_text


# apply the lemmatize_text function to each row of the dataframe
df["LEMMATIZED_TEXT"] = df["POS_TEXT"].apply(lemmatize_text)
df["LEMMATIZED_TEXT"].head()

0    [base, additional, information, receive, compl...
1    [base, additional, information, receive, compl...
2    [information, provide, future, supplemental, r...
3    [manufacturer, reference, number, b4, incident...
4    [patient, attorney, allege, deficiency, device...
Name: LEMMATIZED_TEXT, dtype: object

## 7. Stemming

In [23]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


# define a function to stem each word in a text list
def stem_words(pos_tagged_text):
    stemmer = PorterStemmer()

    stemmed_text = []

    for word, pos in pos_tagged_text:
        # stem the word and append it to the stemmed text list
        stemmed_word = stemmer.stem(word)
        stemmed_text.append(stemmed_word)

    # return the stemmed text list
    return stemmed_text


df["STEMMED_TEXT"] = df["POS_TEXT"].apply(stem_words)
df["STEMMED_TEXT"].head()

0    [base, addit, inform, receiv, complaint, medtr...
1    [base, addit, inform, receiv, complaint, medtr...
2    [inform, provid, futur, supplement, report, issu]
3    [manufactur, refer, number, b4, incid, date, p...
4    [patient, attorney, alleg, defici, devic, resu...
Name: STEMMED_TEXT, dtype: object

## Compare the results of lemmatization and stemming

In [24]:
compare_lemma_stem_df = pd.DataFrame(
    {
        "WORD, PART OF SPEECH": df["POS_TEXT"][verification_row],
        "LEMMA": df["LEMMATIZED_TEXT"][verification_row],
        "STEM": df["STEMMED_TEXT"][verification_row],
    }
)

compare_lemma_stem_df = compare_lemma_stem_df.style.set_properties(
    **{"text-align": "left"}
)
compare_lemma_stem_df = compare_lemma_stem_df.set_table_styles(
    [dict(selector="th", props=[("text-align", "left")])]
)
compare_lemma_stem_df

Unnamed: 0,"WORD, PART OF SPEECH",LEMMA,STEM
0,"('manufacturer', 'NN')",manufacturer,manufactur
1,"('reference', 'NN')",reference,refer
2,"('number', 'NN')",number,number
3,"('b4', 'JJ')",b4,b4
4,"('incident', 'JJ')",incident,incid
5,"('date', 'NN')",date,date
6,"('provided', 'VBD')",provide,provid
7,"('lot', 'NN')",lot,lot
8,"('number', 'NN')",number,number
9,"('provided', 'VBD')",provide,provid


## 8. Create Bag of Words (BOW)

In [25]:
## 7. Create Bag of Words (BOW)
from sklearn.feature_extraction.text import CountVectorizer

# create a CountVectorizer object
count_vectorizer = CountVectorizer()

# fit the vectorizer to the text data
count_vectorizer.fit(df["LEMMATIZED_TEXT"].apply(lambda x: " ".join(x)))

# create a bag of words matrix
bow_matrix = count_vectorizer.transform(
    df["LEMMATIZED_TEXT"].apply(lambda x: " ".join(x))
)

# convert the bag of words matrix to a DataFrame
bow_df = pd.DataFrame(
    bow_matrix.toarray(), columns=count_vectorizer.get_feature_names_out()
)

In [26]:
bow_df.shape

(9605, 11671)

In [25]:
bow_df.head()
# TODO: Plot the BOW results (?)

Unnamed: 0,abbott,abdomen,abdominal,aberration,able,accessory,accuracy,accurate,acetaminophen,actually,...,work,would,x2,xray,year,yellow,yes,yet,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 9. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)

In [27]:
## 8. Calculate Term Frequency-Inverse Document Frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# create a CountVectorizer object and fit it to the text data
tfidf_vectorizer = TfidfVectorizer()

X = tfidf_vectorizer.fit_transform(df["LEMMATIZED_TEXT"].apply(lambda x: " ".join(x)))

# convert the sparse matrix to a DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [28]:
tfidf_df.shape

(9605, 11671)

In [29]:
tfidf_df.head(2)

Unnamed: 0,a020503,a04,a040609,a0414,a050,a0501,a1,a1502,a150201,a150205,...,zoloftnow,zone,zoref,zoster,zosyn,zsi,zuban,zyprexa,zyrtec,zyson
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 10. Sentencize
The `FOI_TEXT` can be processed as sentences.

For further analysis, each sentence needs to be associated with the `FOI_TEXT` row that it came from.

[This discussion from Stack Overflow](https://stackoverflow.com/a/43922444/2308522) provides a suggestion for breaking the code into a dataframe of sentences with each sentence retaining the ID of the row where it was originally located.

[This page from the Pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.itertuples.html) provides details on using the `itertuples()` function to process the rows of the dataframe.

In [45]:
for row in df.itertuples():
    print(row[7])
    break

BASED ON ADDITIONAL INFORMATION RECEIVED THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT. IF INFORMATION IS PROVIDED IN THE FUTURE, A SUPPLEMENTAL REPORT WILL BE ISSUED.


In [46]:
sentences = []

# Using itertuples(), the columns must be addressed using thier position.
# Here's a map of position to name:
# row[1]: ROW_ID
# row[2]: FOI_TEXT
# row[3]: DEVICE_PROBLEM_CODE
# row[4]: DEVICE_PROBLEM_TEXT
for row in df.itertuples():
    for sentence in row[7].split("."):
        if sentence != "":
            sentences.append([row[1], row[3], row[4], sentence])

sentences_df = pd.DataFrame(
    sentences,
    columns=[
        "ROW_ID",
        "DEVICE_PROBLEM_CODE",
        "DEVICE_PROBLEM_TEXT",
        "SENTENCIZED_FOI_TEXT",
    ],
)

compare_lemma_stem_df

Unnamed: 0,"WORD, PART OF SPEECH",LEMMA,STEM
0,"('manufacturer', 'NN')",manufacturer,manufactur
1,"('reference', 'NN')",reference,refer
2,"('number', 'NN')",number,number
3,"('b4', 'JJ')",b4,b4
4,"('incident', 'JJ')",incident,incid
5,"('date', 'NN')",date,date
6,"('provided', 'VBD')",provide,provid
7,"('lot', 'NN')",lot,lot
8,"('number', 'NN')",number,number
9,"('provided', 'VBD')",provide,provid


In [47]:
sentences_df.shape

(95712, 4)

In [48]:
sentences_df.head(3)

Unnamed: 0,ROW_ID,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,SENTENCIZED_FOI_TEXT
0,106741,106903842,N,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
1,106741,106903842,N,"IF INFORMATION IS PROVIDED IN THE FUTURE, A S..."
2,106742,106903843,D,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...


In [49]:
sentences_df["SENTENCIZED_FOI_TEXT"][0]

'BASED ON ADDITIONAL INFORMATION RECEIVED THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT'

In [50]:
# Expand Contractions, Tokenize, and Convert to Lowercase
sentences_df["TOKENIZED_SENTENCES"] = sentences_df["SENTENCIZED_FOI_TEXT"].apply(
    lambda x: [contractions.fix(word).lower() for word in x.split()]
)

In [51]:
sentences_df["TOKENIZED_SENTENCES"][0]

['based',
 'on',
 'additional',
 'information',
 'received',
 'this',
 'complaint',
 'is',
 'not',
 'a',
 'medtronic',
 'product']

In [52]:
# Remove punctuation
sentences_df["NOPUNCT_SENTENCES"] = sentences_df["TOKENIZED_SENTENCES"].apply(
    lambda x: [remove_punctuation(word) for word in x]
)
sentences_df["NOPUNCT_SENTENCES"][0]

['based',
 'on',
 'additional',
 'information',
 'received',
 'this',
 'complaint',
 'is',
 'not',
 'a',
 'medtronic',
 'product']

In [53]:
# Remove stop words
sentences_df["NOSTOPWORDS_SENTENCES"] = sentences_df["NOPUNCT_SENTENCES"].apply(
    lambda x: remove_stopwords(x)
)
sentences_df["NOSTOPWORDS_SENTENCES"][0]

['based',
 'on',
 'additional',
 'information',
 'received',
 'this',
 'complaint',
 'is',
 'not',
 'a',
 'medtronic',
 'product']

In [54]:
# Apply POS Tagging
sentences_df["POS_SENTENCES"] = sentences_df["NOSTOPWORDS_SENTENCES"].apply(
    nltk.pos_tag
)
sentences_df["POS_SENTENCES"][0]

[('based', 'VBN'),
 ('on', 'IN'),
 ('additional', 'JJ'),
 ('information', 'NN'),
 ('received', 'VBD'),
 ('this', 'DT'),
 ('complaint', 'NN'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('a', 'DT'),
 ('medtronic', 'JJ'),
 ('product', 'NN')]

In [55]:
# Define a function to join tokens that have been lemmatized and stemmed
def join_tokenized_sentence(tokens):
    joined_words = []

    for word in tokens:
        joined_words.append(word)

    # Join the stemmed words back into a sentence
    return " ".join(joined_words)

### 10.A Lemmatize Sentences

In [56]:
sentences_df["TOKEN_LEMMATIZED_SENTENCES"] = sentences_df["POS_SENTENCES"].apply(
    lemmatize_text
)
sentences_df["TOKEN_LEMMATIZED_SENTENCES"][0]

['base',
 'on',
 'additional',
 'information',
 'receive',
 'this',
 'complaint',
 'be',
 'not',
 'a',
 'medtronic',
 'product']

In [57]:
sentences_df["LEMMATIZED_SENTENCES"] = sentences_df["TOKEN_LEMMATIZED_SENTENCES"].apply(
    join_tokenized_sentence
)
sentences_df["LEMMATIZED_SENTENCES"][0]

'base on additional information receive this complaint be not a medtronic product'

### 10.B Stem Sentences

In [58]:
# create a new column called 'STEMMED_SENTENCES'
sentences_df["TOKEN_STEMMED_SENTENCES"] = sentences_df["POS_SENTENCES"].apply(
    stem_words
)
sentences_df["TOKEN_STEMMED_SENTENCES"][0]

['base',
 'on',
 'addit',
 'inform',
 'receiv',
 'thi',
 'complaint',
 'is',
 'not',
 'a',
 'medtron',
 'product']

In [59]:
sentences_df["STEMMED_SENTENCES"] = sentences_df["TOKEN_STEMMED_SENTENCES"].apply(
    join_tokenized_sentence
)
sentences_df["STEMMED_SENTENCES"][0]

'base on addit inform receiv thi complaint is not a medtron product'

## Review the preprocessed data

In [60]:
# Create a new dataframe with just one row containing the column names
column_names_df = pd.DataFrame(
    {
        "DF COLUMN NAMES": df.columns,
    }
)

example = []

for col in df.columns:
    example.append(df[col][0])

column_names_df["EXAMPLE"] = example
column_names_df

Unnamed: 0,DF COLUMN NAMES,EXAMPLE
0,Unnamed: 0,106741
1,MDR_REPORT_KEY,6383024
2,MDR_TEXT_KEY,106903842
3,TEXT_TYPE_CODE,N
4,PATIENT_SEQUENCE_NUMBER,1
5,DATE_REPORT,
6,FOI_TEXT,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
7,DEVICE_EVENT_KEY,
8,IMPLANT_FLAG,
9,DATE_REMOVED_FLAG,


In [61]:
# Create a new dataframe with just one row containing the column names
column_names_df = pd.DataFrame(
    {
        "SENTENCES DF COLUMN NAMES": sentences_df.columns,
    }
)

example = []

for col in sentences_df.columns:
    example.append(sentences_df[col][0])

column_names_df["EXAMPLE"] = example
column_names_df

Unnamed: 0,SENTENCES DF COLUMN NAMES,EXAMPLE
0,ROW_ID,106741
1,DEVICE_PROBLEM_CODE,106903842
2,DEVICE_PROBLEM_TEXT,N
3,SENTENCIZED_FOI_TEXT,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
4,TOKENIZED_SENTENCES,"[based, on, additional, information, received,..."
5,NOPUNCT_SENTENCES,"[based, on, additional, information, received,..."
6,NOSTOPWORDS_SENTENCES,"[based, on, additional, information, received,..."
7,POS_SENTENCES,"[(based, VBN), (on, IN), (additional, JJ), (in..."
8,TOKEN_LEMMATIZED_SENTENCES,"[base, on, additional, information, receive, t..."
9,LEMMATIZED_SENTENCES,base on additional information receive this co...


## Save the preproecssed data

In [62]:
df.to_csv(f"./data/preprocessed_data.csv", index=False)

bow_df.to_csv(f"./data/bag_of_words_data.csv", index=False)

tfidf_df.to_csv(f"./data/tfidf_data.csv", index=False)

sentences_df.to_csv(f"./data/sentences_data.csv", index=False)

## Upload All Output to an S3 Bucket

In [46]:
import os
import subprocess

# Create the upload command using the AWS command line interface
command = [
    "aws",
    "s3",
    "sync",
    working_directory,
    f"s3://praxis-2023-html-output/",
    "--exclude",
    f"*/.ipynb_checkpoints/*",
    "--no-progress",
]

# Run the command and wait for it to complete
output = subprocess.run(command, capture_output=True, text=True)

# Print the command's output
print(output.stdout)

upload: 21-Preprocess-Combined-Data-v2/preprocessed_data.csv to s3://praxis-2023-html-output/preprocessed_data.csv
upload: 21-Preprocess-Combined-Data-v2/dataframe.pickle to s3://praxis-2023-html-output/dataframe.pickle
upload: 21-Preprocess-Combined-Data-v2/sentences_data.csv to s3://praxis-2023-html-output/sentences_data.csv
upload: 21-Preprocess-Combined-Data-v2/tfidf_data.csv to s3://praxis-2023-html-output/tfidf_data.csv
upload: 21-Preprocess-Combined-Data-v2/bag_of_words_data.csv to s3://praxis-2023-html-output/bag_of_words_data.csv



In [48]:
import pickle

# Assume `df` is the dataframe you want to save
with open(f"{working_directory}/dataframe.pickle", "wb") as f:
    pickle.dump(df, f)

In [49]:
print("fin")

fin
