In [None]:
# Optional one-time setup, intentionally left disabled: uncomment to install
# `langdetect` with pip.
# NOTE(review): no cell shown in this notebook imports langdetect — confirm it is still needed.
# !python3 -m pip install langdetect


# Feature Engineering of corpus

The following resources were helpful in devising a strategy for feature engineering:

- [Text Analysis & Feature Engineering with NLP](https://towardsdatascience.com/text-analysis-feature-engineering-with-nlp-502d6ea9225d)
- [Topic Modelling and Latent Dirichlet Allocation \(LDA\) in Python](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

## Importing modules & defining globals


In [2]:
import os
import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

# `lda_utils` is imported from the parent directory, so step into it just long
# enough to run the import. The try/finally guarantees the working directory is
# restored even when the import fails; otherwise the kernel would stay in "../"
# and every relative path used afterwards (e.g. DATA_PATH) would resolve wrongly.
cwd = os.getcwd()
os.chdir("../")
try:
    from lda_utils import preprocess
finally:
    os.chdir(cwd)

# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(42)

# Folder (relative to this notebook) that holds the source spreadsheet.
DATA_PATH = "../data/raw"
# Union of the gensim stop-word set and NLTK's English stop-word list.
STOPWORDS = STOPWORDS | frozenset(stopwords.words("english"))


## Loading dataset


In [2]:
# Workbook and worksheet that hold the PubMed sample.
io = "Pubmed5k.xlsx"
sheet_name = "random 5k"

# The first spreadsheet column becomes the index of the frame.
excel_path = os.path.join(DATA_PATH, io)
df = pd.read_excel(excel_path, index_col=[0], sheet_name=sheet_name)
df.head()


Unnamed: 0_level_0,Title,Abstract
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1
34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...


## Exploring dataset


In [41]:
# Lower-cased abstracts, so the placeholder search below is case-insensitive.
df_lower = df["Abstract"].str.lower()

# Some records carry a "no abstract" placeholder instead of real text.
# The match is a literal substring test; na=True flags missing abstracts as well.
idx_no_abs = df_lower.str.contains("no abstract", regex=False, na=True)

# Keep only the records that were not flagged.
df_processed = df.loc[~idx_no_abs]


In [42]:
# Character length of every title and abstract, computed element by element.
df_len = pd.DataFrame(
    {column: df_processed[column].map(len) for column in ("Title", "Abstract")}
)
df_len.describe()


Unnamed: 0,Title,Abstract
count,4994.0,4994.0
mean,113.738486,1471.231077
std,36.705224,551.903126
min,13.0,1.0
25%,89.0,1116.0
50%,111.0,1450.0
75%,135.0,1773.0
max,274.0,6635.0


Some of the document-length statistics look implausible,
e.g. the minimum abstract length is 1 character: min(Abstract) = 1


In [43]:
# How many records share each abstract length, ordered by length; show the ten shortest.
abstract_len_counts = df_len["Abstract"].value_counts().sort_index()
abstract_len_counts.head(10)


1      1
4      1
16     1
19     6
43     1
104    1
112    1
114    1
125    1
129    1
Name: Abstract, dtype: int64

In [44]:
# Show the text of every abstract that is at most 104 characters long.
is_short_abstract = df_len["Abstract"] <= 104
df_processed.loc[is_short_abstract, "Abstract"]


ArticleID
34332620          ClinicalTrials.gov Identifier: NCT03910062.
34506474                                                 N/A.
34669439                                  [Figure: see text].
34669440                                  [Figure: see text].
34669441                                  [Figure: see text].
34669442                                  [Figure: see text].
34669443                                  [Figure: see text].
34669444                                  [Figure: see text].
34156932    At the height of laughter, the universe is flu...
34328127                                     Reply to letter.
34328134                                                    .
Name: Abstract, dtype: object

In [46]:
# Full title(s) of the record(s) whose abstract is exactly 43 characters long.
is_43_chars = df_len["Abstract"] == 43
df_processed.loc[is_43_chars, "Title"].values


array(['Assessment of bedside lower limb angiography combined with continuous NIRS monitoring for the detection of lower limb complications of VA-ECMO: an observational monocentric study.'],
      dtype=object)

In [50]:
# How many records share each title length, ordered by length; show the ten shortest.
title_len_counts = df_len["Title"].value_counts().sort_index()
title_len_counts.head(10)


13    2
22    1
24    2
25    1
26    3
27    1
28    2
29    2
30    1
31    3
Name: Title, dtype: int64

In [51]:
# Show every title that is at most 31 characters long.
is_short_title = df_len["Title"] <= 31
df_processed.loc[is_short_title, "Title"]


ArticleID
34223866           Mortality from COVID-19.
34338452    Lyme disease in Poland in 2018.
34477584           The Pain of Infertility.
34477598       The Most Important Question.
34614021         Dreaming of the sleep lab.
34650095                      Wilms tumour.
34728054          The future ocean we want.
34156932    Innovation Is Tied to Optimism.
34177893        Inflammasomes and Fibrosis.
34207126      TMB in NSCLC: A Broken Dream?
34207152      Viral Bad News Sent by EVAIL.
34251623         Biological Network Mining.
34258744     Peptide-Based Antiviral Drugs.
34258745         Covalent Antiviral Agents.
34282564    Neuroinflammation and Tinnitus.
34334469                      Perseverance.
34399409       Fluorescence Guided Surgery.
34444466             Hypothermia in Trauma.
Name: Title, dtype: object

In [52]:
# Abstracts up to this many characters are treated as having no usable text.
placeholder_max_len = 43

# Gather the "no abstract" records together with the very short abstracts,
# then blank the Abstract column for all of them.
no_abstract_rows = df[idx_no_abs]
placeholder_rows = df_processed[df_len["Abstract"] <= placeholder_max_len]
df_altered = pd.concat([no_abstract_rows, placeholder_rows]).assign(Abstract="")
df_altered.shape


(15, 2)

In [53]:
# Preview the first five records whose abstract was blanked.
df_altered.head(n=5)


Unnamed: 0_level_0,Title,Abstract
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1
34258890,Closing gaps in the care of patients with hear...,
34258891,Too much of a good thing in ischemic mitral: l...,
34258892,COVID-19 infection and cardiometabolic complic...,
34258893,Comments on Cardiovascular effects of waterpip...,
34258894,A case of COVID-19 infection quickly relieved ...,


In [57]:
# Rebuild the working frame from the full dataset: remove the records that were
# altered, then append their blanked versions (they end up at the bottom).
df_untouched = df.drop(index=df_altered.index)
df_processed = pd.concat([df_untouched, df_altered])
df_processed.shape


(4999, 2)

## Preprocessing

> Note: the approaches here are for a unigram model.

After exploring the dataset and slightly tweaking some features, the two columns are joined into a single feature (one document per record) so that the preprocessing can operate on the whole text.


In [58]:
# One document per record: the title, a single space, then the abstract.
titles = df_processed["Title"]
abstracts = df_processed["Abstract"]
corpus = titles + " " + abstracts
corpus.name = "document"
corpus.head()


ArticleID
34153941    Stable Coordination Variability in Overground ...
34153942    Weak Hip Strength Increases Dynamic Knee Valgu...
34153964    Current and Future Projections of Amyotrophic ...
34153968    Disparities between Asian and Non-Asian Thromb...
34153978    Maternal Factors Predicting Loss to Follow-Up ...
Name: document, dtype: object

The preprocessing steps are bundled into a single helper function, `preprocess`, imported from `lda_utils`.

For each document:

1. Tokenising the document.
2. Lowercasing the tokens.
3. Lemmatising the tokens.
4. Dropping stop words.


- Calling the preprocessing subroutine on the dataset


In [63]:
# Part-of-speech tags forwarded to `preprocess`.
# NOTE(review): presumably WordNet tags used for lemmatisation — confirm in lda_utils.
pos = ["a", "n", "r", "s", "v"]

# Run the preprocessing helper on every document of the corpus.
corpus_processed = corpus.map(
    lambda document: preprocess(document, stopwords=STOPWORDS, pos=pos)
)
corpus_processed.head()


ArticleID
34153941    [stable, coordination, variability, overground...
34153942    [weak, hip, strength, increase, dynamic, knee,...
34153964    [current, future, projection, amyotrophic, lat...
34153968    [disparity, asian, non, asian, thrombolyzed, a...
34153978    [maternal, factor, predict, loss, follow, newb...
Name: document, dtype: object

## Saving the final feature engineered dataset


In [64]:
# Write the processed corpus to a CSV file inside DATA_PATH.
output_path = os.path.join(DATA_PATH, "corpus_clean.csv")
corpus_processed.to_csv(output_path)
