In [24]:
# import libraries
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

nltk.download('stopwords')
from sklearn.preprocessing import LabelEncoder


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\redinger\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Data Cleaning:
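The exports mark which articles were selected, but the irrelevant (unselected) articles are not marked, so they are derived below by comparing the selected lists against the full lists.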

In [3]:
# The text exports have no header row, so the column names are supplied here
col_names = ["Type", "Author", "Year", "Title", "Journal Name", "Volume", "Issue", "Pages", "URL", "Keywords", "Abstract", "DOI", "PDF Name"]

# Parsing options shared by all four tab-separated exports (every column is read as text)
read_opts = dict(sep = '\t', header = None, dtype = str, names = col_names, quotechar = '"')

# Load each export into its own dataframe
silver_yes = pd.read_csv('Hypopthalmichthys_molitrix_yes.txt', **read_opts)
silver_all = pd.read_csv('Hypopthalmichthys_molitrix_all.txt', **read_opts)
bighead_yes = pd.read_csv('Hypopthalmichthys_nobilis_yes.txt', **read_opts)
bighead_all = pd.read_csv('Hypopthalmichthys_nobilis_all.txt', **read_opts)

# Number of rows in each export, one per line
for frame in (silver_yes, silver_all, bighead_yes, bighead_all):
    print(len(frame))


123
444
110
197


In [4]:
# Stack the selected articles of both species into one dataframe
selected_frames = [silver_yes, bighead_yes]
carp_yes = pd.concat(selected_frames, ignore_index = True)

# Stack all four exports - duplicates are removed in the next step to get the irrelevant articles
every_frame = [silver_yes, silver_all, bighead_yes, bighead_all]
carp_concat = pd.concat(every_frame, ignore_index = True)

# Row counts of the two stacked dataframes
for frame in (carp_yes, carp_concat):
    print(len(frame))


233
874


In [5]:
# Unselected articles = every distinct article in carp_concat that is not a selected one.
# drop_duplicates(keep = False) on the stacked frames is not used for this because it
# throws away *every* repeated row: an unselected article listed under both species
# would vanish, and a selected article missing from the "all" exports would be kept.
carp_unique = carp_concat.drop_duplicates(ignore_index = True)
key_cols = list(carp_unique.columns)

# Distinct selected rows (de-duplicated so the join below cannot multiply rows)
carp_selected = pd.concat([silver_yes, bighead_yes], ignore_index = True).drop_duplicates()

# Anti-join: keep the rows of carp_unique that have no identical row in carp_selected.
# pandas matches missing values in the key columns against each other, so blank fields
# compare as equal here just as they do in drop_duplicates.
carp_flagged = carp_unique.merge(carp_selected, how = 'left', on = key_cols, indicator = True)
carp_no = carp_flagged.loc[carp_flagged['_merge'] == 'left_only', key_cols].reset_index(drop = True)

print(len(carp_no))


408


In [6]:
# Columns kept for the rest of the analysis
columns = ["Author", "Year", "Title", "Journal Name", "Volume", "Issue", "Pages", "Abstract"]

# Restrict both dataframes to those columns
carp_yes, carp_no = carp_yes[columns], carp_no[columns]


In [7]:
# Label each dataframe: 'yes' for selected articles, 'no' for the unselected ones
carp_yes = carp_yes.assign(categories = 'yes')
carp_no = carp_no.assign(categories = 'no')


In [8]:
# Stack the labelled selected and unselected articles into a single dataframe
labelled_frames = [carp_yes, carp_no]
carp_all = pd.concat(labelled_frames, ignore_index = True)

print(len(carp_all))

641


In [9]:
# Write the labelled articles to a csv file, leaving the row index out of the file
carp_all.to_csv(path_or_buf = 'hypopthalmichthys_selected_articles.csv', index = False)


#### Data Prep

In [48]:
# Read the exported articles back in, with every column loaded as text
carp_file = pd.read_csv(filepath_or_buffer = 'hypopthalmichthys_selected_articles.csv', dtype = str)

print(len(carp_file))


641


In [49]:
# Keep only the rows that have an abstract
carp_file = carp_file.dropna(subset = ['Abstract'])

print(len(carp_file))


641


In [50]:
# Encode the category labels as integers in a new 'encoding' column
label_encoder = LabelEncoder()
label_encoder.fit(carp_file['categories'])
carp_file['encoding'] = label_encoder.transform(carp_file['categories'])
carp_file.head()


Unnamed: 0,Author,Year,Title,Journal Name,Volume,Issue,Pages,Abstract,categories,encoding
0,"Aldridge, C. A., and E. C. Boone",2022,Simple models to quickly estimate the probable...,River Research and Applications,38,6.0,1154-1166,Species distribution models provide biologists...,yes,1
1,"Banan, A., A. Nasiri, and A. Taheri-Garavand",2020,Deep learning-based appearance features extrac...,Aquacultural Engineering,89,,,Fish species identification is vital for aquac...,yes,1
2,"Barnes, M. A., W. L. Chadderton, C. L. Jerde, ...",2021,Environmental conditions influence edna partic...,Environmental DNA,3,3.0,643-653,Knowledge about the size of environmental DNA ...,yes,1
3,"Behera, B. K., A. K. Bera, P. Paria, A. Das, P...",2018,Identification and pathogenicity of plesiomona...,Aquaculture,493,,314-318,Plesiomonas shigelloides was isolated from dis...,yes,1
4,"Borland, L. K., C. J. Mulcahy, B. A. Bennie, D...",2020,Using markov chains to quantitatively assess m...,Natural Resource Modeling,33,4.0,,Natural resource managers use barriers to dete...,yes,1


#### Text Processing

In [51]:
# Removal of HTML tags and whitespace
# Work on a copy: plain assignment would only give the same DataFrame a second
# name, so the column edits made on carp_file1 would also overwrite carp_file.
carp_file1 = carp_file.copy()

# Function to remove HTML tags
def basic_clean(text):
    """Strip HTML/XML tags and leading/trailing whitespace from a string.

    Only spans shaped like a tag are removed: ``<`` or ``</`` followed directly
    by a letter, up to the next ``>``. A bare ``<`` or ``>`` used as a comparison
    sign (e.g. ``p < 0.05 and n > 10``) is therefore left in the text, and a tag
    that wraps over a line break is still removed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without tags and without surrounding whitespace.
    """
    # [^<>]* (rather than .*?) keeps the match inside one tag and lets it cross newlines
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    text = text.strip()
    return text
    
# Clean both text columns with basic_clean
for text_col in ('Title', 'Abstract'):
    carp_file1[text_col] = carp_file1[text_col].apply(basic_clean)


In [53]:
# Remove punctuation, remove special characters, and remove stopwords
# Work on a copy: plain assignment would only give the same DataFrame a second
# name, so the column edits made on carp_file2 would also overwrite carp_file1.
carp_file2 = carp_file1.copy()

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to clean text
def advanced_clean(text):
    """Lower-case ``text`` and drop punctuation, special characters and stopwords.

    Uses the module-level ``stop_words`` set and ``nltk.word_tokenize``.
    Returns the remaining tokens joined by single spaces.
    """
    # Lower-case, then delete the ASCII punctuation characters
    lowered = text.lower()
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Delete anything left that is neither a word character nor whitespace
    stripped = re.sub(r'[^\w\s]', '', no_punct)

    # Tokenize and keep only the tokens that are not stopwords
    kept = []
    for token in nltk.word_tokenize(stripped):
        if token.lower() in stop_words:
            continue
        kept.append(token)

    # Rebuild the text from the surviving tokens
    return ' '.join(kept)

# Clean both text columns with advanced_clean
for text_col in ('Title', 'Abstract'):
    carp_file2[text_col] = carp_file2[text_col].apply(advanced_clean)



In [55]:
# Stemming
# Work on a copy: plain assignment would only give the same DataFrame a second
# name, so the column edits made on carp_file3 would also overwrite carp_file2.
carp_file3 = carp_file2.copy()

stemmer = SnowballStemmer(language = 'english')
def stem_words(text):
    """Return ``text`` with each whitespace-separated word replaced by its stem.

    Uses the module-level ``stemmer``; the stems are joined by single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)
    
# Stem both text columns
for text_col in ('Title', 'Abstract'):
    carp_file3[text_col] = carp_file3[text_col].apply(stem_words)


In [57]:
# Lemmatize
# Work on a copy: plain assignment would only give the same DataFrame a second
# name, so the column edits made on carp_file4 would also overwrite carp_file2.
carp_file4 = carp_file2.copy()

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Return ``text`` with each whitespace-separated word replaced by its lemma.

    Uses the module-level ``lemmatizer``; the lemmas are joined by single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)
    
# WordNetLemmatizer reads the 'wordnet' corpus, so make sure it is available
nltk.download('wordnet', quiet = True)

# Apply lemmatize_words (not stem_words) so that this step actually lemmatizes the text
carp_file4['Title'] = carp_file4['Title'].apply(lemmatize_words)
carp_file4['Abstract'] = carp_file4['Abstract'].apply(lemmatize_words)


In [59]:
# Remove numerical values
# Work on a copy: plain assignment would only give the same DataFrame a second
# name, so the column edits made on carp_file5 would also overwrite carp_file2.
# NOTE(review): this step starts from carp_file2, like the stemming and lemmatizing
# steps do - confirm that carp_file2 (rather than carp_file3 / carp_file4) is the intended input.
carp_file5 = carp_file2.copy()

# Function to remove numerical values
def remove_digits(text):
    """Delete every run of digits from ``text`` and tidy the whitespace left behind.

    Removing a standalone number would otherwise leave a doubled or trailing
    space, so the result is re-joined with single spaces, as the stemming and
    lemmatizing helpers do.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without digits, with words separated by single spaces.
    """
    text = re.sub(r'\d+', '', text)
    # split() with no argument drops empty pieces and surrounding whitespace
    return " ".join(text.split())
    
# Strip the digits from both text columns
for text_col in ('Title', 'Abstract'):
    carp_file5[text_col] = carp_file5[text_col].apply(remove_digits)
