In [18]:
# import libraries
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
nltk.download('stopwords')

from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jredi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Import data
df = pd.read_csv(r'..\data\processed\train-test.csv')
df.head()


Unnamed: 0,Author,Year,Title,Journal Name,Volume,Issue,Pages,Abstract,categories,TitleAbstract
0,"Aldridge, C. A., and E. C. Boone",2022,Simple models to quickly estimate the probable...,River Research and Applications,38,6.0,1154-1166,Species distribution models provide biologists...,1,simpl model quick estim probabl rang datalimit...
1,"Banan, A., A. Nasiri, and A. Taheri-Garavand",2020,Deep learning-based appearance features extrac...,Aquacultural Engineering,89,,,Fish species identification is vital for aquac...,1,deep learningbas appear featur extract autom c...
2,"Barnes, M. A., W. L. Chadderton, C. L. Jerde, ...",2021,Environmental conditions influence edna partic...,Environmental DNA,3,3.0,643-653,Knowledge about the size of environmental DNA ...,1,environment condit influenc edna particl size ...
3,"Behera, B. K., A. K. Bera, P. Paria, A. Das, P...",2018,Identification and pathogenicity of plesiomona...,Aquaculture,493,,314-318,Plesiomonas shigelloides was isolated from dis...,1,identif pathogen plesiomona shigelloid silver ...
4,"Borland, L. K., C. J. Mulcahy, B. A. Bennie, D...",2020,Using markov chains to quantitatively assess m...,Natural Resource Modeling,33,4.0,,Natural resource managers use barriers to dete...,1,use markov chain quantit assess movement patte...


In [4]:
# Create label encoder 
label_encoder = LabelEncoder()
df["categories"] = label_encoder.fit_transform(df['categories'])

In [5]:
# Text preprocessing

# Removal of HTML tags and whitespace
df_clean = df.copy()

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove HTML tags
def basic_clean(text):
    text = re.sub(r'<.*?>', '', text)
    text = text.strip()
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^\w\s]', '', text)

    # Function to remove stopwords from text
    def remove_stopwords(text):
        # Tokenize the text
        tokens = nltk.word_tokenize(text)
        # Filter out stopwords
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        # Reconstruct the text without stopwords
        text_without_stopwords = ' '.join(filtered_tokens)
        return text_without_stopwords

    text = remove_stopwords(text)

    stemmer = SnowballStemmer(language = 'english')
    text = " ".join([stemmer.stem(word) for word in text.split()])

    return text
    
df_clean['TitleAbstract'] = df_clean['TitleAbstract'].apply(basic_clean)


In [20]:
df_clean["categories"].value_counts()

categories
0    408
1    233
Name: count, dtype: int64

In [21]:
# Create X and y
X = df_clean['TitleAbstract']
y = df_clean['categories']

In [8]:
# # Imbalanced data set - since we consistently saw more irrevlant document, it would be better to train model on imbalanced data set

# # Create X and y
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# for train_index, test_index in sss.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    

### Balancing Dataset: 
I want to prioritize the reduction of false negatives when selecting relevant articles. I decided to balance dataset to give mode a better chance of learning to detect relevant articles.


In [22]:
# Create balanced datasets for article selection
balanced_dfs = []

# Count the number of rows in each category
category_counts = df['categories'].value_counts()

# Find the minority category
minority_category = category_counts.idxmin()

# Get the size of the minority category
minority_category_size = category_counts[minority_category]

# Sample rows from the majority category to match the size of the minority category
majority_category_rows = df[df['categories'] != minority_category]
balanced_majority_category_rows = majority_category_rows.sample(n=minority_category_size, random_state=42)

# Get the minority category rows
minority_category_rows = df[df['categories'] == minority_category]

# Concatenate the minority and balanced majority category rows
balanced_df = pd.concat([minority_category_rows, balanced_majority_category_rows])

balanced_dfs.append(balanced_df)


In [25]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(balanced_df['TitleAbstract'], balanced_df['categories'], test_size = 0.2, random_state = 0)


In [23]:
# Define vectorizers
vectorizers = {
    "CountVectorizer": CountVectorizer(),
    "TFIDFVectorizer": TfidfVectorizer(),
    "HashingVectorizer": HashingVectorizer(n_features=2**10)
}

In [29]:
# Define model parameters
models = {
    "LR": (LogisticRegression(), {
        'model__penalty': ['l1', 'l2', 'elasticnet'],
        'model__C': [0.1, 1, 10, 100],
        'model__solver': ['liblinear', 'lbfgs', 'saga'],
        'model__multi_class': ['ovr', 'multinomial'],
    }),
    "kNN": (KNeighborsClassifier(), {
        'model__n_neighbors': list(range(1, 21)),
        'model__weights': ['uniform', 'distance'],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }),
    "SVM": (SVC(), {
        'model__C': [0.1, 1, 10, 100],
        'model__gamma': [1, 0.1, 0.01, 0.001],
        'model__kernel': ['linear', 'rbf', 'sigmoid'] # decided not to test 'poly'
    }),
    "Random Forest": (RandomForestClassifier(), {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5]
    }),
    "Decision Tree": (DecisionTreeClassifier(), {
        'model__max_depth': [None, 2, 4, 6, 8],
        'model__min_samples_split': [2, 5, 10]
    }),
    "Naive Bayes": (MultinomialNB(), {
        'model__alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
    })
}

In [33]:
# Initialize a dictionary to hold the best models
best_models = {}

# Grid Search for each vectorizer and model
for vec_name, vectorizer in vectorizers.items():
    for name, (model, params) in models.items():

        # Skip MultinomialNB with HashingVectorizer
        if name == "Naive Bayes" and vec_name == "HashingVectorizer":
            print(f"Skipping {vec_name} with {name} due to incompatible values.")
            continue
        
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('model', model)
        ])
        
        grid_search = GridSearchCV(pipeline, params, cv = 10, scoring = 'recall')
        grid_search.fit(X_train, y_train)  # Fit the model
        best_models[f"{vec_name}_{name}"] = grid_search.best_estimator_  # Store the best model
        best_accuracy = grid_search.best_score_

print(best_models)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Skipping HashingVectorizer with Naive Bayes due to incompatible values.
{'CountVectorizer_LR': Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('model',
                 LogisticRegression(C=1, multi_class='ovr',
                                    solver='liblinear'))]), 'CountVectorizer_kNN': Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('model', KNeighborsClassifier(n_neighbors=11))]), 'CountVectorizer_SVM': Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('model', SVC(C=0.1, gamma=0.01, kernel='sigmoid'))]), 'CountVectorizer_Random Forest': Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('model',
                 RandomForestClassifier(min_samples_split=5,
                                        n_estimators=200))]), 'CountVectorizer_Decision Tree': Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('model', DecisionTreeClassifier(max_depth=2))]), 'CountVectorizer_Naive Bayes': Pipeline

In [34]:
best_models

{'CountVectorizer_LR': Pipeline(steps=[('vectorizer', CountVectorizer()),
                 ('model',
                  LogisticRegression(C=1, multi_class='ovr',
                                     solver='liblinear'))]),
 'CountVectorizer_kNN': Pipeline(steps=[('vectorizer', CountVectorizer()),
                 ('model', KNeighborsClassifier(n_neighbors=11))]),
 'CountVectorizer_SVM': Pipeline(steps=[('vectorizer', CountVectorizer()),
                 ('model', SVC(C=0.1, gamma=0.01, kernel='sigmoid'))]),
 'CountVectorizer_Random Forest': Pipeline(steps=[('vectorizer', CountVectorizer()),
                 ('model',
                  RandomForestClassifier(min_samples_split=5,
                                         n_estimators=200))]),
 'CountVectorizer_Decision Tree': Pipeline(steps=[('vectorizer', CountVectorizer()),
                 ('model', DecisionTreeClassifier(max_depth=2))]),
 'CountVectorizer_Naive Bayes': Pipeline(steps=[('vectorizer', CountVectorizer()),
               

In [37]:
# Evaluate the best models on the test set
metrics_list = []

for name, model in best_models.items():
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['model']
    
    # Transform the test data
    X_test_vec = vectorizer.transform(X_test)  # Transform X_test with the same vectorizer used in training
    
    # Make predictions
    y_pred = classifier.predict(X_test_vec)
    
    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Store metrics
    metrics = [name, accuracy, precision, recall, f1]
    metrics_list.append(metrics)
    
# Create a DataFrame with the results
column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
result_df = pd.DataFrame(metrics_list, columns=column_names)
print(result_df)

                              Model  Accuracy  Precision    Recall        F1
0                CountVectorizer_LR  0.744681   0.745586  0.744681  0.744796
1               CountVectorizer_kNN  0.521277   0.511847  0.521277  0.406375
2               CountVectorizer_SVM  0.691489   0.696258  0.691489  0.687325
3     CountVectorizer_Random Forest  0.712766   0.713899  0.712766  0.711290
4     CountVectorizer_Decision Tree  0.489362   0.508286  0.489362  0.423730
5       CountVectorizer_Naive Bayes  0.744681   0.759140  0.744681  0.738967
6                TFIDFVectorizer_LR  0.776596   0.776518  0.776596  0.776469
7               TFIDFVectorizer_kNN  0.712766   0.718544  0.712766  0.708888
8               TFIDFVectorizer_SVM  0.765957   0.766016  0.765957  0.765638
9     TFIDFVectorizer_Random Forest  0.712766   0.713899  0.712766  0.711290
10    TFIDFVectorizer_Decision Tree  0.574468   0.574722  0.574468  0.564946
11      TFIDFVectorizer_Naive Bayes  0.734043   0.788830  0.734043  0.717038

In [38]:
# Export best model
max_index = result_df['Recall'].idxmax()
best_model = best_models.get(result_df.loc[max_index,'Model'])
print(best_model)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model',
                 LogisticRegression(C=1, multi_class='ovr',
                                    solver='liblinear'))])


In [27]:
# Save model
with open('../models/ml_model.pkl', 'wb') as file:
    pickle.dump((best_model, vectorizer), file)