In [1]:
# import libraries
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jredi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Import data
df = pd.read_csv(r'data\processed\train-test.csv')
df.head()


Unnamed: 0,Author,Year,Title,Journal Name,Volume,Issue,Pages,Abstract,categories,TitleAbstract
0,"Aldridge, C. A., and E. C. Boone",2022,Simple models to quickly estimate the probable...,River Research and Applications,38,6.0,1154-1166,Species distribution models provide biologists...,1,simpl model quick estim probabl rang datalimit...
1,"Banan, A., A. Nasiri, and A. Taheri-Garavand",2020,Deep learning-based appearance features extrac...,Aquacultural Engineering,89,,,Fish species identification is vital for aquac...,1,deep learningbas appear featur extract autom c...
2,"Barnes, M. A., W. L. Chadderton, C. L. Jerde, ...",2021,Environmental conditions influence edna partic...,Environmental DNA,3,3.0,643-653,Knowledge about the size of environmental DNA ...,1,environment condit influenc edna particl size ...
3,"Behera, B. K., A. K. Bera, P. Paria, A. Das, P...",2018,Identification and pathogenicity of plesiomona...,Aquaculture,493,,314-318,Plesiomonas shigelloides was isolated from dis...,1,identif pathogen plesiomona shigelloid silver ...
4,"Borland, L. K., C. J. Mulcahy, B. A. Bennie, D...",2020,Using markov chains to quantitatively assess m...,Natural Resource Modeling,33,4.0,,Natural resource managers use barriers to dete...,1,use markov chain quantit assess movement patte...


In [5]:
# Prepare data

# Create label encoder 
label_encoder = LabelEncoder()
df["categories"] = label_encoder.fit_transform(df['categories'])

In [6]:
# Text preprocessing

# Removal of HTML tags and whitespace
df_clean = df.copy()

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove HTML tags
def basic_clean(text):
    text = re.sub(r'<.*?>', '', text)
    text = text.strip()
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^\w\s]', '', text)

    # Function to remove stopwords from text
    def remove_stopwords(text):
        # Tokenize the text
        tokens = nltk.word_tokenize(text)
        # Filter out stopwords
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        # Reconstruct the text without stopwords
        text_without_stopwords = ' '.join(filtered_tokens)
        return text_without_stopwords

    text = remove_stopwords(text)

    stemmer = SnowballStemmer(language = 'english')
    text = " ".join([stemmer.stem(word) for word in text.split()])

    return text
    
df_clean['TitleAbstract'] = df_clean['TitleAbstract'].apply(basic_clean)


In [20]:
df["categories"].value_counts()

categories
0    408
1    233
Name: count, dtype: int64

In [21]:
# Create balanced datasets for article selection
balanced_dfs = []

# Count the number of rows in each category
category_counts = df['categories'].value_counts()

# Find the minority category
minority_category = category_counts.idxmin()

# Get the size of the minority category
minority_category_size = category_counts[minority_category]

# Sample rows from the majority category to match the size of the minority category
majority_category_rows = df[df['categories'] != minority_category]
balanced_majority_category_rows = majority_category_rows.sample(n=minority_category_size, random_state=42)

# Get the minority category rows
minority_category_rows = df[df['categories'] == minority_category]

# Concatenate the minority and balanced majority category rows
balanced_df = pd.concat([minority_category_rows, balanced_majority_category_rows])

balanced_dfs.append(balanced_df)


In [29]:
# Train test split
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(balanced_df['TitleAbstract'], balanced_df['categories'], test_size = 0.2, random_state = 0)

In [30]:
# Vectorize
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [31]:
print("Training features shape:", X_train_vec.shape)
print("Test features shape:", X_test_vec.shape)


Training features shape: (372, 6811)
Test features shape: (94, 6811)


In [34]:
# Define model parameters
models = {
    "LR": (LogisticRegression(), {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs', 'saga'],
        'multi_class': ['ovr', 'multinomial'],
    }),
    "kNN": (KNeighborsClassifier(), {
        'n_neighbors': list(range(1, 21)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    })
}

In [33]:
# Define model parameters
models = {
    "LR": (LogisticRegression(), {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs', 'saga'],
        'multi_class': ['ovr', 'multinomial'],
    }),
    "kNN": (KNeighborsClassifier(), {
        'n_neighbors': list(range(1, 21)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }),
    "SVM": (SVC(), {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['linear', 'rbf', 'sigmoid'] # decided not to test 'poly'
    }),
    "Random Forest": (RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }),
    "Decision Tree": (DecisionTreeClassifier(), {
        'max_depth': [None, 2, 4, 6, 8],
        'min_samples_split': [2, 5, 10]
    }),
    "Naive Bayes": (MultinomialNB(), {
        'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
    }),
    "XGBoost": (xgb.XGBClassifier(), {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'subsample': [0.5, 0.8, 1.0],
        'colsample_bytree': [0.5, 0.8, 1.0]
    })
}

In [35]:
# Initialize a dictionary to hold the best models
best_models = {}

# Grid Search for each model
for name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv = 10, scoring='accuracy')
    grid_search.fit(X_train_vec, y_train)  # Fit the model
    best_models[name] = grid_search.best_estimator_  # Store the best model
    best_accuracy = grid_search.best_score_

print(best_models)

400 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jredi\Documents\coding\invasive-fish-ML-article-selection\venvSelectArticle\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jredi\Documents\coding\invasive-fish-ML-article-selection\venvSelectArticle\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jredi\Documents\coding\invasive-fish-ML-article-selection\venvSelectArticle\Lib\site-p

{'LR': LogisticRegression(C=100, multi_class='multinomial', penalty='l1',
                   solver='saga'), 'kNN': KNeighborsClassifier(n_neighbors=9)}


In [36]:
best_models


{'LR': LogisticRegression(C=100, multi_class='multinomial', penalty='l1',
                    solver='saga'),
 'kNN': KNeighborsClassifier(n_neighbors=9)}

In [40]:
y_pred


array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1], dtype=int64)

In [41]:
y_test

416    0
505    0
134    1
90     1
431    0
      ..
359    0
588    0
55     1
160    1
135    1
Name: categories, Length: 94, dtype: int64

In [37]:
# Evaluate the best models on the test set
metrics_list = []

for name, model in best_models.items():
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average ='weighted', zero_division = 0)
    recall = recall_score(y_test, y_pred, average ='weighted')
    f1 = f1_score(y_test, y_pred, average ='weighted')
    metrics = [name, accuracy, precision, recall, f1]
    metrics_list.append(metrics)
    
column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
result_df = pd.DataFrame(metrics_list, columns=column_names)
print(result_df)

  Model  Accuracy  Precision    Recall        F1
0    LR  0.755319   0.757025  0.755319  0.755402
1   kNN  0.744681   0.747359  0.744681  0.742927


In [38]:
# Export best model
max_index = result_df['Accuracy'].idxmax()
best_model = best_models.get(result_df.loc[max_index,'Model'])

In [39]:
# Save model
with open('models/ml_model.pkl', 'wb') as file:
    pickle.dump((best_model, vectorizer), file)