## Implementing a Pipeline for Diverse Models

In [1]:
import pandas as pd
import src.article_relevance as ar

annotationDF = pd.read_parquet('data/parquet/neotomaAnnotation.parquet')
embeddings_df = pd.read_parquet('data/raw/embeddingsDF.parquet')
publicationDF = pd.read_parquet('data/raw/processedDF.parquet', engine='fastparquet')
embeddings_df.head(2)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,DOI,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,10.1016/j.palaeo.2007.07.008,1.038911,0.878838,-0.478028,1.14517,-0.804706,-0.443276,0.40328,0.157089,0.289256,...,0.530903,0.437325,-1.263882,-0.826081,-0.729023,-0.429628,-0.298853,-0.566637,-0.021355,-0.13643
1,10.1016/j.quascirev.2015.03.014,1.150523,0.725789,-0.663125,1.228671,-0.501291,-0.477262,0.11032,-0.156153,0.356989,...,0.541672,0.071755,-1.269141,-1.116971,-0.744417,-0.245241,-0.026206,-0.356224,0.109391,-0.142377


In [2]:
bigDF = publicationDF.merge(embeddings_df, on = "DOI")
bigDF.head(2)

Unnamed: 0,CrossRefQueryDate,DOI,URL,abstract,author,container-title,language,published,publisher,subject,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,2023-09-12 13:50:32.389184,10.1016/j.palaeo.2007.07.008,http://dx.doi.org/10.1016/j.palaeo.2007.07.008,,"[{'given': 'Vera', 'family': 'Markgraf', 'sequ...","[Palaeogeography, Palaeoclimatology, Palaeoeco...",en,"{'date-parts': [[2007, 10]]}",Elsevier BV,"[Paleontology, Earth-Surface Processes, Ecolog...",...,0.530903,0.437325,-1.263882,-0.826081,-0.729023,-0.429628,-0.298853,-0.566637,-0.021355,-0.13643
1,2023-09-12 13:50:32.769019,10.1016/j.quascirev.2015.03.014,http://dx.doi.org/10.1016/j.quascirev.2015.03.014,,[{'ORCID': 'http://orcid.org/0000-0001-9605-82...,[Quaternary Science Reviews],en,"{'date-parts': [[2015, 5]]}",Elsevier BV,"[Geology, Archeology, Archeology, Ecology, Evo...",...,0.541672,0.071755,-1.269141,-1.116971,-0.744417,-0.245241,-0.026206,-0.356224,0.109391,-0.142377


In [3]:
completeData = bigDF.merge(annotationDF, on = 'DOI')
completeData.head(3)

Unnamed: 0,CrossRefQueryDate,DOI,URL,abstract,author,container-title,language,published,publisher,subject,...,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,index,annotation,annotator,annotationDate
0,2023-09-12 13:50:32.389184,10.1016/j.palaeo.2007.07.008,http://dx.doi.org/10.1016/j.palaeo.2007.07.008,,"[{'given': 'Vera', 'family': 'Markgraf', 'sequ...","[Palaeogeography, Palaeoclimatology, Palaeoeco...",en,"{'date-parts': [[2007, 10]]}",Elsevier BV,"[Paleontology, Earth-Surface Processes, Ecolog...",...,-0.729023,-0.429628,-0.298853,-0.566637,-0.021355,-0.13643,2081,Neotoma,Simon J. Goring,2023-09-01 11:34:37
1,2023-09-12 13:50:32.389184,10.1016/j.palaeo.2007.07.008,http://dx.doi.org/10.1016/j.palaeo.2007.07.008,,"[{'given': 'Vera', 'family': 'Markgraf', 'sequ...","[Palaeogeography, Palaeoclimatology, Palaeoeco...",en,"{'date-parts': [[2007, 10]]}",Elsevier BV,"[Paleontology, Earth-Surface Processes, Ecolog...",...,-0.729023,-0.429628,-0.298853,-0.566637,-0.021355,-0.13643,2082,Neotoma,Simon J. Goring,2023-09-01 11:34:37
2,2023-09-12 13:50:32.769019,10.1016/j.quascirev.2015.03.014,http://dx.doi.org/10.1016/j.quascirev.2015.03.014,,[{'ORCID': 'http://orcid.org/0000-0001-9605-82...,[Quaternary Science Reviews],en,"{'date-parts': [[2015, 5]]}",Elsevier BV,"[Geology, Archeology, Archeology, Ecology, Evo...",...,-0.744417,-0.245241,-0.026206,-0.356224,0.109391,-0.142377,5896,Neotoma,Simon J. Goring,2023-09-01 11:34:37


In [4]:
completeData.loc[(completeData['annotation']!= 'Neotoma'), 'target'] = 0
completeData.loc[(completeData['annotation']== 'Neotoma'), 'target'] = 1

In [5]:
completeData = completeData[completeData['validForPrediction'] == 1]
completeData.shape[0]

5779

In [6]:
X = completeData.drop(columns=['DOI', 'title', 'subtitle', 'author', 'abstract',
       'language',  'URL', 'published', 'CrossRefQueryDate', 'validForPrediction', 
       'titleSubtitleAbstract', 'target', 'annotation', 'annotator', 'annotationDate', 'index'])
y = completeData['target']
# author might lead to bias 
# we are only considering english text, so drop language
# title subtitle and abstract have already been used with the embeddings
# does it matter when it was published
# must be used subject, container-title(journal), 'publisher'

In [7]:
X.head(2)

Unnamed: 0,container-title,publisher,subject,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,"[Palaeogeography, Palaeoclimatology, Palaeoeco...",Elsevier BV,"[Paleontology, Earth-Surface Processes, Ecolog...",1.038911,0.878838,-0.478028,1.14517,-0.804706,-0.443276,0.40328,...,0.530903,0.437325,-1.263882,-0.826081,-0.729023,-0.429628,-0.298853,-0.566637,-0.021355,-0.13643
1,"[Palaeogeography, Palaeoclimatology, Palaeoeco...",Elsevier BV,"[Paleontology, Earth-Surface Processes, Ecolog...",1.038911,0.878838,-0.478028,1.14517,-0.804706,-0.443276,0.40328,...,0.530903,0.437325,-1.263882,-0.826081,-0.729023,-0.429628,-0.298853,-0.566637,-0.021355,-0.13643


In [8]:
y.tail(2)

5885    0.0
5886    0.0
Name: target, dtype: float64

## Start the Pipeline

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
y_train.value_counts()


target
1.0    4275
0.0     348
Name: count, dtype: int64

In [12]:
strFeature = ['publisher']
strTransformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')

In [13]:
subFeature = ['subject', 'container-title']
subTransformer = ar.NeotomaOneHotEncoder(min_count=10)

In [14]:
preprocessor = ColumnTransformer(
        transformers = [
            ("str_preprocessor", strTransformer, strFeature),
            ('neotoma_encoder', subTransformer, subFeature),
        ],
        remainder = "passthrough"
    )

In [15]:
# Models to Try
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
#from lightgbm.sklearn import LGBMClassifier
#from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

In [16]:
results_dict = dict()

classifiers = [
    (LogisticRegression(), {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'max_iter': [100, 1000, 10000],
        'penalty': ['l2']#,
      #  'solver': ['liblinear', 'lbfgs']
    }),
    (DecisionTreeClassifier(class_weight="balanced"), {
        'max_depth': range(10, 100, 10)
    }),
    (KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto'), {
        'n_neighbors': range(5, 100, 10)
    }),
    (BernoulliNB(alpha=1.0, binarize=0.0), {
        'alpha': [0.001, 0.01, 0.1, 1.0]
    }),
    (RandomForestClassifier(n_estimators=100, max_depth=None), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30]
    })
]

In [17]:
from sklearn.metrics import make_scorer, recall_score, f1_score, precision_score, accuracy_score
import numpy as np
import time

# Define the metrics you want to capture
classification_metrics = {
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'accuracy': make_scorer(accuracy_score)
}

In [18]:
from sklearn.impute import SimpleImputer
import joblib

best_classifiers = []

resultsDict = {'classifier': [],
               'Fit Time' :[],
               'recall': [],
               'f1': [],
               'precision': [],
               'accuracy':[]}

for classifier, param_grid in classifiers:

    classifier_name = str(type(classifier).__name__).lower()

    # Define the preprocessing pipeline
    pipeline = make_pipeline(
        preprocessor,
        SimpleImputer(strategy='constant', fill_value=0),
        classifier
    )

    param_grid = {f"{classifier_name}__{key}": value for key, value in param_grid.items()}

    randomized_search = RandomizedSearchCV(
        estimator=pipeline,  # Use the pipeline as the estimator
        param_distributions=param_grid,  # Hyperparameters go here
        scoring=classification_metrics,
        cv=5,
        n_iter=10,
        random_state=123,
        n_jobs=-1,
        refit='accuracy',
        return_train_score=True
    )

    start_time = time.time()
    randomized_search.fit(X_train, y_train)

    best_classifier = randomized_search.best_estimator_
    joblib.dump(best_classifier, f"/Users/sedv8808/HT-Data/UWisc/article-relevance/models/{classifier_name}.joblib")
    
    best_params = randomized_search.best_params_
    
    best_scores_train = {
        metric: randomized_search.cv_results_[f"mean_train_{metric}"][randomized_search.best_index_]
        for metric in classification_metrics
    }

    best_scores_test = {
        metric: randomized_search.cv_results_[f"mean_test_{metric}"][randomized_search.best_index_]
        for metric in classification_metrics
    }



    fit_time = time.time() - start_time

    classifier_name = str(type(classifier).__name__)
    resultsDict['classifier'].append(classifier_name)
    resultsDict['Fit Time'].append(fit_time)
    resultsDict['train_recall'].append(best_scores_train['recall'])
    resultsDict['train_f1'].append(best_scores_train['f1'])
    resultsDict['train_precision'].append(best_scores_train['precision'])
    resultsDict['train_accuracy'].append(best_scores_train['accuracy'])
    resultsDict['test_recall'].append(best_scores_test['recall'])
    resultsDict['test_f1'].append(best_scores_test['f1'])
    resultsDict['test_precision'].append(best_scores_test['precision'])
    resultsDict['test_accuracy'].append(best_scores_test['accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
pd.DataFrame(resultsDict)

Unnamed: 0,classifier,Fit Time,recall,f1,precision,accuracy
0,LogisticRegression,5237.960098,0.996725,0.992089,0.987506,0.985291
1,DecisionTreeClassifier,1612.105482,0.990409,0.986946,0.983518,0.975774
2,KNeighborsClassifier,3573.297831,0.997193,0.984877,0.972881,0.971662
3,BernoulliNB,3549.758078,0.795789,0.874024,0.969429,0.788007
4,RandomForestClassifier,8733.43689,0.996725,0.991626,0.986586,0.984426


In [22]:
randomized_search.cv_results_['mean_train_accuracy']

array([0.99302407, 1.        , 0.99307819, 0.99989185, 0.99902657,
       0.99913471, 1.        , 0.99259142, 1.        , 0.99913477])

## Assess all models with test data