# Embeddings & Training

In [1]:
import pandas as pd
import src.article_relevance as ar
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the publication metadata and the annotations from their Parquet files.
publicationDF = pd.read_parquet(
    'data/parquet/publicationMetadataDF.parquet',
    engine='fastparquet',
)
annotationDF = pd.read_parquet('data/parquet/AnnotationDF.parquet')

# Row count of the publication table.
len(publicationDF)

2670

In [3]:
# Row count of the annotation table.
len(annotationDF)

2700

In [4]:
# Tally how many rows carry each value of the 'annotation' column.
annotationDF['annotation'].value_counts()

annotation
Not Neotoma      1775
Neotoma           852
Maybe Neotoma      73
Name: count, dtype: int64

In [5]:
# Count occurrences of each DOI, sorted ascending by count; if the last
# (largest) count is 1, no DOI appears more than once in publicationDF.
publicationDF['DOI'].value_counts().sort_values()

DOI
10.1177/095968369400400404          1
10.1073/pnas.1222239110             1
10.1016/j.foreco.2010.05.001        1
10.1016/j.revpalbo.2011.05.003      1
10.1111/j.1365-2699.2011.02618.x    1
                                   ..
10.5194/bg-2016-354                 1
10.1080/02724634.2015.1113803       1
10.1016/s0367-2530(17)30981-7       1
10.1016/j.quaint.2005.03.004        1
10.1540/jsmr.59.28                  1
Name: count, Length: 2670, dtype: int64

## Prediction Parquet

The prediction Parquet file will contain the following columns:
```python
['DOI', 'prediction', 'predict_proba', 'model_metadata', 'prediction_date']
```

### Adding Embeddings to the publicationDF

In [6]:
# Print the wall-clock time before and after the call so the duration of the
# embedding step can be read off the cell output.
print(datetime.now())
# NOTE(review): 'titleSubtitleAbstract' is presumably the name of the text
# column to embed — confirm against ar.addEmbeddings.
embeddingsDF = ar.addEmbeddings(publicationDF, 'titleSubtitleAbstract')
print(datetime.now())

2023-10-10 09:48:16.366822
2023-10-10 09:48:16,367 - addEmbeddings.py:20 - addEmbeddings - INFO - Starting Sentence Embedding.


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 10810.06it/s]

2023-10-10 09:48:31,835 - addEmbeddings.py:34 - addEmbeddings - INFO - Tokenizing sentences and creating embeddings





2023-10-10 11:11:36,571 - addEmbeddings.py:61 - addEmbeddings - INFO - Sentence embedding completed.
2023-10-10 11:11:37.404899


In [7]:
# List the column labels of the frame returned by ar.addEmbeddings.
embeddingsDF.columns

Index(['CrossRefQueryDate', 'DOI', 'URL', 'abstract', 'author',
       'container-title', 'language', 'published', 'publisher', 'subject',
       ...
       'embedding_758', 'embedding_759', 'embedding_760', 'embedding_761',
       'embedding_762', 'embedding_763', 'embedding_764', 'embedding_765',
       'embedding_766', 'embedding_767'],
      dtype='object', length=782)

In [8]:
# Keep only the column labels that begin with the "embedding_" prefix.
selectedCols = [
    columnName
    for columnName in embeddingsDF.columns
    if columnName.startswith("embedding_")
]

In [9]:
# Add "DOI" and move it to the front. The sort is stable, so every other
# column keeps its relative order. Slice assignment updates the list in place.
selectedCols[:] = sorted([*selectedCols, "DOI"], key=lambda name: name != "DOI")


In [10]:
# Reduce the frame to the columns named in selectedCols, in that order.
embeddingsDF = embeddingsDF[selectedCols]

In [11]:
# Preview the first three rows of embeddingsDF.
embeddingsDF.head(3)

Unnamed: 0,DOI,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,10.1016/j.quascirev.2014.04.014,1.460713,1.039954,-0.695859,1.206938,-1.106794,-0.166607,0.261164,-0.251705,0.275001,...,0.750866,0.160514,-1.333266,-1.030996,-0.712262,-0.215967,-0.134665,-0.285963,0.342186,-0.005781
1,10.1002/gea.10020,0.77071,1.049734,-0.231182,0.321153,-1.076258,0.213778,-0.017972,-0.489939,0.222379,...,-0.08044,-0.000156,-0.561089,-0.206273,-0.695066,-0.604755,0.050028,-0.094507,-0.153755,-0.240465
2,10.1139/e80-122,0.532121,0.436422,-0.784752,0.379681,-0.0576,-0.036988,1.149151,-0.22361,-0.165904,...,-0.175802,-0.785979,-0.373426,-0.472257,-0.790648,-1.567279,-0.251331,-0.473002,-0.605666,0.727403


In [12]:
# Count occurrences of each DOI in embeddingsDF, sorted ascending by count.
embeddingsDF['DOI'].value_counts().sort_values()

DOI
10.1177/095968369400400404          1
10.1073/pnas.1222239110             1
10.1016/j.foreco.2010.05.001        1
10.1016/j.revpalbo.2011.05.003      1
10.1111/j.1365-2699.2011.02618.x    1
                                   ..
10.5194/bg-2016-354                 1
10.1080/02724634.2015.1113803       1
10.1016/s0367-2530(17)30981-7       1
10.1016/j.quaint.2005.03.004        1
10.29262/ram.v68i4.842              1
Name: count, Length: 2670, dtype: int64

In [13]:
# Row count of publicationDF, for comparison with embeddingsDF.
len(publicationDF)

2670

In [14]:
# Row count of embeddingsDF.
len(embeddingsDF)

2670

In [15]:
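# Uncomment to save embeddingsDF to data/parquet/embeddingsDF.parquet, the file
# that the "Run From Here" section below reads back in.
#embeddingsDF.to_parquet('data/parquet/embeddingsDF.parquet', engine='fastparquet', compression='snappy', index=False)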

# Run From Here
## (if embeddings file is available)

In [16]:
import pandas as pd
import src.article_relevance as ar

# Reload the publication metadata, the annotations and the embeddings file.
# NOTE(review): only the first read pins engine='fastparquet'; the other two
# leave the engine choice to pandas' default.
publicationDF = pd.read_parquet('data/parquet/publicationMetadataDF.parquet', engine='fastparquet')
annotationDF = pd.read_parquet('data/parquet/AnnotationDF.parquet')
embeddingsDF = pd.read_parquet('data/parquet/embeddingsDF.parquet')

In [17]:
# Inner-join the publication metadata with the embeddings on DOI.
# NOTE: this rebinds embeddingsDF, so re-running the cell merges again.
embeddingsDF = pd.merge(publicationDF, embeddingsDF, on="DOI")
embeddingsDF.head(1)

Unnamed: 0,CrossRefQueryDate,DOI,URL,abstract,author,container-title,language,published,publisher,subject,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,2023-10-06 22:41:36.395841,10.1016/j.quascirev.2014.04.014,http://dx.doi.org/10.1016/j.quascirev.2014.04.014,,[{'ORCID': 'http://orcid.org/0000-0001-5732-37...,Quaternary Science Reviews,en,"{'date-parts': [[2014, 6]]}",Elsevier BV,"[Geology, Archeology, Archeology, Ecology, Evo...",...,0.750866,0.160514,-1.333266,-1.030996,-0.712262,-0.215967,-0.134665,-0.285963,0.342186,-0.005781


In [18]:
# Inner-join the annotations onto the metadata + embedding rows by DOI; rows
# without a matching DOI on either side are dropped.
completeDF = pd.merge(embeddingsDF, annotationDF, on='DOI')
completeDF.head(1)

Unnamed: 0,CrossRefQueryDate,DOI,URL,abstract,author,container-title,language,published,publisher,subject,...,embedding_764,embedding_765,embedding_766,embedding_767,annotation,annotator,annotationDate,verified,verifiedBy,verifiedTimeStamp
0,2023-10-06 22:41:36.395841,10.1016/j.quascirev.2014.04.014,http://dx.doi.org/10.1016/j.quascirev.2014.04.014,,[{'ORCID': 'http://orcid.org/0000-0001-5732-37...,Quaternary Science Reviews,en,"{'date-parts': [[2014, 6]]}",Elsevier BV,"[Geology, Archeology, Archeology, Ecology, Evo...",...,-0.134665,-0.285963,0.342186,-0.005781,Neotoma,Simon J. Goring,2023-10-06 23:05:47,No,,


In [19]:
# Binary label: 1.0 for 'Neotoma' and 'Maybe Neotoma', 0.0 for every other
# annotation value (including missing ones). Kept as float so the column has
# the same dtype as when it is filled through masked .loc assignments.
completeDF['target'] = (
    completeDF['annotation']
    .isin(['Neotoma', 'Maybe Neotoma'])
    .astype(float)
)

In [20]:
# Number of rows per target value (class balance).
completeDF['target'].value_counts()

target
0.0    1754
1.0     875
Name: count, dtype: int64

In [21]:
# Count rows per DOI in the merged frame (value_counts lists the most frequent first).
completeDF['DOI'].value_counts()

DOI
10.1540/jsmr.59.28                              1
10.1016/j.quascirev.2014.04.014                 1
10.1002/gea.10020                               1
10.1139/e80-122                                 1
10.1016/j.quaint.2015.05.009                    1
                                               ..
10.1139/e04-081                                 1
10.1130/0091-7613(2000)28<51:rroatv>2.0.co;2    1
10.1080/00173134.2014.927916                    1
10.1016/j.yqres.2006.11.004                     1
10.1016/j.yqres.2007.12.002                     1
Name: count, Length: 2629, dtype: int64

In [22]:
# Feature columns: every "embedding_*" column plus the two metadata columns.
selectedCols = [
    columnName
    for columnName in completeDF.columns
    if columnName.startswith('embedding_')
]
selectedCols.extend(['subject', 'container-title'])
# Stable sort that moves the two metadata columns to the front and leaves the
# embedding columns in their existing order.
selectedCols.sort(key=lambda name: name not in ('subject', 'container-title'))

In [23]:
# Feature matrix: the columns named in selectedCols. Label vector: 'target'.
X = completeDF.loc[:, selectedCols]
y = completeDF.loc[:, 'target']
# Notes on which metadata columns to use as features:
# - author: left out because it might introduce bias.
# - language: dropped, since only English text is considered.
# - title, subtitle and abstract: already used through the embeddings.
# - published: open question whether the publication date matters.
# - subject, container-title (journal) and publisher: listed as must-use.
#   NOTE(review): publisher is not among the columns added to selectedCols.

In [24]:
# Number of rows in the feature matrix.
len(X)

2629

In [25]:
# Number of labels; should match the row count of X.
len(y)

2629

## Feature selection

``` markdown
Neotoma Encoder
['subject']

Count Vectorizer (BOW)
['container-title']
```

## Start the Pipeline

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Hand the training split to ar.relevancePredictTrain and keep what it returns.
# NOTE(review): later cells index the result with the keys 'model',
# 'model_name' and 'report'; the full contract of the helper is not visible
# in this notebook — confirm against its source.
resultsDict = ar.relevancePredictTrain(X_train, y_train)

Setting up features
Beginning training
Training logisticregression.
Starting fit at 2023-10-10_11-15-45
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process jus

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training decisiontreeclassifier.
Starting fit at 2023-10-10_11-16-16




Training kneighborsclassifier.
Starting fit at 2023-10-10_11-17-00
Training bernoullinb.
Starting fit at 2023-10-10_11-17-25




Training randomforestclassifier.
Starting fit at 2023-10-10_11-17-37
finished process; returning results


In [29]:
# Show the first entry of resultsDict['report'] as a DataFrame.
pd.DataFrame(resultsDict['report'][0])

Unnamed: 0,classifier,Fit Time,train_recall,train_f1,train_precision,train_accuracy,test_recall,test_f1,test_precision,test_accuracy
0,LogisticRegression,0 days 00:00:31.085787,0.882713,0.929025,0.980495,0.955302,0.740298,0.778817,0.821906,0.860675
1,DecisionTreeClassifier,0 days 00:00:43.942452,0.922179,0.916624,0.911952,0.944483,0.63999,0.624444,0.612665,0.745627
2,KNeighborsClassifier,0 days 00:00:24.852635,0.745343,0.740661,0.736137,0.827033,0.700154,0.687811,0.67624,0.789345
3,BernoulliNB,0 days 00:00:11.834530,0.729207,0.579243,0.480476,0.648956,0.710082,0.568863,0.474844,0.643826
4,RandomForestClassifier,0 days 00:01:22.419802,1.0,0.999821,0.999642,0.999881,0.621141,0.706785,0.822122,0.829756


## Evaluating

In [30]:
# For each fitted model, record its name, hard predictions and class
# probabilities on X_test. The name is printed before predicting so progress
# is visible.
testResults = []

for position, fittedModel in enumerate(resultsDict['model']):
    modelLabel = resultsDict['model_name'][position]
    print(modelLabel)
    testResults.append(
        {
            'model_name': modelLabel,
            'prediction': fittedModel.predict(X_test),
            'predict_proba': fittedModel.predict_proba(X_test),
        }
    )

LogisticRegression
DecisionTreeClassifier
KNeighborsClassifier
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



BernoulliNB
RandomForestClassifier


In [31]:
# Inspect which keys the first per-model result dict carries.
testResults[0].keys()

dict_keys(['model_name', 'prediction', 'predict_proba'])

In [32]:
# Gather, per model, its hard predictions and the second predict_proba column
# (index 1) under "<model>_prediction" / "<model>_predProba".
# Assumes the stored predictions are positional NumPy arrays, as scikit-learn
# estimators return — TODO confirm for the pipelines built in ar.
predictionColumns = {}
for result in testResults:
    name = result['model_name']
    predictionColumns[name + '_prediction'] = result['prediction']
    predictionColumns[name + '_predProba'] = result['predict_proba'][:, 1]

# Build the frame once, labelled with y_test's index, so the true target can
# be joined on index rather than by position.
predictionsDF = pd.DataFrame(predictionColumns, index=y_test.index).join(y_test)
print(predictionsDF.shape)

(526, 11)


In [33]:
# Display the per-model prediction / probability columns next to the true target.
predictionsDF

Unnamed: 0,LogisticRegression_prediction,LogisticRegression_predProba,DecisionTreeClassifier_prediction,DecisionTreeClassifier_predProba,KNeighborsClassifier_prediction,KNeighborsClassifier_predProba,BernoulliNB_prediction,BernoulliNB_predProba,RandomForestClassifier_prediction,RandomForestClassifier_predProba,target
221,1.0,0.990222,0.0,0.000000,1.0,0.600000,0.0,9.314396e-06,1.0,0.53,1.0
318,1.0,0.951752,1.0,1.000000,1.0,0.933333,1.0,1.000000e+00,1.0,0.68,1.0
926,0.0,0.027619,0.0,0.134327,0.0,0.133333,0.0,2.184222e-25,0.0,0.07,0.0
2489,1.0,0.956433,1.0,0.931151,1.0,0.600000,0.0,3.418075e-18,0.0,0.40,1.0
1420,0.0,0.000758,0.0,0.134327,0.0,0.066667,0.0,3.759976e-21,0.0,0.11,0.0
...,...,...,...,...,...,...,...,...,...,...,...
76,0.0,0.383755,0.0,0.421443,1.0,0.533333,1.0,1.000000e+00,0.0,0.41,1.0
2581,0.0,0.094495,0.0,0.134327,0.0,0.266667,0.0,5.241724e-76,0.0,0.27,0.0
620,1.0,0.861018,1.0,1.000000,1.0,0.800000,0.0,2.203860e-02,1.0,0.58,1.0
1151,0.0,0.114453,0.0,0.015799,0.0,0.066667,0.0,8.450976e-28,0.0,0.13,0.0


In [34]:
from sklearn.metrics import classification_report, confusion_matrix


In [35]:
# Per-class precision / recall / F1 of the logistic-regression predictions.
classification_rep = classification_report(
    y_true=predictionsDF['target'],
    y_pred=predictionsDF['LogisticRegression_prediction'],
)

print("Log Reg Classification Report: \n ", classification_rep)

Log Reg Classification Report: 
                precision    recall  f1-score   support

         0.0       0.89      0.93      0.91       348
         1.0       0.85      0.78      0.82       178

    accuracy                           0.88       526
   macro avg       0.87      0.86      0.86       526
weighted avg       0.88      0.88      0.88       526



In [36]:
# Per-class precision / recall / F1 of the decision-tree predictions.
classification_rep = classification_report(
    y_true=predictionsDF['target'],
    y_pred=predictionsDF['DecisionTreeClassifier_prediction'],
)

print("Decision Tree Classification Report: \n ", classification_rep)

Decision Tree Classification Report: 
                precision    recall  f1-score   support

         0.0       0.81      0.81      0.81       348
         1.0       0.63      0.63      0.63       178

    accuracy                           0.75       526
   macro avg       0.72      0.72      0.72       526
weighted avg       0.75      0.75      0.75       526



In [37]:
# Per-class precision / recall / F1 of the k-nearest-neighbours predictions.
classification_rep = classification_report(
    y_true=predictionsDF['target'],
    y_pred=predictionsDF['KNeighborsClassifier_prediction'],
)

print("KNN Classification Report: \n ", classification_rep)

KNN Classification Report: 
                precision    recall  f1-score   support

         0.0       0.86      0.86      0.86       348
         1.0       0.73      0.72      0.72       178

    accuracy                           0.81       526
   macro avg       0.79      0.79      0.79       526
weighted avg       0.81      0.81      0.81       526



In [38]:
# Per-class precision / recall / F1 of the Bernoulli naive Bayes predictions.
classification_rep = classification_report(
    y_true=predictionsDF['target'],
    y_pred=predictionsDF['BernoulliNB_prediction'],
)

print("Bernoulli NB Classification Report: \n ", classification_rep)

Bernoulli NB Classification Report: 
                precision    recall  f1-score   support

         0.0       0.77      0.64      0.70       348
         1.0       0.47      0.63      0.54       178

    accuracy                           0.64       526
   macro avg       0.62      0.64      0.62       526
weighted avg       0.67      0.64      0.65       526



In [39]:
# Per-class precision / recall / F1 of the random-forest predictions.
classification_rep = classification_report(
    y_true=predictionsDF['target'],
    y_pred=predictionsDF['RandomForestClassifier_prediction'],
)

print("Random Forest Classification Report: \n ", classification_rep)

Random Forest Classification Report: 
                precision    recall  f1-score   support

         0.0       0.85      0.97      0.91       348
         1.0       0.93      0.65      0.77       178

    accuracy                           0.87       526
   macro avg       0.89      0.81      0.84       526
weighted avg       0.87      0.87      0.86       526



In [40]:
# Attach each row's DOI by matching on the shared row index (left join keeps
# every prediction row). The result is displayed only, not stored.
predictionsDF.join(completeDF[['DOI']], how='left')


Unnamed: 0,LogisticRegression_prediction,LogisticRegression_predProba,DecisionTreeClassifier_prediction,DecisionTreeClassifier_predProba,KNeighborsClassifier_prediction,KNeighborsClassifier_predProba,BernoulliNB_prediction,BernoulliNB_predProba,RandomForestClassifier_prediction,RandomForestClassifier_predProba,target,DOI
221,1.0,0.990222,0.0,0.000000,1.0,0.600000,0.0,9.314396e-06,1.0,0.53,1.0,10.7202/032477ar
318,1.0,0.951752,1.0,1.000000,1.0,0.933333,1.0,1.000000e+00,1.0,0.68,1.0,10.1073/pnas.1604903113
926,0.0,0.027619,0.0,0.134327,0.0,0.133333,0.0,2.184222e-25,0.0,0.07,0.0,10.1016/bs.vh.2022.10.007
2489,1.0,0.956433,1.0,0.931151,1.0,0.600000,0.0,3.418075e-18,0.0,0.40,1.0,10.1016/j.scitotenv.2022.153829
1420,0.0,0.000758,0.0,0.134327,0.0,0.066667,0.0,3.759976e-21,0.0,0.11,0.0,10.3389/fimmu.2020.559746
...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.0,0.383755,0.0,0.421443,1.0,0.533333,1.0,1.000000e+00,0.0,0.41,1.0,10.1038/nature09077
2581,0.0,0.094495,0.0,0.134327,0.0,0.266667,0.0,5.241724e-76,0.0,0.27,0.0,10.1371/journal.pone.0232682
620,1.0,0.861018,1.0,1.000000,1.0,0.800000,0.0,2.203860e-02,1.0,0.58,1.0,10.1017/s0033822200000710
1151,0.0,0.114453,0.0,0.015799,0.0,0.066667,0.0,8.450976e-28,0.0,0.13,0.0,10.15586/aei.v51i2.796


### Predictions for the complete X (only to create the Parquet file)

In [41]:
# For each fitted model, record its name, hard predictions and class
# probabilities on the full feature matrix X (X_train was split from X, so
# this includes rows the models were fitted on).
testResults = []
for position, fittedModel in enumerate(resultsDict['model']):
    modelLabel = resultsDict['model_name'][position]
    print(modelLabel)
    testResults.append(
        {
            'model_name': modelLabel,
            'prediction': fittedModel.predict(X),
            'predict_proba': fittedModel.predict_proba(X),
        }
    )

LogisticRegression
DecisionTreeClassifier
KNeighborsClassifier
BernoulliNB
RandomForestClassifier


In [42]:
# Shape of the label vector y.
y.shape

(2629,)

In [43]:
# Row count of X; should equal the length of y.
len(X)

2629

In [44]:
# Gather, per model, its hard predictions and the second predict_proba column
# (index 1) under "<model>_prediction" / "<model>_predProba".
# Assumes the stored predictions are positional NumPy arrays, as scikit-learn
# estimators return — TODO confirm for the pipelines built in ar.
predictionColumns = {}
for result in testResults:
    name = result['model_name']
    print(name)
    predictionColumns[name + '_prediction'] = result['prediction']
    predictionColumns[name + '_predProba'] = result['predict_proba'][:, 1]

# Build the frame once, labelled with y's index, so the true target can be
# joined on index rather than by position.
predictionsDF = pd.DataFrame(predictionColumns, index=y.index)
print(predictionsDF.shape[0])

predictionsDF = predictionsDF.join(y)

LogisticRegression
DecisionTreeClassifier
KNeighborsClassifier
BernoulliNB
RandomForestClassifier
2629


In [45]:
# Preview the first three rows of predictionsDF.
predictionsDF.head(3)

Unnamed: 0,LogisticRegression_prediction,LogisticRegression_predProba,DecisionTreeClassifier_prediction,DecisionTreeClassifier_predProba,KNeighborsClassifier_prediction,KNeighborsClassifier_predProba,BernoulliNB_prediction,BernoulliNB_predProba,RandomForestClassifier_prediction,RandomForestClassifier_predProba,target
0,1.0,0.994494,1.0,0.934448,1.0,0.733333,1.0,1.0,1.0,0.89,1.0
1,1.0,0.55127,0.0,0.320705,0.0,0.4,1.0,0.998807,1.0,0.72,1.0
2,1.0,0.998998,1.0,0.931151,1.0,1.0,1.0,0.999786,1.0,0.87,1.0


In [46]:
# Attach each row's DOI by matching on the shared row index (left join keeps
# every prediction row). The result is displayed only, not stored.
predictionsDF.join(completeDF[['DOI']], how='left')


Unnamed: 0,LogisticRegression_prediction,LogisticRegression_predProba,DecisionTreeClassifier_prediction,DecisionTreeClassifier_predProba,KNeighborsClassifier_prediction,KNeighborsClassifier_predProba,BernoulliNB_prediction,BernoulliNB_predProba,RandomForestClassifier_prediction,RandomForestClassifier_predProba,target,DOI
0,1.0,0.994494,1.0,0.934448,1.0,0.733333,1.0,1.000000e+00,1.0,0.89,1.0,10.1016/j.quascirev.2014.04.014
1,1.0,0.551270,0.0,0.320705,0.0,0.400000,1.0,9.988071e-01,1.0,0.72,1.0,10.1002/gea.10020
2,1.0,0.998998,1.0,0.931151,1.0,1.000000,1.0,9.997864e-01,1.0,0.87,1.0,10.1139/e80-122
3,1.0,0.981018,1.0,0.934448,1.0,0.866667,1.0,1.000000e+00,1.0,0.92,1.0,10.1016/j.quaint.2015.05.009
4,1.0,0.990198,1.0,0.931151,1.0,0.866667,1.0,9.980496e-01,1.0,0.94,1.0,10.1191/0959683604hl761rp
...,...,...,...,...,...,...,...,...,...,...,...,...
2624,0.0,0.064652,0.0,0.134327,0.0,0.200000,0.0,6.938330e-79,0.0,0.13,0.0,10.3390/genes12030432
2625,0.0,0.094495,0.0,0.134327,0.0,0.266667,0.0,5.241724e-76,0.0,0.27,0.0,10.1371/journal.pone.0211990
2626,0.0,0.089111,0.0,0.134327,0.0,0.266667,0.0,4.934267e-76,0.0,0.16,0.0,10.1371/journal.pbio.3000764
2627,0.0,0.158769,0.0,0.134327,0.0,0.133333,0.0,1.523840e-75,0.0,0.00,0.0,10.1007/s11356-022-19414-5


In [48]:
#predictionsDF.to_parquet('data/parquet/neotomaPredictions.parquet', index=False)

In [49]:
import joblib

# Load the saved training results through a path relative to the project root
# (the working directory the 'data/parquet/...' reads in this notebook also
# rely on) rather than a machine-specific absolute path.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load result files produced by a trusted run.
my_dict = joblib.load('results/Iteration_2023-10-10_11-17-37.joblib')

In [50]:
# Display the object loaded from the joblib file.
my_dict

{'model_name': ['LogisticRegression',
  'DecisionTreeClassifier',
  'KNeighborsClassifier',
  'BernoulliNB',
  'RandomForestClassifier'],
 'model': [Pipeline(steps=[('columntransformer',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('str_preprocessor',
                                                    CountVectorizer(max_features=100,
                                                                    stop_words=['i',
                                                                                'me',
                                                                                'my',
                                                                                'myself',
                                                                                'we',
                                                                                'our',
                                                                         

In [51]:
# Keys stored in the loaded results dict.
my_dict.keys()

dict_keys(['model_name', 'model', 'report', 'date'])

In [60]:
# Show the first entry of my_dict['report'] as a DataFrame.
pd.DataFrame(my_dict['report'][0])

Unnamed: 0,classifier,Fit Time,train_recall,train_f1,train_precision,train_accuracy,test_recall,test_f1,test_precision,test_accuracy
0,LogisticRegression,0 days 00:00:31.085787,0.882713,0.929025,0.980495,0.955302,0.740298,0.778817,0.821906,0.860675
1,DecisionTreeClassifier,0 days 00:00:43.942452,0.922179,0.916624,0.911952,0.944483,0.63999,0.624444,0.612665,0.745627
2,KNeighborsClassifier,0 days 00:00:24.852635,0.745343,0.740661,0.736137,0.827033,0.700154,0.687811,0.67624,0.789345
3,BernoulliNB,0 days 00:00:11.834530,0.729207,0.579243,0.480476,0.648956,0.710082,0.568863,0.474844,0.643826
4,RandomForestClassifier,0 days 00:01:22.419802,1.0,0.999821,0.999642,0.999881,0.621141,0.706785,0.822122,0.829756
