In [14]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

df = pd.read_excel('savedrecs.xls')
print(df)

    Publication Type                Authors Book Authors  \
0                  J              Ökten, AI          NaN   
1                  J            Chalmers, S          NaN   
2                  J             Karakis, I          NaN   
3                  C    Abuduo, Z; Chen, LL          NaN   
4                  J  Syamili, C; Rekha, RV          NaN   
..               ...                    ...          ...   
995                J               LONG, JB          NaN   
996                J            [Anonymous]          NaN   
997                S                    NaN          NaN   
998                J             Nicolle, S          NaN   
999                J              Hambly, G          NaN   

                                          Book Editors Book Group Authors  \
0                                                  NaN                NaN   
1                                                  NaN                NaN   
2                                               

In [17]:
columns_to_select = ["Authors", "Book Authors", "Article Title", "Document Type", 
                     "Cited Reference Count","Publisher","Publisher City",
                     "Publication Year","Start Page","End Page","Abstract"]
filtered_df = df[columns_to_select]
filtered_df = filtered_df.dropna(subset=['Abstract'])
X = filtered_df['Abstract']
y = filtered_df['Document Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Convert text data to vectors
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [19]:
# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vectors, y_train)

In [20]:
# Evaluate the classifier
y_pred = clf.predict(X_test_vectors)
print(classification_report(y_test, y_pred))  # target_names can be specified if you have them

                            precision    recall  f1-score   support

                   Article       0.78      1.00      0.87        76
     Article; Book Chapter       0.00      0.00      0.00         3
     Article; Early Access       0.00      0.00      0.00        10
Article; Proceedings Paper       0.00      0.00      0.00         1
         Proceedings Paper       0.00      0.00      0.00         6
                    Review       0.00      0.00      0.00         2

                  accuracy                           0.78        98
                 macro avg       0.13      0.17      0.15        98
              weighted avg       0.60      0.78      0.68        98



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: This is the ratio of true positive predictions to the total predicted positives. High precision relates to a low false positive rate. For the class "Article", you have a precision of 0.78, meaning that when the model predicts an "Article", it is correct about 78% of the time.

Recall: This is the ratio of true positives to the actual total positives. High recall means that the classifier is returning most of the relevant results. For "Article", the recall is 1.00, which indicates that the model identified all "Article" instances correctly.

F1-Score: This is the harmonic mean of precision and recall. An F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. For "Article", the F1 score is quite high at 0.87, which is good.

In [21]:
# Install TextBlob and its necessary corpora
!pip install textblob
!python -m textblob.download_corpora


Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting nltk>=3.8 (from textblob)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: nltk, textblob
  Attempting uninstall: nltk
    Found existing installation: nltk 3.7
    Uninstalling nltk-3.7:
      Successfully uninstalled nltk-3.7
Successfully installed nltk-3.8.1 textblob-0.18.0.post0
[nltk_data] Downloading package brown to /Users/liam/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /Users/liam/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/liam/nltk_data...
[nltk_data] Downloa

In [23]:
import nltk
from nltk.corpus import wordnet

# Define the words to analyze
words = ["myth", "mythology", "Myth", "Mythology"]

# Initialize the lemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()

for word in words:
    # Get the synsets (sets of synonyms) for the word
    synsets = wordnet.synsets(word)

    # Print the word and its synsets
    print(f"\nWord: {word}")
    for i, synset in enumerate(synsets):
        # Get the definition and example sentences for the synset
        definition = synset.definition()
        examples = synset.examples()

        print(f"Synset {i+1}: {synset.name()}")
        print(f"Definition: {definition}")
        if examples:
            print("Examples:")
            for example in examples:
                print(f"- {example}")

        # Get the lemma (root form) of the word
        lemma = lemmatizer.lemmatize(word, pos=synset.pos())
        print(f"Lemma: {lemma}")



Word: myth
Synset 1: myth.n.01
Definition: a traditional story accepted as history; serves to explain the world view of a people
Lemma: myth

Word: mythology
Synset 1: mythology.n.01
Definition: myths collectively; the body of stories associated with a culture or institution or person
Lemma: mythology
Synset 2: mythology.n.02
Definition: the study of myths
Lemma: mythology

Word: Myth
Synset 1: myth.n.01
Definition: a traditional story accepted as history; serves to explain the world view of a people
Lemma: Myth

Word: Mythology
Synset 1: mythology.n.01
Definition: myths collectively; the body of stories associated with a culture or institution or person
Lemma: Mythology
Synset 2: mythology.n.02
Definition: the study of myths
Lemma: Mythology
