In [None]:
# Basic NLP Practice with the Bible Text

# The model is trained with the Bible text(Genesis and Psalms)
# This practice was conducted to raise the understanding of NLP processing steps,and the score.

In [28]:
import pandas as pd
import nltk
import re
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [29]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

# Download the necessary tokenizer data
nltk.download('punkt_tab')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [30]:
import sys
!{sys.executable} -m spacy download en_core_web_sm

import spacy
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 16.9 MB/s eta 0:00:01
     ------------------------- -------------- 8.1/12.8 MB 24.1 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 24.2 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [112]:
# 1. Input data
import pandas as pd
import nltk
from nltk.corpus import gutenberg

# Load the Bible text form the Gutenberg corpus
bible = gutenberg.sents('bible-kjv.txt')

# Genesis and Psalms extraction
# No chapter or verse but partial extraction for demonstration

all_sents = [" ".join(s) for s in bible]
genesis_data = all_sents[0:500] 
psalms_data = all_sents[15800:16300]

df = pd.DataFrame({
    'text' : genesis_data + psalms_data,
    'label' : [0] * len(genesis_data) + [1] * len(psalms_data)
})

print(df.head())
print(f"데이터 타입 확인: {type(df['text'].iloc[0])}")

                                                text  label
0                           [ The King James Bible ]      0
1          The Old Testament of the King James Bible      0
2           The First Book of Moses : Called Genesis      0
3  1 : 1 In the beginning God created the heaven ...      0
4  1 : 2 And the earth was without form , and voi...      0
데이터 타입 확인: <class 'str'>


In [113]:
# 2. Preprocessing
import spacy
nlp = spacy.load("en_core_web_sm")

# def clean_text(text):
#     # if text is not a string, return empty string
#     if not isinstance(text, str):
#         return ""
    
#     doc = nlp(text.lower())
#     # Remove stopwords, punctuation, and non-alphabetic tokens
#     return " ".join([token.lemma_ for token in doc if token.is_alpha and not token.is_stop])

# df['cleaned'] = df['text'].apply(clean_text)

In [None]:
# 2-1. Preprocessing based on POS tagging
def clean_text(text):
    if not isinstance(text, str): return ""
    doc = nlp(text.lower())
    # Extract all except adpositional, determiners
    allowed_postags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN']
    return " ".join([token.lemma_ for token in doc if token.pos_ in allowed_postags])

df['cleaned'] = df['text'].apply(clean_text)

In [115]:
# 3. Parsing
doc = nlp(df['cleaned'][1])
for token in doc:
    print(f"{token.text:{10}} -> {token.dep_:{10}} -> {token.head.text}")

old        -> amod       -> bible
testament  -> compound   -> king
king       -> compound   -> bible
james      -> compound   -> bible
bible      -> ROOT       -> bible


In [126]:
# 4. Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=2000, 
    ngram_range=(1, 3), # ngram is a feature that considers combinations of words
                        # it can capture context better than single words(unigram versus bigram or higher)
    min_df=2,           # ignore terms that appear in less than 1 documents
    max_df=0.9)         # ignore terms that appear in more than 90% of documents

X = vectorizer.fit_transform(df['cleaned'])
y = df['label']

In [130]:
# 5. Model Training (LinearSVC)
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify = y)

model = LinearSVC(
    C=1,          # Regularization parameter. The higher the value, the stricter the model.
    random_state=42
    )

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Genesis', 'Psalms']))

# Distribution/Test

def predict_book(new_text):
    cleaned = clean_text(new_text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)
    return "Genesis" if pred[0] == 0 else "Psalms"

print(predict_book("chaos was upon the face of the deep"))

              precision    recall  f1-score   support

     Genesis       0.94      0.90      0.92       125
      Psalms       0.90      0.94      0.92       125

    accuracy                           0.92       250
   macro avg       0.92      0.92      0.92       250
weighted avg       0.92      0.92      0.92       250

Genesis
