In [7]:
from pymongo import MongoClient
import pandas as pd
import os
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# English spaCy pipeline; preprocess_text uses it for tokenization,
# lemmatization and stop-word / punctuation flags.
nlp = spacy.load("en_core_web_sm")

# The connection string embeds the database password, so it is read from the
# environment instead of being committed with the notebook. Set it before
# starting the kernel, e.g.
#   MONGO_URI='mongodb://<user>:<password>@<host>:27017/?authSource=admin'
# NOTE(review): the credential that used to be hardcoded here should be rotated.
MONGO_URI = os.environ.get('MONGO_URI')
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI environment variable is not set; export the MongoDB "
        "connection string before running this notebook."
    )

# Connect to MongoDB
client = MongoClient(MONGO_URI)

# Database and collection that the extraction loop iterates over.
db = client['datasci']
collection = db['papers']

def _dig(mapping, *keys):
    """Follow `keys` through nested dicts; return None if any level is missing or not a dict."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _join_author_keywords(doc):
    """Return the document's author keywords as one space-separated string, or None.

    'author-keyword' may hold a list of entries or a single entry; both are
    normalized to the same string form so the column has one consistent type.
    """
    raw = _dig(doc, 'authkeywords', 'author-keyword')
    if not raw:
        return None
    entries = raw if isinstance(raw, list) else [raw]
    words = [str(entry['$']) for entry in entries if isinstance(entry, dict) and entry.get('$')]
    return " ".join(words) if words else None


def _first_subject_abbrev(doc):
    """Return '@abbrev' of the first subject area, or None when it is unavailable."""
    areas = _dig(doc, 'subject-areas', 'subject-area')
    if isinstance(areas, dict):
        # A single subject area stored as a bare dict instead of a one-element list.
        areas = [areas]
    if not isinstance(areas, list) or not areas:
        return None
    first = areas[0]
    return first.get('@abbrev') if isinstance(first, dict) else None


def _extract_record(doc):
    """Flatten one paper document into the four fields kept for modelling.

    Missing or malformed sub-documents yield None for the affected field
    instead of raising, so a single odd record cannot abort the whole export.
    """
    # Prefer the bibrecord abstract; fall back to the coredata description
    # whenever the former is absent or empty.
    abstract = (
        _dig(doc, 'item', 'bibrecord', 'head', 'abstracts')
        or _dig(doc, 'coredata', 'dc:description')
    )
    return {
        'title': _dig(doc, 'coredata', 'dc:title'),
        'abstract': abstract,
        'authkeywords': _join_author_keywords(doc),
        'category': _first_subject_abbrev(doc),
    }


# One dict per document; turned into a DataFrame after the loop.
data = []

batch_size = 1000      # server-side cursor batch size, also the progress-report interval
batch_count = 0
document_count = 0

# The context manager closes the server-side cursor even if iteration fails.
with collection.find().batch_size(batch_size) as cursor:
    for doc in cursor:
        data.append(_extract_record(doc))

        document_count += 1

        if document_count % batch_size == 0:
            batch_count += 1
            print(f"Batch {batch_count} processed, {document_count} documents done.")

# Build the frame once from the collected records (one row per document).
df = pd.DataFrame(data)

# os.getenv('USERPROFILE') is None outside Windows, which makes os.path.join
# raise TypeError. expanduser('~') resolves the home directory on every
# platform (on Windows it is derived from USERPROFILE when that is set).
output_dir = os.path.join(os.path.expanduser('~'), 'Documents')
os.makedirs(output_dir, exist_ok=True)  # the folder may not exist on a fresh machine
filename = os.path.join(output_dir, 'data.csv')
df.to_csv(filename, index=False)


Batch 1 processed, 1000 documents done.
Batch 2 processed, 2000 documents done.
Batch 3 processed, 3000 documents done.
Batch 4 processed, 4000 documents done.
Batch 5 processed, 5000 documents done.
Batch 6 processed, 6000 documents done.
Batch 7 processed, 7000 documents done.
Batch 8 processed, 8000 documents done.
Batch 9 processed, 9000 documents done.
Batch 10 processed, 10000 documents done.
Batch 11 processed, 11000 documents done.
Batch 12 processed, 12000 documents done.
Batch 13 processed, 13000 documents done.
Batch 14 processed, 14000 documents done.
Batch 15 processed, 15000 documents done.
Batch 16 processed, 16000 documents done.
Batch 17 processed, 17000 documents done.
Batch 18 processed, 18000 documents done.
Batch 19 processed, 19000 documents done.
Batch 20 processed, 20000 documents done.
Batch 21 processed, 21000 documents done.
Batch 22 processed, 22000 documents done.
Batch 23 processed, 23000 documents done.
Batch 24 processed, 24000 documents done.
Batch 25 p

In [10]:
# Sanity check: dimensions of the extracted frame as a (rows, columns) tuple.
df.shape

(39190, 4)

In [13]:
# Preprocess text function

def preprocess_text(text):
    """Normalize a raw string into space-separated, lower-cased lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words,
    punctuation and whitespace-only tokens are discarded; the lemma of every
    remaining token is lower-cased.

    Args:
        text: The string to normalize.

    Returns:
        A single string with the kept lemmas joined by one space ("" when
        nothing is kept).
    """
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        # Runs of spaces / newlines become their own tokens; they are neither
        # stop words nor punctuation, so they must be filtered explicitly or
        # they end up as stray blanks in the joined output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(tokens)

# Combine and preprocess text fields
# Missing values become empty strings; keyword cells that are still Python
# lists are flattened to one string so the three columns concatenate cleanly.
keywords_text = df['authkeywords'].fillna('').apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
texts = df['title'].fillna('') + " " + df['abstract'].fillna('') + " " + keywords_text
texts = texts.apply(preprocess_text)

df['texts'] = texts

# os.getenv('USERPROFILE') is None outside Windows, which makes os.path.join
# raise TypeError. expanduser('~') resolves the home directory on every
# platform (on Windows it is derived from USERPROFILE when that is set).
output_dir = os.path.join(os.path.expanduser('~'), 'Documents')
os.makedirs(output_dir, exist_ok=True)  # the folder may not exist on a fresh machine
filename = os.path.join(output_dir, 'data_preprocessed_1.csv')
df.to_csv(filename, index=False)

In [9]:
# Preview the first rows. A bare last expression lets Jupyter render the frame
# as a rich table instead of the line-wrapped plain text that print() produces.
df.head()

                                               title  \
0  Public health and international epidemiology f...   
1  Flexible Printed Active Antenna for Digital Te...   
2  Parametric study of hydrogen production via so...   
3  Superhydrophobic coating from fluoroalkylsilan...   
4  Electrochemical impedance-based DNA sensor usi...   

                                            abstract  \
0                                               None   
1  © 2018 The Institute of Electronics, Informati...   
2  © 2018 Elsevier LtdComputational fluid dynamic...   
3  © 2018 Elsevier B.V. A superhydrophobic/supero...   
4  © 2018 Elsevier B.V. A label-free electrochemi...   

                                        authkeywords category  
0                                               None     MEDI  
1                                               None     ENGI  
2  Circulating fluidized bed Computational fluid ...     CHEM  
3  Encapsulation Fluoroalkylsilane Natural rubber...     CHEM  
4  acp