In [13]:
import csv
import re
import re
import spacy

def might_be_title(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    
    # Heuristic rules
    num_tokens = len(doc)
    # num_proper_nouns = sum(token.pos_ == "PROPN" for token in doc)

    # num_nouns = sum(token.pos_ == "NOUN" for token in doc)
    # num_verbs = sum(token.pos_ == "VERB" for token in doc)
    # Check if the text contains a minimum number of proper nouns or nouns, and fewer verbs
    if (num_tokens >= 4):
        return True
    return False


def parse_references_regex(ref_list):
    pattern = r'\(\d{4}\)'

    titles = re.findall(pattern, ref_list)
    refs = [re.findall(r'(.*)\(\d{4}\)', i)[0]  for i in ref_list.split(",") if (len(re.findall(pattern, i))>=1)]
    return refs

input_csv = 'input.csv'
output_csv = 'output.csv'

# Read input CSV
with open(input_csv, 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    paper_titles = []
    reference_lists = []

    for row in reader:
        paper_title, ref_list_str = row
        paper_titles.append(paper_title)
        reference_lists.append(parse_references_regex(ref_list_str))
new_reference_lists = []
for paper_title, ref_title in zip(paper_titles, reference_lists):
    print(paper_title)
    print(len(ref_title))
    new_ref_titles = []
    for ref in ref_title:
        if might_be_title(ref):
            new_ref_titles.append(ref)
    print(f'new{len(new_ref_titles)}')
    new_reference_lists.append(new_ref_titles)
# Write output CSV
with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)

    # Write paper titles as the first row
    writer.writerow(paper_titles)

    # Write reference titles row by row
    max_refs = max(len(ref_list) for ref_list in reference_lists)
    for row_idx in range(max_refs):
        row = []
        for ref_list in reference_lists:
            if row_idx < len(ref_list):
                row.append(ref_list[row_idx])
            else:
                row.append("")  # Empty cell if no more references in the list
        writer.writerow(row)


Sparsity and manifold regularized convolutional auto-encoders-based feature learning for fault detection of multivariate processes
54
new51
Convolutional Long Short-Term Memory Autoencoder-Based Feature Learning for Fault Detection in Industrial Processes
74
new67
Manifold regularized stacked autoencoders-based feature learning for fault detection in industrial processes
74
new70


In [6]:
import pandas as pd

# Read the CSV data
data = """
paper_1,paper_2,paper_3
Title A,Title B,Title A
Title B,Title C,Title C
Title C,Title A,Title B
"""

# Create a DataFrame from the CSV data
df = pd.read_csv('output.csv')

# Find the intersection of the paper titles in all three columns
common_papers = set(df.iloc[:,0])
for col in range(1, df.shape[1]):  # Iterate through the remaining columns
    common_papers.intersection_update(df.iloc[:, col])
# Print the common papers
print("Papers found in all three columns:")
for paper in common_papers:
    print(paper)


Papers found in all three columns:
 Imagenet classification with deep convolutional neural networks 
 A comparison study of basic data-driven fault diagnosis and process monitoring methods on the benchmark Tennessee Eastman process 
 Deep convolutional neural network model based chemical process fault diagnosis 
 
 A convolutional neural network for fault classification and diagnosis in semiconductor manufacturing processes 
 Process monitoring through manifold regularization-based GMM with global/local information 
 Kernel density estimation for an anomaly based intrusion detection system 
 Monitoring and diagnosing of mean shifts in multivariate manufacturing processes using two-level selective ensemble of learning vector quantization neural networks 
 A plant-wide industrial process control problem 


In [10]:


text = "The Catcher in the Rye"
print(might_be_title(text))  # True

text = "dog and cat playing together"
print(might_be_title(text))  # False


True
True


In [1]:
import spacy
from spacy.tokens import Doc
from spacy.language import Language

@Language.factory("sentence_tokenizer")
def create_sentence_tokenizer(nlp, name):
    return SentenceTokenizer(nlp)

class SentenceTokenizer:
    def __init__(self, nlp):
        self.vocab = nlp.vocab
        self.sentencizer = nlp.create_pipe("sentencizer")

    def __call__(self, text):
        doc = nlp.make_doc(text)
        doc = self.sentencizer(doc)
        sentence_tokens = [sent.text for sent in doc.sents]
        words = []
        spaces = []

        for sent in sentence_tokens:
            words.extend(sent.split())
            spaces.extend([True] * (len(sent.split()) - 1) + [False])

        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank("en")
nlp.remove_pipe("tokenizer")  # Remove the default tokenizer
nlp.add_pipe("sentence_tokenizer")

text = "This is a sentence. And here is another one. Let's tokenize them into sentences."

doc = nlp(text)
for token in doc:
    print(token)


ValueError: [E001] No component 'tokenizer' found in pipeline. Available names: []

In [45]:
import re
import spacy
from spacy.matcher import Matcher
from spacy.language import Language

@Language.component("custom_year_pattern")
def custom_year_pattern(doc):
    pattern = r'\(\d{4}\)'
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        doc.char_span(start, end, label="YEAR")
    return doc

def parse_references_spacy(ref_list):
    nlp = spacy.load("en_core_web_sm")

    nlp.add_pipe("custom_year_pattern", before="parser")

    matcher = Matcher(nlp.vocab)
    pattern = [{"IS_PUNCT": True, "OP": "?"}, {"POS": "PROPN", "OP": "*"}, {"IS_PUNCT": True, "OP": "?"}, {"IS_DIGIT": True, "OP": "?"}, {"IS_PUNCT": True, "OP": "?"}]
    matcher.add("TITLE", [pattern])

    doc = nlp(ref_list)
    matches = matcher(doc)
    titles = []

    for match_id, start, end in matches:
        span = doc[start:end]
        if span.text.startswith("(") :
            continue
        titles.append(span.text.strip())

    return titles

ref_list = "Abdeljaber, O., Avci, O., Kiranyaz, M.S., Boashash, B., Sodano, H., Inman, D.J., 1-D CNNs for structural damage detection: Verification on a structure health monitoring benchmark data (2018) Neurocomputing, 275, pp. 1308-1317; Arunthavanathan, R., Khan, F., Ahmed, S., Imtiaz, S., Rusli, R., Fault detection and diagnosis in process system using artificial intelligence-based cognitive technique (2020) Computers & Chemical Engineering, 134;"
titles = parse_references_spacy(ref_list)

for title in titles:
    print(title)


Abdeljaber
Abdeljaber,
,
, O.
, O.,
O.
O.,
,
, Avci
, Avci,
Avci
Avci,
,
, O.
, O.,
O.
O.,
,
, Kiranyaz
, Kiranyaz,
Kiranyaz
Kiranyaz,
,
, M.S.
, M.S.,
M.S.
M.S.,
,
, Boashash
, Boashash,
Boashash
Boashash,
,
, B.
, B.,
B.
B.,
,
, Sodano
, Sodano,
Sodano
Sodano,
,
, H.
, H.,
H.
H.,
,
, Inman
, Inman,
Inman
Inman,
,
, D.J.
, D.J.,
D.J.
D.J.,
,
, D.J., 1
, D.J., 1-
D.J., 1
D.J., 1-
, 1
, 1-
1
1-
-
:
2018
2018)
)
) Neurocomputing
) Neurocomputing,
Neurocomputing
Neurocomputing,
,
) Neurocomputing, 275
) Neurocomputing, 275,
Neurocomputing, 275
Neurocomputing, 275,
, 275
, 275,
275
275,
,
, pp
, pp.
pp
pp.
.
, pp. 1308
, pp. 1308-
pp. 1308
pp. 1308-
. 1308
. 1308-
1308
1308-
-
-1317
-1317;
1317
1317;
;
; Arunthavanathan
; Arunthavanathan,
Arunthavanathan
Arunthavanathan,
,
, R.
, R.,
R.
R.,
,
, Khan
, Khan,
Khan
Khan,
,
, F.
, F.,
F.
F.,
,
, Ahmed
, Ahmed,
Ahmed
Ahmed,
,
, S.
, S.,
S.
S.,
,
, Imtiaz
, Imtiaz,
Imtiaz
Imtiaz,
,
, S.
, S.,
S.
S.,
,
, Rusli
, Rusli,
Rusli
Rusli,
,
, R.
, R.,
R

In [20]:
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("span_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)

doc = nlp("Apple is opening its first big office in   San Francisco.")
print([(span.text, span.label_) for span in doc.spans["ruler"]])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [11]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Abdeljaber, O., Avci, O., Kiranyaz, M.S., Boashash, B., Sodano, H., Inman, D.J., 1-D CNNs for structural damage detection: Verification on a structure health monitoring benchmark data (2018) Neurocomputing, 275, pp. 1308-1317; Arunthavanathan, R., Khan, F., Ahmed, S., Imtiaz, S., Rusli, R., Fault detection and diagnosis in process system using artificial intelligence-based cognitive technique (2020) Computers & Chemical Engineering, 134;")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)


Abdeljaber Abdeljaber PROPN NNP ROOT Xxxxx True False
, , PUNCT , punct , False False
O. O. PROPN NNP appos X. False False
, , PUNCT , punct , False False
Avci Avci PROPN NNP nmod Xxxx True False
, , PUNCT , punct , False False
O. O. PROPN NNP appos X. False False
, , PUNCT , punct , False False
Kiranyaz Kiranyaz PROPN NNP conj Xxxxx True False
, , PUNCT , punct , False False
M.S. M.S. PROPN NNP conj X.X. False False
, , PUNCT , punct , False False
Boashash Boashash PROPN NNP conj Xxxxx True False
, , PUNCT , punct , False False
B. B. PROPN NNP conj X. False False
, , PUNCT , punct , False False
Sodano Sodano PROPN NNP conj Xxxxx True False
, , PUNCT , punct , False False
H. H. PROPN NNP appos X. False False
, , PUNCT , punct , False False
Inman Inman PROPN NNP nmod Xxxxx True False
, , PUNCT , punct , False False
D.J. D.J. PROPN NNP appos X.X. False False
, , PUNCT , punct , False False
1 1 NUM CD nummod d False False
- - PUNCT HYPH punct - False False
D d ADJ JJ compound X True False

In [56]:
from spacy.attrs import IS_TITLE
nlp = spacy.load("en_core_web_sm")
doc = nlp("1-D CNNs for structural damage detection: Verification on a structure health monitoring benchmark data (2018) Neurocomputing, 275, pp. 1308-1317; Arunthavanathan, R., Khan, F., Ahmed, S., Imtiaz, S., Rusli, R., Fault detection and diagnosis in process system using artificial intelligence-based cognitive technique")
doc[:].check_flag(IS_TITLE)

AttributeError: 'spacy.tokens.span.Span' object has no attribute 'check_flag'

In [1]:
import os
import ssl
import certifi
import spacy
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()

if not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None):
    ssl._create_default_https_context = ssl._create_unverified_context


In [6]:
import spacy
spacy.download('en_core_web_trf')

AttributeError: module 'spacy' has no attribute 'download'