In [1]:
# Basic text analytics.
# Sila 18 November 2022
#
# From "Blueprints for Text Analytics" by Albrecht, Ramachandran & Winkler
# and
# Using Gensim for text similarity scores

In [2]:
import re

In [3]:
# Sample message containing a timestamp, @mentions, a hashtag, a text emoticon
# and unicode emojis; it is the input for the tokenizer cells that follow.
text = """
2019-08-10 23:32: @pete/@louis - I don't have a well-designed 
solution for today's problem. The code of module AC68 should be -1. 
Have to think a bit... #goodnight ;-) 😩😬"""

In [4]:
# Naive tokenizer: keep every run of at least two word characters.
tokens = re.compile(r'\w\w+').findall(text)
print('|'.join(tokens))

2019|08|10|23|32|pete|louis|don|have|well|designed|solution|for|today|problem|The|code|of|module|AC68|should|be|Have|to|think|bit|goodnight


In [5]:
# Token pattern; the alternatives are tried left to right at each position.
# re.VERBOSE lets the pattern carry its own whitespace and comments.
RE_TOKEN = re.compile(r"""
               ( [#]?[@\w'’\.\-\:]*\w     # words, hash tags and email adresses
               | [:;<]\-?[\)\(3]          # coarse pattern for basic text emojis
               | [\U0001F100-\U0001FFFF]  # coarse code range for unicode emojis
               )
               """, flags=re.VERBOSE)


def tokenize(text):
    """Split ``text`` into tokens using ``RE_TOKEN``.

    Returns a list of strings in order of appearance. Characters that match
    none of the pattern's alternatives are not part of any token.
    """
    return [match.group(1) for match in RE_TOKEN.finditer(text)]

# Apply the regex tokenizer to the sample text and show the tokens on one line.
tokens = tokenize(text)
print('|'.join(tokens))

2019-08-10|23:32|@pete|@louis|I|don't|have|a|well-designed|solution|for|today's|problem|The|code|of|module|AC68|should|be|-1|Have|to|think|a|bit|#goodnight|;-)|😩|😬


In [6]:
import nltk

# word_tokenize relies on the 'punkt' tokenizer data. Download it only when it
# is not installed yet, so re-running this cell does not contact the NLTK
# server every time.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

tokens = nltk.tokenize.word_tokenize(text)
print(*tokens, sep='|')

2019-08-10|23:32|:|@|pete/|@|louis|-|I|do|n't|have|a|well-designed|solution|for|today|'s|problem|.|The|code|of|module|AC68|should|be|-1|.|Have|to|think|a|bit|...|#|goodnight|;|-|)|😩😬


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mads\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Tokenization with spaCy

In [9]:
import spacy
# Load the 'en_core_web_md' pipeline once; `nlp` is reused by the cells below.
nlp = spacy.load('en_core_web_md')

In [10]:
# Run an example sentence through the pipeline; `doc` holds the processed result.
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

In [11]:
# Print the tokens on a single line, each one followed by "|".
for token in doc:
    print(str(token) + "|", end="")

My|best|friend|Ryan|Peters|likes|fancy|adventure|games|.|

In [12]:
# Stop words

In [13]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

# Keep only the tokens that are neither stop words nor punctuation.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


In [14]:
# Lemmas

In [15]:
# Show the lemma of every token in `doc`, separated by "|".
print('|'.join(tok.lemma_ for tok in doc))

Dear|Ryan|,|we|need|to|sit|down|and|talk|.|regard|,|Pete


In [16]:
# Extracting Named Entities

In [17]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)

# Print every entity found in the sentence as "(text, LABEL)", all on one line.
for ent in doc.ents:
    entity_pair = f"({ent.text}, {ent.label_})"
    print(entity_pair, end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

In [18]:
# Using Gensim to make predictions.
#
# Text similarity scores

In [20]:
from gensim import corpora, models, similarities
import jieba

In [21]:
# Three short example documents; each one is scored against `keyword` below.
texts = ['I love reading Japanese novels. My favorite Japanese writer is Tanizaki Junichiro.',
         'Natsume Soseki is a well-known Japanese novelist and his Kokoro is a masterpiece.',
         'American modern poetry is good. ']

In [22]:
# Query text whose similarity to every entry of `texts` is computed below.
keyword = 'Japan has some great novelists. Who is your favorite Japanese writer?'

In [23]:
# jieba is a Python text segmentation module; it is used here to cut each
# text into word segments so that text similarity can be analysed below.

In [24]:
# Segment every document with jieba, then build the token -> id dictionary.
# NOTE: `texts` is rebound from raw strings to token lists, so this cell is not
# idempotent; re-run the cell that defines `texts` before running it again.
texts = list(map(jieba.lcut, texts))
dictionary = corpora.Dictionary(texts)
feature_cnt = len(dictionary.token2id)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Mads\AppData\Local\Temp\jieba.cache
Loading model cost 1.508 seconds.
Prefix dict has been built successfully.


In [25]:
# Convert each tokenised document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [26]:
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [27]:
# Segment the keyword text the same way as the documents, then map it to a
# bag-of-words vector using the documents' dictionary.
keyword_tokens = jieba.lcut(keyword)
kw_vector = dictionary.doc2bow(keyword_tokens)

In [28]:
# Build the similarity index over the TF-IDF-weighted documents.
corpus_tfidf = tfidf[corpus]
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=feature_cnt)

In [29]:
# Weight the keyword vector with TF-IDF and query the index with it.
keyword_tfidf = tfidf[kw_vector]
sim = index[keyword_tfidf]

In [30]:
# Report one score per document, numbering the documents from 1.
for text_no, score in enumerate(sim, start=1):
    print('keyword is similar to text%d: %.2f' % (text_no, score))

keyword is similar to text1: 0.50
keyword is similar to text2: 0.02
keyword is similar to text3: 0.00


In [31]:
# Second example: three Danish sentences, each scored against `keyword` below.
texts = ['Du er uddannet i kommunikation', 
'Du har mindst 3 års erfaring med e-mailmarketing fra et reklamebureau eller en større virksomhed', 
'Du er vant til at analysere resultaterne af din egen markedsføringsindsats']

In [32]:
# Danish query text compared against every entry of `texts`.
keyword = 'Analyse af markedsførings resultater er lige dig'

In [33]:
# Segment the documents and rebuild the dictionary for this example.
# NOTE: `texts` is rebound to token lists here, so re-running this cell alone
# would pass lists (not strings) to jieba.lcut.
texts = list(map(jieba.lcut, texts))
dictionary = corpora.Dictionary(texts)
feature_cnt = len(dictionary.token2id)

In [34]:
# Bag-of-words vector for each tokenised document.
corpus = list(map(dictionary.doc2bow, texts))

In [35]:
# Refit TF-IDF on this example's corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [36]:
# Tokenise the keyword and express it in the current dictionary's id space.
keyword_tokens = jieba.lcut(keyword)
kw_vector = dictionary.doc2bow(keyword_tokens)

In [37]:
# Similarity index over the TF-IDF-weighted documents of this example.
corpus_tfidf = tfidf[corpus]
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=feature_cnt)

In [38]:
# Query the index with the TF-IDF-weighted keyword vector.
keyword_tfidf = tfidf[kw_vector]
sim = index[keyword_tfidf]

In [39]:
# Print the score of each document, numbered from 1.
for text_no, score in enumerate(sim, start=1):
    print('keyword is similar to text%d: %.2f' % (text_no, score))

keyword is similar to text1: 0.05
keyword is similar to text2: 0.02
keyword is similar to text3: 0.47


In [51]:
# Third example: another set of three Danish sentences to score against `keyword`.
texts = ['Du er god til konstruktion af modeller', 
'Er vant til at arbejde selvstændigt og med passion', 
'Du beskriver dig selv som en der er vant til at knokle og give alt for teamet']

In [52]:
# Danish query text for the third example.
keyword = 'Er vant til at knokle for teamet'

In [53]:
# Segment the documents and build a fresh dictionary for the third example.
# NOTE: rebinding `texts` to token lists makes this cell non-idempotent;
# re-run the cell that defines `texts` before executing it a second time.
texts = list(map(jieba.lcut, texts))
dictionary = corpora.Dictionary(texts)
feature_cnt = len(dictionary.token2id)

In [54]:
# Turn every tokenised document into a bag-of-words vector.
corpus = list(map(dictionary.doc2bow, texts))

In [55]:
# Fit TF-IDF on the third example's corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [56]:
# Segment the keyword and convert it to a bag-of-words vector.
keyword_tokens = jieba.lcut(keyword)
kw_vector = dictionary.doc2bow(keyword_tokens)

In [57]:
# Index the TF-IDF-weighted documents for similarity queries.
corpus_tfidf = tfidf[corpus]
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=feature_cnt)

In [58]:
# Similarity of the TF-IDF-weighted keyword to every indexed document.
keyword_tfidf = tfidf[kw_vector]
sim = index[keyword_tfidf]

In [59]:
# One line per document with its similarity score (documents numbered from 1).
for text_no, score in enumerate(sim, start=1):
    print('keyword is similar to text%d: %.2f' % (text_no, score))

keyword is similar to text1: 0.00
keyword is similar to text2: 0.23
keyword is similar to text3: 0.46
