In [11]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [8]:
ps = PorterStemmer()
ps.stem("consultant")
sw = stopwords.words()


In [127]:
c = "we are seeking an experienced frontend developer"
[ps.stem(w) for w in c.split()]



['we', 'are', 'seek', 'an', 'experienc', 'frontend', 'develop']

In [132]:
import string

def preprocess(s: str):
    return [ps.stem(w) for w in word_tokenize(s.lower())]

d1 = "DevOps is a methodology in the software development and IT industry. Used as a set of practices and tools, DevOps integrates and automates the work of software development and IT operations as a means for improving and shortening the systems development life cycle."
d2 = "In software engineering, continuous integration (CI) is a practice of merging all developers' working copies to a shared mainline several times a day. Nowadays it is typically implemented in such a way that it triggers an automated build with testing."
d3 = "Continuous delivery (CD) is a software engineering approach in which teams produce software in short cycles, ensuring that the software can be reliably released at any time and, following a pipeline through a production-like environment, without doing so manually."
s_d1 = set(word_tokenize(d1.lower()))
s_d2 = set(word_tokenize(d2.lower()))
s_d3 = set(word_tokenize(d3.lower()))

sw_set = set(stopwords.words())

r = s_d1.union(s_d2).union(s_d3)

r_s = {ps.stem(w) for w in r.difference(set(string.punctuation)).difference(sw_set)}

r_s


{'approach',
 'autom',
 'build',
 'cd',
 'continu',
 'copi',
 'cycl',
 'day',
 'deliveri',
 'develop',
 'devop',
 'engin',
 'ensur',
 'environ',
 'implement',
 'improv',
 'industri',
 'integr',
 'life',
 'mainlin',
 'manual',
 'mean',
 'merg',
 'methodolog',
 'nowaday',
 'oper',
 'pipelin',
 'practic',
 'produc',
 'production-lik',
 'releas',
 'reliabl',
 'set',
 'share',
 'short',
 'shorten',
 'softwar',
 'system',
 'team',
 'test',
 'time',
 'tool',
 'trigger',
 'typic',
 'work'}

In [105]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

s = []
lr_s = list(r_s)
lr_s.sort()

s.append([preprocess(d1).count(w) for w in lr_s])
s.append([preprocess(d2).count(w) for w in lr_s])
s.append([preprocess(d3).count(w) for w in lr_s])



df = pd.DataFrame(s, columns=lr_s)


In [106]:
print(df.to_markdown())


|    |   approach |   autom |   build |   cd |   continu |   copi |   cycl |   day |   deliveri |   develop |   devop |   engin |   ensur |   environ |   implement |   improv |   industri |   integr |   life |   mainlin |   manual |   mean |   merg |   methodolog |   nowaday |   oper |   pipelin |   practic |   produc |   production-lik |   releas |   reliabl |   set |   share |   short |   shorten |   softwar |   system |   team |   test |   time |   tool |   trigger |   typic |   work |
|---:|-----------:|--------:|--------:|-----:|----------:|-------:|-------:|------:|-----------:|----------:|--------:|--------:|--------:|----------:|------------:|---------:|-----------:|---------:|-------:|----------:|---------:|-------:|-------:|-------------:|----------:|-------:|----------:|----------:|---------:|-----------------:|---------:|----------:|------:|--------:|--------:|----------:|----------:|---------:|-------:|-------:|-------:|-------:|----------:|--------:|-------:|
|  0 |      

In [107]:
word_tokenize("production-like")


['production-like']

In [143]:
import numpy as np

# df = [
#     (w in preprocess(d1)) + (w in preprocess(d2)) + (w in preprocess(d3)) for w in lr_s
# ]
# df = np.array(df)
X = np.matrix(s)
df = np.array((X > 0).sum(0))[0]
N = 3


idf = np.log10(1 + (N / df))


tf = np.log10(X + 1)

tf_idf = np.multiply(tf, idf)

tf_idf_df = pd.DataFrame(tf_idf, columns=lr_s)
max_term = tf_idf_df.sum().sort_values()[-20:].sort_index().index

tf_idf_df.loc[2].sort_values(ascending=False)


approach          0.181238
ensur             0.181238
produc            0.181238
production-lik    0.181238
releas            0.181238
reliabl           0.181238
manual            0.181238
short             0.181238
softwar           0.181238
team              0.181238
environ           0.181238
deliveri          0.181238
cd                0.181238
pipelin           0.181238
engin             0.119792
time              0.119792
cycl              0.119792
continu           0.119792
test              0.000000
practic           0.000000
system            0.000000
shorten           0.000000
tool              0.000000
share             0.000000
set               0.000000
trigger           0.000000
typic             0.000000
merg              0.000000
oper              0.000000
improv            0.000000
build             0.000000
copi              0.000000
day               0.000000
develop           0.000000
devop             0.000000
implement         0.000000
industri          0.000000
n

In [133]:
cv = CountVectorizer(ngram_range=(2,2))

biX = cv.fit_transform([" ".join(preprocess(d1)), " ".join(preprocess(d2)), " ".join(preprocess(d3))])


In [135]:
bi_df = pd.DataFrame.sparse.from_spmatrix(biX, columns=cv.get_feature_names_out())


In [138]:
bi_df.sum().sort_values(ascending=False).head(20)


softwar engin      2
and it             2
develop and        2
the softwar        2
softwar develop    2
mean for           1
production lik     1
shorten the        1
lik environ        1
short cycl         1
share mainlin      1
sever time         1
set of             1
reliabl releas     1
releas at          1
mainlin sever      1
produc softwar     1
merg all           1
practic of         1
so manual          1
dtype: Sparse[int64, 0]

In [151]:
from sklearn.metrics.pairwise import cosine_similarity
import itertools

for x, y in itertools.combinations([0,1,2], r = 2 ):
    print((x,y))
    print(cosine_similarity([tf_idf_df.loc[x]], [tf_idf_df.loc[y]]))



(0, 1)
[[0.19018676]]
(0, 2)
[[0.07497723]]
(1, 2)
[[0.11882272]]
