# NLP Fuzzy matching protype code

In [134]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.stem.porter import PorterStemmer

In [118]:
#Was going to use spark.. But don't need to due to small data size
spark = SparkSession.builder.appName("My NLP app").getOrCreate()
cols = ["product Family", "Product", "Platform"]

sdf = (spark.read.format("csv").option("header", True).load("/Users/maxnethercott/Documents/data_science_projects/nlp_sentence_matching/fuzzy_prototype/")
        .select([F.col(x).alias(x.lower().strip()) for x in cols]))


sdf.distinct().limit(10).show(1000,False)

21/12/24 20:21:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , , 
 Schema: Product Family, Product, Platform
Expected: Product Family but found: 
CSV file: file:///Users/maxnethercott/Documents/data_science_projects/nlp_sentence_matching/fuzzy_prototype/nlp_fuzzy_matcher.ipynb
21/12/24 20:21:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , , 
 Schema: Product Family, Product, Platform
Expected: Product Family but found: 
CSV file: file:///Users/maxnethercott/Documents/data_science_projects/nlp_sentence_matching/fuzzy_prototype/README.md


+------------------+----------------------------------------------------------------+-----------------------------------------------+
|product family    |product                                                         |platform                                       |
+------------------+----------------------------------------------------------------+-----------------------------------------------+
|Microsoft Office  |Microsoft SharePoint Server 2019                                |null                                           |
|Developer Tools   |Microsoft Visual Studio 2017 version 15.9 (includes 15.0 - 15.8)|null                                           |
|Browser           |Microsoft Edge (Chromium-based) in IE Mode                      |Windows 10 Version 1909 for ARM64-based Systems|
|Browser           |Microsoft Edge (Chromium-based) in IE Mode                      |Windows 11 for ARM64-based Systems             |
|Developer Tools   |Microsoft Visual Studio 2015 Update 3     

In [130]:
# Columns we are interested in
cols = ["Product", "Platform"]

#Read data
pdf = pd.read_csv("/Users/maxnethercott/Documents/data_science_projects/nlp_sentence_matching/fuzzy_prototype/ms_security_updates_csv.csv")
pdf = pdf[cols]
pdf.columns = pdf.columns.str.lower()
pdf = pdf.fillna("")

#Lower all pandas column values
pdf = pdf.apply(lambda x: x.astype(str).str.lower())

pdf['document'] = pdf[pdf.columns].agg('.'.join, axis=1)
pdf.drop_duplicates(inplace=True)


In [131]:
doc_list = pdf["document"].tolist()
tokenized_sents = [word_tokenize(i) for i in doc_list]
tokenized_sents

[['microsoft', 'sharepoint', 'server', '2019', '.'],
 ['microsoft',
  'sharepoint',
  'enterprise',
  'server',
  '2013',
  'service',
  'pack',
  '1',
  '.'],
 ['microsoft', 'sharepoint', 'enterprise', 'server', '2016', '.'],
 ['microsoft', 'edge', '(', 'chromium-based', ')', '.'],
 ['microsoft', 'biztalk', 'esb', 'toolkit', '2.3', '.'],
 ['microsoft', 'biztalk', 'esb', 'toolkit', '2.4', '.'],
 ['microsoft', 'biztalk', 'esb', 'toolkit', '2.2', '.'],
 ['visual', 'studio', 'code', '.'],
 ['office', 'app', '.'],
 ['microsoft', '4k', 'wireless', 'display', 'adapter', '.'],
 ['powershell', '7.2', '.'],
 ['windows',
  'server',
  '2012',
  'r2',
  '(',
  'server',
  'core',
  'installation',
  ')',
  '.'],
 ['windows', 'server', '2012', 'r2', '.'],
 ['windows',
  'server',
  '2012',
  '(',
  'server',
  'core',
  'installation',
  ')',
  '.'],
 ['windows', 'server', '2012', '.'],
 ['windows',
  'server',
  '2008',
  'r2',
  'for',
  'x64-based',
  'systems',
  'service',
  'pack',
  '1',
  

In [178]:
#Filter out stopwords and punctuation...
stop_words = set(nltk.corpus.stopwords.words("english"))

out = [[w for w in x if not w in list(stop_words)] for x in tokenized_sents]
out = [[s.translate(str.maketrans("", "", string.punctuation)) for s in x] for x in out]

porter = PorterStemmer()
out = [[porter.stem(word) for word in x if word != '']  for x in out]
out


[['microsoft', 'sharepoint', 'server', '2019'],
 ['microsoft',
  'sharepoint',
  'enterpris',
  'server',
  '2013',
  'servic',
  'pack',
  '1'],
 ['microsoft', 'sharepoint', 'enterpris', 'server', '2016'],
 ['microsoft', 'edg', 'chromiumbas'],
 ['microsoft', 'biztalk', 'esb', 'toolkit', '23'],
 ['microsoft', 'biztalk', 'esb', 'toolkit', '24'],
 ['microsoft', 'biztalk', 'esb', 'toolkit', '22'],
 ['visual', 'studio', 'code'],
 ['offic', 'app'],
 ['microsoft', '4k', 'wireless', 'display', 'adapt'],
 ['powershel', '72'],
 ['window', 'server', '2012', 'r2', 'server', 'core', 'instal'],
 ['window', 'server', '2012', 'r2'],
 ['window', 'server', '2012', 'server', 'core', 'instal'],
 ['window', 'server', '2012'],
 ['window',
  'server',
  '2008',
  'r2',
  'x64base',
  'system',
  'servic',
  'pack',
  '1',
  'server',
  'core',
  'instal'],
 ['window',
  'server',
  '2008',
  'r2',
  'x64base',
  'system',
  'servic',
  'pack',
  '1'],
 ['window',
  'server',
  '2008',
  'x64base',
  'system

In [195]:
detokenized_sents = [TreebankWordDetokenizer().detokenize(i) for i in out]
data = list(set(detokenized_sents))
len(data)

151

In [206]:
# Ready for model build

def output_sentences(most_similar):
    for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(most_similar)//2), ('LEAST', len(most_similar) - 1)]:
      print(u'%s %s: %s\n' % (label, most_similar[index][1], data[int(most_similar[index][0])]))

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
max_epochs = 50
vec_size = 150
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                min_count=1,
                dm =0,
                window =1,
                workers=4,
                sample = 0)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
Model Saved


In [209]:
model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("microsoft edge".lower())
test_data = [porter.stem(word) for word in test_data]
v1 = model.infer_vector(test_data)
#print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.dv.most_similar([v1])
print(similar_doc)

# to print similar sentences
output_sentences(similar_doc) 

# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
#print(model.dv['1'])

[('12', 0.9257362484931946), ('125', 0.8480272889137268), ('103', 0.7311438918113708), ('4', 0.7274013757705688), ('37', 0.725104808807373), ('98', 0.7224041819572449), ('15', 0.7214416861534119), ('7', 0.7189545035362244), ('84', 0.7174950242042542), ('73', 0.7162351608276367)]
MOST 0.9257362484931946: microsoft edg chromiumbas

SECOND-MOST 0.8480272889137268: microsoft edg io

MEDIAN 0.7224041819572449: microsoft edg chromiumbas ie modewindow 11 x64base system

LEAST 0.7162351608276367: microsoft sharepoint server 2019

