In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora
from gensim.corpora import Dictionary
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import models,utils
from gensim.models import TfidfModel



In [2]:
stop_words = stopwords.words("english")

In [3]:
df = pd.read_csv(r"Cyber Security data.csv",
                encoding="windows-1252")
stop_words+=[i.lower() for i in df["Country"].unique().tolist()]
#stop_words+=["cyber","secur"]

In [4]:
def Preprocess(text):
    contents = [re.sub('[^a-zA-Z]'," ",i) for i in text]
    contents =[[i.lower() for i in sent.split() if i.lower() not in stop_words and len(i)>3] for sent in contents]
    
    words = []
    for content in contents:
        words+=content
    
    return contents,words

In [5]:
contents,words = Preprocess(text = df["Sentence"])
tagged = [nltk.pos_tag(i) for i in contents]

In [6]:
stemmed_contents = [[PorterStemmer().stem(i[0]) for i in sent] for sent in tagged]

In [7]:
stemmed_words = [PorterStemmer().stem(i) for i in words]

# Removal

In [8]:
def word_removal(words_list,contents_list,most_common_num=None,least_num=None):
    fd = FreqDist(words_list).most_common()
    drops = [i[0] for i in fd[:most_common_num]]
    drops.append([i[0] for i in fd if i[1]>least_num])
    words2 = [[i for i in content if i not in drops] for content in contents_list]
    return words2

In [9]:
tw = [nltk.pos_tag(i) for i in words]

In [10]:
contents2 = word_removal(stemmed_words,stemmed_contents,5,10)

# TF-IDF

In [11]:
texts=contents2
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(line) for line in texts] 

In [12]:
tfidf = TfidfModel(corpus)
tfidfs = [tfidf.__getitem__(i) for i in corpus]
tfwords = [sorted(i,key=lambda x: x[1])[:3] for i in tfidfs]
tfwords = [[i[0] for i in sent] for sent in tfwords]
keys = dictionary.token2id.keys()
values = dictionary.token2id.values()
tfDic = dict()
for i,j in zip(values,keys):
    tfDic[i] = j
tfwords = [[tfDic[i] for i in tfword ] for tfword in tfwords]    

# LDA

In [13]:
texts=tfwords
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(line) for line in texts] 

In [14]:
%time lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, update_every=1, chunksize=10000, passes=1)

Wall time: 22.2 s


In [15]:
lda.print_topics(num_topics=10, num_words=5)

[(0,
  '0.079*"strategi" + 0.049*"support" + 0.043*"technolog" + 0.041*"implement" + 0.040*"commun"'),
 (1,
  '0.066*"polici" + 0.055*"plan" + 0.048*"action" + 0.048*"cybersecur" + 0.034*"strategi"'),
 (2,
  '0.105*"ensur" + 0.063*"level" + 0.063*"oper" + 0.061*"system" + 0.057*"govern"'),
 (3,
  '0.044*"strategi" + 0.034*"increas" + 0.032*"object" + 0.029*"capabl" + 0.027*"societi"'),
 (4,
  '0.071*"govern" + 0.067*"manag" + 0.062*"commun" + 0.054*"risk" + 0.040*"ministri"'),
 (5,
  '0.064*"activ" + 0.064*"technolog" + 0.029*"process" + 0.024*"econom" + 0.023*"research"'),
 (6,
  '0.104*"respons" + 0.075*"servic" + 0.059*"system" + 0.038*"sector" + 0.034*"public"'),
 (7,
  '0.142*"intern" + 0.076*"sector" + 0.065*"also" + 0.050*"privat" + 0.043*"public"'),
 (8,
  '0.079*"requir" + 0.060*"implement" + 0.055*"public" + 0.041*"system" + 0.033*"polici"'),
 (9,
  '0.101*"protect" + 0.070*"infrastructur" + 0.050*"threat" + 0.049*"network" + 0.046*"includ"')]

In [16]:
lda.save("UNlda")

In [17]:
topics = [sorted(lda.get_document_topics(corpus[i]),key=lambda x: x[1])[-1][0] for i in range(len(corpus))]

In [18]:
df["Topic"] = topics

In [20]:
df.columns

Index(['Unnamed: 0', 'Document', 'Page', 'Sentence', 'Country', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Topic'],
      dtype='object')

In [23]:
df1 = df.drop(["Unnamed: 0","Unnamed: 5","Unnamed: 6","Unnamed: 7","Unnamed: 8","Unnamed: 9"],axis=1)

In [25]:
df1.to_csv("LDA_topics.csv")

# Similarity

In [73]:
index = SparseMatrixSimilarity(lda[corpus],num_features=10)

In [71]:
from gensim.similarities import MatrixSimilarity,SparseMatrixSimilarity

In [74]:
vec_lda = [lda[i] for i in corpus]

In [75]:
sims = [index[i] for i in vec_lda]

In [98]:
ms = []

In [99]:
for j in range(5000):
    s = [i for i in range(36517) if sims[j][i]>0.99]
    ms.append(s)

In [100]:
for j in range(5000,10000):
    s = [i for i in range(j,36517) if sims[j][i]>0.99]
    ms.append(s)

In [101]:
for j in range(10000,15000):
    s = [i for i in range(j,36517) if sims[j][i]>0.99]
    ms.append(s)

In [102]:
for j in range(15000,20000):
    s = [i for i in range(j,36517) if sims[j][i]>0.99]
    ms.append(s)

In [103]:
for j in range(20000,25000):
    s = [i for i in range(j,36517) if sims[j][i]>0.99]
    ms.append(s)

In [104]:
for j in range(25000,30000):
    s = [i for i in range(j,36517) if sims[j][i]>0.99]
    ms.append(s)

In [105]:
for j in range(25000,36517):
    s = [i for i in range(j,36517) if sims[j][i]>0.99]
    ms.append(s)

In [109]:
pd.DataFrame(np.array(ms)).to_csv("SimilaritySetenceIndex.csv")