In [1]:
import en_core_web_lg
import numpy as np
import pandas as pd
import rake_nltk
from sklearn.cluster import DBSCAN

from library import find_topics, dbscan_predict

EPSILON = .2
MIN_SAMPLES = 2
PATH_REDDIT_NPY = 'data/reddit.npy'
PATH_NEW_TAGS_NPY = 'data/new_tags.npy'
WANT_TO_FIND_NEW_TOPICS = False


  from .autonotebook import tqdm as notebook_tqdm
  warn(
[nltk_data] Downloading package stopwords to /home/cela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/cela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
reddit = np.load(file=PATH_REDDIT_NPY, allow_pickle=True)
reddit = np.array([redd for redd in reddit[:1000] if isinstance(redd, str)])
reddit = np.array([''.join(char for char in s if char.isalnum() or char.isspace()) for s in reddit])
reddit = np.unique(reddit)
reddit


array(['\n\nView Pollhttpswwwredditcompoll10mhwc2',
       '\nA friend gave me his old modem router he got from the Middle East He said it might be locked to his ISP there but he wasnt sure I have a working modem router and thought maybe i could use the Huawei router\n\nWhen i plugged in the Huawei router it immediately showed a red LOS sign and when i hooked it up to the phone line it wouldnt connect to the internet\n\nA few questions\n\nHow can i know what the cause of the LOS signal\n\nHow do i know which router will be better for me\n\nThanks',
       '\nI am trying to implement a naive version of gradient descend determining the stepsize using line search\n\n    import numpy as np\n    import numpylinalg\n    import scipy\n\n\n    a  1\n    b  10\n\n    Rosenbrock  lambda x1 x2 a  x1  2  b  x2  x1  2  2\n\n\n    gradient  lambda x1 x2 nparray2a  x1  4x1x2  x12 2bT\n\n    def gradientdescendlinesearchgradient x0 y0 niter\n        visited  \n        x  nparrayx0y0T\n        visiteda

In [3]:
reddit = pd.DataFrame(reddit, columns=['text'])

if WANT_TO_FIND_NEW_TOPICS:
    new_tags = reddit['text'].apply(func=find_topics).values
    np.save(file=PATH_NEW_TAGS_NPY, arr=new_tags)
else:
    new_tags = np.load(file=PATH_NEW_TAGS_NPY, allow_pickle=True)

new_tags = list([list(tags) for tags in new_tags])


In [4]:
nlp = en_core_web_lg.load()
words = 'malware virus'
tokens = nlp(words)

print(tokens[0].text, tokens[1].text, tokens[0].similarity(tokens[1]))


malware virus 0.4815294146537781


In [5]:
#new_tags = np.array([x for x in new_tags if isinstance(x, list)])

new_tags = np.array([
    ''.join(char for char in ' '.join(s) if char.isalnum() or char.isspace()) for s in new_tags
])

#new_tags = np.unique(new_tags)


In [6]:
new_tags

array(['view pollhttpswwwredditcompoll10mhwc2', 'router huawei', 'x1 x2',
       'think application', 'placeholder table', 'ip interface',
       'benchmark heaven', 'day hour', 'tensorflow gpu', 'false positive',
       'im game', 'gaming fps', 'ip firewall', 'ip firewall',
       'node shortest', 'stream issue', 's4 disabled', 'good learning',
       'subreddit scrape', 'dfdate zero', 'question look', 'max removal',
       'setting way', 'year terraform', 'wondering window',
       'package data', 'angular port', 'noise drive', 'device visa',
       'root update', 'way closed', 'game issue', 'drive 64mb',
       'computer measure', 'phone use', 'nvidia panel', 'adaptive turn',
       'copper thermal', 'im wanting', 'pc wake', 'usage happens',
       'post device', 'data drive', 'key arrow', 'tlou yo',
       'num printamount', 'restore adb',
       'postinghttpswwwgovernmentjobscomcareersvisaliajobs3640567geographicinformationsystemsgisanalyst job',
       'algorithm code', 'router h

In [7]:
corpus = ' '.join(list(new_tags)).replace('-', ' ')
words = corpus.split()
corpus = " ".join(sorted(set(words), key=words.index))
tokens = nlp(corpus)
words_vectors = []
words_list = []

for token in tokens:
    words_vectors.append(token.vector)
    words_list.append(token.text)

words_vectors = np.array(words_vectors)
words_list = np.array(words_list)


In [11]:
dbscan = DBSCAN(eps=EPSILON, min_samples=MIN_SAMPLES, metric='cosine').fit(X=words_vectors)


In [20]:
words_to_test = ['ip', 'controller', 'screen', 'ubuntu']
test_words = ' '.join(words_to_test)
test_tokens = nlp(test_words)

test_vectors = []

for token in test_tokens:
    test_vectors.append(token.vector)

test_vectors = np.array(test_vectors)

print(f'Label for {words_to_test[0]}: {dbscan_predict(dbscan=dbscan, x=np.array([test_vectors[0]]))[0]}')
print(f'Label for {words_to_test[1]}: {dbscan_predict(dbscan=dbscan, x=np.array([test_vectors[1]]))[0]}')
print(f'Label for {words_to_test[2]}: {dbscan_predict(dbscan=dbscan, x=np.array([test_vectors[2]]))[0]}')
print(f'Label for {words_to_test[3]}: {dbscan_predict(dbscan=dbscan, x=np.array([test_vectors[3]]))[0]}')


Label for ip: 3
Label for controller: 4
Label for screen: 13
Label for ubuntu: 17


In [21]:
from collections import Counter

# Count occurrences of each element
counter = Counter(dbscan.labels_)

# Sort the counts in decreasing order
sorted_counts = dict(counter.most_common())

print(sorted_counts)


{-1: 874, 16: 8, 5: 6, 34: 6, 3: 4, 10: 4, 28: 4, 2: 3, 4: 3, 6: 3, 7: 3, 13: 3, 25: 3, 35: 3, 36: 3, 0: 2, 1: 2, 8: 2, 9: 2, 11: 2, 12: 2, 14: 2, 15: 2, 17: 2, 18: 2, 19: 2, 20: 2, 21: 2, 22: 2, 23: 2, 24: 2, 26: 2, 27: 2, 29: 2, 30: 2, 31: 2, 32: 2, 33: 2, 37: 2, 38: 2, 39: 2, 40: 2, 41: 2}


In [23]:
labels = dbscan.labels_
cluster_15_elements = words_list[labels == 3]
print(f"Elements in cluster 10:")
for s in cluster_15_elements:
    print(s)


Elements in cluster 10:
ip
dns
dhcp
ldap


In [25]:
# Create a Rake instance
r = rake_nltk.Rake()

# Text from which keywords will be extracted
text = reddit['text'][99]

print(text)

# Extract keywords from the text
r.extract_keywords_from_text(text)

# Get the ranked keywords
keywords = r.get_ranked_phrases_with_scores()

# Print the extracted keywords and their scores
for score, kw in keywords:
    print("Keyword:", kw, "Score:", score)


Can HTTPS websites be accessed over HTTP proxies For example if I go to the HTTPS login site of Reddit can I log in and browse securely via HTTPS through an HTTP proxy or only an HTTPS proxy

In either case can the proxy provider see my traffic and passwords in clear text if using an HTTPS site

Can 4G and 5G proxies be used on a desktop computer and in a desktop browser What would they be categorized as such as SOCKS HTTPS etc 

With 4G and 5G proxies can the proxy provider see my traffic and passwords in clear text if using an HTTPS site Can HTTPS sites be used
Keyword: browse securely via https Score: 14.5
Keyword: socks https etc Score: 8.5
Keyword: proxy provider see Score: 8.5
Keyword: proxy provider see Score: 8.5
Keyword: https login site Score: 7.833333333333334
Keyword: https proxy Score: 5.0
Keyword: https site Score: 4.833333333333334
Keyword: https site Score: 4.833333333333334
Keyword: https websites Score: 4.5
Keyword: https sites Score: 4.5
Keyword: http proxy Score: 4.