### Extracting data from XML

In [1]:
import xml.etree.ElementTree as ET
import html
import re

xml_file_path = "Data/Posts.xml"

tree = ET.parse(xml_file_path)
root = tree.getroot()

post_data = {}

rows = root.findall("row")
for row in rows:
    post = row.get("Body")
    
    # decodes any HTML entities.
    post = html.unescape(post)

    # re.sub(r'<[^>]+>', '', decoded_body) uses a regular expression to remove anything that looks like
    # an HTML tag. It matches any sequence of characters starting with < and ending with >, and removes it.
    post = re.sub(r'<[^>]+>', '', post)
    
    # Remove escape characters (like \n, \r, etc.) using regular expression
    post = re.sub(r'\\n|\\r|\\t', '', post)
    
    # Remove extra whitespace between sentences or lines
    post = re.sub(r'\s+', ' ', post).strip()

    # Remove punctuation
    post = re.sub(r'[^\w\s]', '', post)
    
    # Lower case all the letters
    post = post.lower()
    
    idx = int(row.get("Id"))
    
    post_data[idx-1] = post

# Example of proccesed data
for i in range(5):
    print(f"{i}: {post_data[i]}\n")

0: are there universities and labs that are unaffiliated with the tor project that are researching tor and onion routing and contributing back to the tor project be it in code security audits or just publishing papers about how tor could be made more secure if so what labs are out there and what concepts are they researching and where is there funding coming from if that information is available

1: tor browser bundle ships with noscript which can disable javascript but noscripts functionality is disabled this means that by default in the tor browser bundle all javascript code is allowed to execute  including potentially adversarial code what was the rationale behind this decision

2: are there any implications if whilst running a nonexit relay i also use the same machine to host a hidden service specifically how will this affect the anonymity of the hidden service in terms of its location being obscured what effect would it have on the anonymity of clients using my relay as a middle n

## Tokenization

In [47]:
import spacy
from collections import Counter

In [48]:
nlp = spacy.load("en_core_web_sm")
word_count = Counter()

for idx, post in post_data.items():
    doc = nlp(post)

    for token in doc:
        if not token.is_stop and not token.is_space and not token.like_num:
            word_count[token.text] += 1

### Saving word_count data

In [49]:
import pandas as pd

In [50]:
df = pd.DataFrame(list(word_count.items()), columns=['Word', 'Count'])
df.to_csv('word_count.csv', index=False)

In [51]:
# Path to your CSV file
file_path = 'word_count.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Convert the DataFrame to a dictionary
word_count = pd.Series(df.Count.values, index=df.Word).to_dict()

word_count = Counter(word_count)

word_count

Counter({'tor': 28774,
         'nt': 7901,
         'use': 7444,
         'browser': 6734,
         'notice': 5084,
         'network': 4676,
         'server': 3911,
         'connections': 3721,
         'like': 3664,
         'service': 3507,
         'address': 3433,
         'file': 3423,
         'node': 3361,
         'exit': 3321,
         'ip': 3193,
         'relay': 3158,
         'set': 3124,
         'proxy': 3060,
         'onion': 2826,
         'warn': 2761,
         'm': 2657,
         'traffic': 2605,
         'connection': 2551,
         'directory': 2533,
         'connect': 2509,
         'hidden': 2460,
         'circuit': 2460,
         'new': 2450,
         'running': 2446,
         'want': 2432,
         'client': 2411,
         'know': 2411,
         'need': 2400,
         'run': 2248,
         'time': 2247,
         'port': 2238,
         'way': 2149,
         'socks': 2058,
         'nodes': 1884,
         'bridge': 1838,
         'work': 1822,
         'fi

## Top 10 word

In [52]:
top_10 = word_count.most_common(10)
for word, count in top_10:
    print(f"{word}: {count}")

tor: 28774
nt: 7901
use: 7444
browser: 6734
notice: 5084
network: 4676
server: 3911
connections: 3721
like: 3664
service: 3507


## Word Distances

### Importing word distance algorithms

In [53]:
import sys
sys.path.append("16_word_distance_algorithms/")

In [54]:
import nltk
nltk.download('punkt_tab')
nltk.download("cmudict")

from bagDistance import BagDistance
from damerau import damerau_levenshtein_distance
from dice import dice_coefficient
from editex import editex_distance
from hamming import hamming_distance
from jaccard import jaccard_similarity
from jaro import jaro_distance
from jarowinkler import jaro_winkler_similarity
from lcs import lcs
from lcsubstring import LCSubStr
from levenshte import levenshtein_distance 
from overlapCoefficient import overlap_coefficient
from q_grams import find_common_qgrams
from Smith_Waterman import smith_waterman
from sylabble_allignment import align_syllables

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hassankalantari/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/hassankalantari/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


### Implemting a calculator funtion

In [57]:
import numpy as np

def most_simular(function):
    distances = {}
    for word, count in top_10:
        
        sim = {}
        c = 0
        for token in word_count.keys():
            if not isinstance(token, str):
                 continue
            sim[token] = function(word, token)
        
        keys = list(sim.keys())
        values = list(sim.values())
        
        max_indexes = np.argsort(values)
        
        for idx in max_indexes[::-1]:
            if keys[idx] != word:
                distances[word] = keys[idx]
                break
                       
    return distances

### Bag Distance

In [38]:
bag_distance = most_simular(BagDistance().get_sim_score)
bag_distance

### Jaro Distance

In [39]:
jaro_distances = most_simular(jaro_distance)
jaro_distances

{'tor': 'xtor',
 'nt': 'ntt',
 'use': 'fuse',
 'browser': 'browsers',
 'notice': 'noticed',
 'network': 'networks',
 'server': 'servers',
 'connections': 'connections3',
 'like': 'likes',
 'service': 'services'}

### Jaro Winkler Distance

In [40]:
jaro_winkler_distances = most_simular(jaro_winkler_similarity)
jaro_winkler_distances

{'tor': 'tore',
 'nt': 'ntt',
 'use': 'uses',
 'browser': 'broswer',
 'notice': 'noticed',
 'network': 'netowrk',
 'server': 'sever',
 'connections': 'ections',
 'like': 'liek',
 'service': 'servicei'}

### LCS Distance

In [43]:
lcs_distances = most_simular(lcs)
lcs_distances

{'tor': 'cusersmestore2',
 'nt': 'universities',
 'use': 'universities',
 'browser': 'browserdatatorgeoip6',
 'notice': 'torhttpsecurityubuntucomubuntupoolmainmmysql57mysqlservercore57_57230ubuntu018041_amd64deb',
 'network': 'httptechapplenet201610howtousekronymoustoaccesstornetworkonchromebrowsergooglechromeoschromebooks',
 'server': 'servicetorrun',
 'connections': 'connection_edge_process_relay_cell',
 'like': 'httpssecuritystackexchangecomquestions48502tortrafficcorrelationattacksbyglobaladversaries',
 'service': 'varlibtorhidden_servicehost'}

### LCS Substring Distance

In [46]:
lcs_subtring_distances = most_simular(LCSubStr)
lcs_subtring_distances

{'tor': 'httpresourcesinfosecinstitutecomfbitorexploit',
 'nt': 'addressmap_clear_transientvoid',
 'use': 'cuserstrucdocumentstordatator',
 'browser': 'httpstractorprojectorgprojectstorwikiorgteamscommunityteamsupportwhichplatformsistorbrowseravailablefor',
 'notice': '161419000notice',
 'network': 'prefnetworkhttppipeliningmaxrequests',
 'server': 'etcopenvpnserverconf',
 'connections': 'internetconnections',
 'like': 'cdnlike',
 'service': 'systemlibraryframeworksdirectoryserviceframeworkversionsadirectoryservice'}

Hassan Kalantari