In [14]:
# downloads
!pip install stanza gensim torchtext -qq

In [15]:
# consolidation

# imports
import torch
import stanza
from nltk import Tree
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


# Fetch the default English model package for Stanza into the local resources directory.
stanza.download('en')
# Build the shared pipeline used by later cells, requesting the tokenize, pos and
# constituency processors (the load log below shows mwt being loaded as well).
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-07-10 00:25:18 INFO: Downloaded file to C:\Users\Vrishank\stanza_resources\resources.json
2025-07-10 00:25:18 INFO: Downloading default packages for language: en (English) ...
2025-07-10 00:25:19 INFO: File exists: C:\Users\Vrishank\stanza_resources\en\default.zip
2025-07-10 00:25:23 INFO: Finished downloading models and saved to C:\Users\Vrishank\stanza_resources
2025-07-10 00:25:23 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-07-10 00:25:23 INFO: Downloaded file to C:\Users\Vrishank\stanza_resources\resources.json
2025-07-10 00:25:24 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2025-07-10 00:25:24 INFO: Using device: cpu
2025-07-10 00:25:24 INFO: Loading: tokenize
2025-07-10 00:25:24 INFO: Loading: mwt
2025-07-10 00:25:24 INFO: Loading: pos
2025-07-10 00:25:26 INFO: Loading: constituency
2025-07-10 00:25:27 INFO: Done loading processors!


In [16]:
# Run the pipeline on one example natural-language command description.
doc = nlp("delete files with inode number specified by REGEX under current directory")
# Convert the first sentence's constituency parse to its string form and re-read it as an
# nltk Tree, which later cells index by tree position.
tree = Tree.fromstring(str(doc.sentences[0].constituency))
# Render the tree as ASCII art for visual inspection.
tree.pretty_print()

        ROOT                                                                          
         |                                                                             
         S                                                                            
         |                                                                             
         VP                                                                           
   ______|____                                                                         
  |           NP                                                                      
  |       ____|_______________                                                         
  |      |                    PP                                                      
  |      |     _______________|_______________                                         
  |      |    |                               NP                                      
  |      |    |           ____________

In [26]:
def get_node_phrases(tree, depth=2):
    """Return, for every leaf of ``tree``, the text spanned by one of its ancestors.

    For each leaf (in left-to-right order) the node ``depth`` levels above the
    leaf is looked up and the leaves below that node are joined with spaces.
    The result therefore has exactly one entry per leaf, and neighbouring
    leaves that share the same ancestor produce the same phrase.

    If a leaf has fewer than ``depth`` ancestors, the root of ``tree`` is used,
    so the entry for that leaf is the whole sentence.

    Args:
        tree: an ``nltk.Tree``-like object supporting ``treepositions('leaves')``,
            indexing by a position tuple, and ``leaves()`` on sub-trees.
        depth: how many levels to climb from each leaf (1 = the leaf's direct
            parent, 2 = its grandparent, ...). Must be at least 1.

    Returns:
        list[str]: one phrase per leaf, in leaf order.

    Raises:
        ValueError: if ``depth`` is smaller than 1.
    """
    if depth < 1:
        raise ValueError(f"depth must be >= 1, got {depth}")

    word_to_phrase = []
    for pos in tree.treepositions('leaves'):
        # Dropping the last `depth` indices of a leaf position yields the position
        # of its ancestor; slicing past the start gives (), i.e. the root.
        ancestor = tree[pos[:-depth]]
        word_to_phrase.append(" ".join(ancestor.leaves()))

    return word_to_phrase

In [29]:
# For every word of the parsed sentence, collect the phrase spanned by the node
# two levels above it (the function's default depth).
phrases_data = get_node_phrases(tree)

# Print the phrases with 1-based numbering for inspection.
for i, phrase in enumerate(phrases_data):
    print(f"Phrase {i+1}: '{phrase}'")

Phrase 1: 'delete files with inode number specified by REGEX under current directory'
Phrase 2: 'files'
Phrase 3: 'with inode number specified by REGEX under current directory'
Phrase 4: 'inode number'
Phrase 5: 'inode number'
Phrase 6: 'specified by REGEX under current directory'
Phrase 7: 'by REGEX'
Phrase 8: 'REGEX'
Phrase 9: 'under current directory'
Phrase 10: 'current directory'
Phrase 11: 'current directory'


In [30]:
# Work on a shallow copy so that later processing of `phrases` cannot alter `phrases_data`.
phrases = list(phrases_data)

phrases

['delete files with inode number specified by REGEX under current directory',
 'files',
 'with inode number specified by REGEX under current directory',
 'inode number',
 'inode number',
 'specified by REGEX under current directory',
 'by REGEX',
 'REGEX',
 'under current directory',
 'current directory',
 'current directory']

In [20]:
def preprocess(phrases, words=None, max_words=4):
    """Segment a sentence into short phrases plus the words no phrase covers.

    Steps:
      1. Drop phrases with more than ``max_words`` tokens.
      2. Drop exact duplicates (first occurrence kept, order preserved).
      3. Drop phrases whose tokens appear contiguously inside another remaining
         phrase (e.g. ``'REGEX'`` is dropped when ``'by REGEX'`` is present).
      4. Walk ``words`` left to right; at each position emit the longest
         remaining phrase that starts there, otherwise the single word.

    Args:
        phrases: iterable of whitespace-separated phrase strings. The input is
            not modified.
        words: the sentence tokens in order. When omitted they are taken from
            the notebook-level ``tree`` via ``get_node_phrases(tree, depth=1)``.
        max_words: maximum number of tokens a phrase may have to be kept.

    Returns:
        list[str]: phrases and single words, in sentence order.
    """

    def _occurs_in(needle, haystack):
        # True when the token tuple `needle` is a contiguous run inside `haystack`.
        span = len(needle)
        return any(
            haystack[start:start + span] == needle
            for start in range(len(haystack) - span + 1)
        )

    # Steps 1 and 2: build a new list instead of deleting from the list being
    # iterated (which skips the element following every deletion).
    short_unique = list(dict.fromkeys(
        p for p in phrases if len(p.split()) <= max_words
    ))

    # Step 3: compare on tokens, not characters, so 'a b' is not treated as
    # part of 'aa b'; this matches the token-tuple lookup used in step 4.
    tokenized = [(p, tuple(p.split())) for p in short_unique]
    kept = [
        (p, toks) for p, toks in tokenized
        if not any(toks != other and _occurs_in(toks, other) for _, other in tokenized)
    ]

    # add missing words in the right order
    if words is None:
        # Falls back to the notebook-level parse tree, one entry per word.
        words = get_node_phrases(tree, depth=1)
    words = list(words)

    phrase_tokens = {toks: p for p, toks in kept}
    longest_phrase = max(map(len, phrase_tokens), default=0)

    result = []
    i = 0
    while i < len(words):
        # No phrase can be longer than `longest_phrase` or run past the sentence end.
        for length in range(min(longest_phrase, len(words) - i), 0, -1):
            slice_ = tuple(words[i:i + length])
            if slice_ in phrase_tokens:
                result.append(phrase_tokens[slice_])
                i += length
                break
        else:
            # No phrase starts here: keep the bare word.
            result.append(words[i])
            i += 1
    return result
    

In [21]:
# Segment the example sentence into short phrases and stand-alone words.
# Note: preprocess() also reads the notebook-level `tree` to recover the word order.
result = preprocess(phrases)

In [22]:
# Paths of the raw GloVe vectors and of the converted copy written next to them.
glove_input_file = "glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.word2vec.txt"
# Convert the GloVe text file into word2vec text format so gensim can load it.
# NOTE(review): this call emits a warning (see cell output); it looks like gensim's
# deprecation notice. Newer gensim versions can reportedly read GloVe text directly via
# load_word2vec_format(..., no_header=True) -- confirm before switching.
glove2word2vec(glove_input_file, word2vec_output_file)
# Load from the path variable rather than a repeated literal so the two cannot drift apart.
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

  glove2word2vec(glove_input_file, word2vec_output_file)


In [23]:
def get_embeddings(phrases, vectors=None):
    """Embed each phrase as the mean of its in-vocabulary word vectors.

    Args:
        phrases: iterable of whitespace-separated phrase strings.
        vectors: word-vector lookup supporting ``word in vectors``,
            ``vectors[word]`` and a ``vector_size`` attribute (as gensim
            ``KeyedVectors`` does). Defaults to the notebook-level ``model``.

    Returns:
        list[torch.Tensor]: one 1-D tensor per phrase, in input order. Words
        missing from ``vectors`` are ignored; a phrase with no known word at
        all gets a zero vector so the output stays aligned with ``phrases``.
    """
    if vectors is None:
        # Fall back to the word vectors loaded at notebook level.
        vectors = model

    embedding_matrix = []
    for phrase in phrases:
        word_vectors = [
            torch.tensor(vectors[word]) for word in phrase.split() if word in vectors
        ]
        if word_vectors:
            # averaging layer
            phrase_vector = torch.stack(word_vectors).nanmean(dim=0)
        else:
            # torch.stack() rejects an empty list, so emit a zero placeholder
            # instead of crashing on a fully out-of-vocabulary phrase.
            phrase_vector = torch.zeros(vectors.vector_size)
        embedding_matrix.append(phrase_vector)

    return embedding_matrix

In [24]:
# Embed every segment of the preprocessed sentence; the result is a list with one
# tensor per entry of `result`.
embeddings = get_embeddings(result)
embeddings

[tensor([-0.9847, -0.3593,  0.4800,  0.0174, -0.0836,  0.3891, -0.5002, -0.0163,
          1.4212,  0.2648,  0.7294,  0.4001, -0.6046, -0.3532,  0.0976,  0.6482,
          0.1927, -0.0498, -0.2717,  0.2427,  0.0679,  0.0496,  0.5436,  0.4108,
         -0.0109, -0.6417, -0.5708,  0.2236,  0.1816, -0.4774,  0.8653,  0.8999,
         -0.8167,  0.2851, -0.4252, -0.0049, -0.5927, -0.8290, -0.0403, -0.2012,
         -0.0182,  0.1779, -0.1748, -0.4405, -0.6667,  0.5978, -0.4166,  0.0637,
         -0.4414, -0.6206,  0.7836,  1.2170, -0.7845, -0.3679, -0.8830, -0.1995,
          0.3578, -0.1070,  0.7295,  0.5112, -0.4249,  0.4940,  0.1179,  0.1765,
          0.8381, -0.1581,  1.0432,  0.4410, -0.1170, -0.2156,  0.0639,  0.3088,
          0.0833, -0.3197,  0.4735,  0.6192, -0.6173,  0.0967,  0.5988, -0.8721,
         -0.3186, -1.3572, -0.3145, -0.1944, -0.7842,  0.6458,  0.9027,  0.2160,
          0.1839,  0.5275, -0.5662,  0.8874,  0.3996, -0.4262, -0.0713, -0.9892,
          0.7064, -0.5807, -

'C:\\Users\\Vrishank\\NLC to Bash'