Word Embedding

In [1]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Preprocessing the text <br>
word_tokenize: tokenizes the text <br>
3rd line of the function is to remove the stopwords <br>
4th line of the function is to remove the punctuations <br>
5th line of the function is to remove the numbers <br>
6th line of the function is to remove the words with length less than 3

In [2]:
def preprocess(text):
# Tokenize the text
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]
    tokens = [token for token in tokens if not re.match(r'\d+', token)]
    tokens = [token for token in tokens if token.strip()]
    return tokens

Making the TF-DIF matrix of the first document

In [3]:
def TIFDIF(text):
    # Compute TF-IDF
    tokens = preprocess(text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([' '.join(tokens)])

    # Access the computed TF-IDF scores
    feature_names = vectorizer.get_feature_names()
    data = []
    for col in tfidf_matrix.nonzero()[1]:
        data.append({'Token': feature_names[col], 'TF-IDF score': tfidf_matrix[0, col]})

    df = pd.DataFrame(data)
    print(df)

In [5]:
import nltk
import spacy

def preprocess_text(text):
    # Tokenize the text using NLTK
    tokens = nltk.word_tokenize(text)
    
    # Define commonly occurring phrases to be replaced
    phrase_mapping = {
        'New York City': '__place__',
        'John Smith': '__name__',
        # Add more commonly occurring phrases as needed
    }
    
    # Load the SpaCy English model
    nlp = spacy.load('en_core_web_sm', disable=['parser'])
    
    # Perform NER using SpaCy
    doc = nlp(text)
    
    preprocessed_tokens = []
    
    for token in tokens:
        # Replace commonly occurring phrases with a single word
        if token in phrase_mapping:
            preprocessed_tokens.append(phrase_mapping[token])
        # Replace proper nouns with tags
        else:
            is_proper_noun = any(ent.text == token and ent.label_ in ['PERSON', 'GPE'] for ent in doc.ents)
            if is_proper_noun:
                if token.istitle():
                    preprocessed_tokens.append('__name__')
                else:
                    preprocessed_tokens.append('__place__')
            else:
                preprocessed_tokens.append(token)
    
    # Join the preprocessed tokens back into a text
    preprocessed_text = ' '.join(preprocessed_tokens)
    
    return preprocessed_text


# Example usage
text = "John Smith is a software engineer from New York City. He works at XYZ Company."
preprocessed_text = preprocess_text(text)
print(preprocessed_text)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [9]:
from os import listdir
path = path = "./Reliance_Dataset"
txts = [x for x in listdir(path) if x[-3:] == 'txt']
print(txts)

['500325_2012_cleantext.txt', '500325_2016_cleantext.txt', '500325_2011_cleantext.txt', '500325_2015_cleantext.txt', '500325_2014_cleantext.txt', '500325_2010_cleantext.txt', '500325_2017_cleantext.txt', '500325_2013_cleantext.txt']


In [12]:
corpus = ''
for f in txts:
    with open("Reliance_Dataset/" + f, 'r') as file:
        text = file.read()
        corpus += '\n' + text 

TIFDIF(corpus)



                Token  TF-IDF score
0         investigate      0.000610
1          committee1      0.000610
2              powers      0.001220
3      riskmanagement      0.000610
4      theperformance      0.000610
...               ...           ...
10171  forwardlooking      0.007318
10172          future      0.039031
10173             new      0.107944
10174           india      0.198203
10175      partnering      0.012807

[10176 rows x 2 columns]


Word2Vec Model of the first document

In [40]:
tokens = preprocess(corpus)
model = Word2Vec([tokens], vector_size=300, window=5, min_count=1, epochs = 20)
word_vectors = model.wv
print(model.wv.most_similar('collaborate'))

[('slowdown', 0.8093000650405884), ('East', 0.8064174056053162), ('Government', 0.8059264421463013), ('Asia', 0.8056067228317261), ('expected', 0.8050495386123657), ('need', 0.8050317764282227), ('globally', 0.8050126433372498), ('grow', 0.8047935366630554), ('cost', 0.804621696472168), ('Retail', 0.8043539524078369)]


In [41]:
print(model.wv.most_similar('collaboration'))

[('JCB', 0.23696570098400116), ('campaignis', 0.2305629700422287), ('disruptions', 0.20600491762161255), ('theBest', 0.20320232212543488), ('Circulation', 0.19682489335536957), ('Processing', 0.19563399255275726), ('succeedingmeeting', 0.1877049207687378), ('reinstallationof', 0.18250811100006104), ('premise', 0.17723533511161804), ('circle6Maximising', 0.17693930864334106)]


In [42]:
print(model.wv.most_similar('collaborative'))

[('Benzene', 0.6686267852783203), ('users', 0.6644120216369629), ('design', 0.6638323664665222), ('respectively', 0.6637520790100098), ('Infotel', 0.6636399626731873), ('complex', 0.6630818843841553), ('economies', 0.6627340912818909), ('earnings', 0.66253662109375), ('always', 0.662131130695343), ('rating', 0.6611295342445374)]


In [43]:
#print(model.wv.most_similar('cooperate')) # KeyError: "Key 'cooperate' not present"
print(model.wv.most_similar('cooperation')) 

[('coal', 0.4279607832431793), ('footwear', 0.4197697937488556), ('certain', 0.417576402425766), ('information', 0.4171597361564636), ('IndustrialRelations', 0.4156395494937897), ('efficient', 0.4138558506965637), ('effect', 0.41383904218673706), ('contract', 0.4135437607765198), ('oils', 0.41193899512290955), ('TV18', 0.41181012988090515)]


In [44]:
print(model.wv.most_similar('teamwork'))

[('Companymade', 0.20959284901618958), ('strategyimplementation', 0.20767642557621002), ('ofLondon', 0.19712801277637482), ('Relief', 0.18848437070846558), ('environmentfriendly', 0.18557246029376984), ('tocredit', 0.18368777632713318), ('innumerable', 0.1804037094116211), ('Squarethrough', 0.18008442223072052), ('VFA', 0.17502018809318542), ('Belt', 0.17446376383304596)]
