In [5]:
import spacy

In [15]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Task.1

In [3]:
# Corpus from the exercise. Each list element is treated as one document,
# i.e. it becomes one row of the count matrix built below.
corpus = [
    "Lincoln1865: '...with malice toward none, with charity for all ...'",
    "Wilson1917: '...It must strive to run to finish the work we are in ...'",
    "'to do all which may achieve and cherish a just and lasting peace, '",
    "'among ourselves, and with all nations.'",
    "TrumpMay26: 'There is NO WAY (ZERO!) that Mail-In Ballots '",
    "'will be anything less than substantially fraudulent.'",
    "Wikipedia: 'In 1998, Oregon became the first state in the US '",
    "'to conduct all voting exclusively by mail.'",
    "FortuneMay26: 'Over the last two decades, about 0.00006% of total '",
    "'vote-by-mail votes cast were fraudulent.'",
    "TellAllPri07: 'Trump voted by mail in the Florida primary.'",
    "KingJamesBible: 'Wherefore laying aside all malice, and all guile, and '",
    "'hypocrisies, and envies, and all evil speakings,'"
]

# Bag-of-words vectorizer with scikit-learn's default settings
vectorizer = CountVectorizer()

# Learn the vocabulary and build the sparse document-term count matrix
# (rows are documents, columns are vocabulary terms)
X = vectorizer.fit_transform(corpus)

# Densify and label the columns with the learned vocabulary for readability.
# pandas is imported with the other dependencies at the top of the notebook
# rather than in the middle of this cell.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

print(df)


    00006  1998  about  achieve  all  among  and  anything  are  aside  ...  \
0       0     0      0        0    1      0    0         0    0      0  ...   
1       0     0      0        0    0      0    0         0    1      0  ...   
2       0     0      0        1    1      0    2         0    0      0  ...   
3       0     0      0        0    1      1    1         0    0      0  ...   
4       0     0      0        0    0      0    0         0    0      0  ...   
5       0     0      0        0    0      0    0         1    0      0  ...   
6       0     1      0        0    0      0    0         0    0      0  ...   
7       0     0      0        0    1      0    0         0    0      0  ...   
8       1     0      1        0    0      0    0         0    0      0  ...   
9       0     0      0        0    0      0    0         0    0      0  ...   
10      0     0      0        0    0      0    0         0    0      0  ...   
11      0     0      0        0    2      0    2    

# Task.2

In [6]:
# Load spaCy's small English pipeline once for the whole notebook.
# `nlp` is called by spacy_tokenizer below to tokenize and lemmatize each document.
nlp = spacy.load("en_core_web_sm")

# Define the tokenizer function
def spacy_tokenizer(document):
    """Split a document into lemmas using the module-level spaCy pipeline.

    Args:
        document: Raw text of a single document.

    Returns:
        list of str: The lemma of every token in the document, skipping
        punctuation tokens and whitespace-only tokens.
    """
    # spaCy emits separate tokens for runs of extra whitespace (double spaces,
    # newlines). They are not flagged as punctuation, so filter them explicitly
    # to keep blank "terms" out of the vocabulary.
    return [
        token.lemma_
        for token in nlp(document)
        if not (token.is_punct or token.is_space)
    ]

# Initialize CountVectorizer with the spaCy tokenizer.
# token_pattern=None: the default regex is ignored once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer_spacy = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)

# Learn the lemma vocabulary and build the sparse document-term count matrix
# (rows are documents, columns are lemmas)
X_spacy = vectorizer_spacy.fit_transform(corpus)

# Display the matrix as a data frame for better readability
df_spacy = pd.DataFrame(X_spacy.toarray(), columns=vectorizer_spacy.get_feature_names_out())

print(df_spacy)


    0.00006  1998  a  about  achieve  all  among  and  anything  aside  ...  \
0         0     0  0      0        0    1      0    0         0      0  ...   
1         0     0  0      0        0    0      0    0         0      0  ...   
2         0     0  1      0        1    1      0    2         0      0  ...   
3         0     0  0      0        0    1      1    1         0      0  ...   
4         0     0  0      0        0    0      0    0         0      0  ...   
5         0     0  0      0        0    0      0    0         1      0  ...   
6         0     1  0      0        0    0      0    0         0      0  ...   
7         0     0  0      0        0    1      0    0         0      0  ...   
8         1     0  0      1        0    0      0    0         0      0  ...   
9         0     0  0      0        0    0      0    0         0      0  ...   
10        0     0  0      0        0    0      0    0         0      0  ...   
11        0     0  0      0        0    2      0    



# Task.3

In [8]:
# Perform LSA using TruncatedSVD, keeping three latent topics.
# TruncatedSVD's default solver is randomized, so pin random_state to make the
# printed numbers reproducible across kernel restarts.
lsa_model = TruncatedSVD(n_components=3, random_state=42)
lsa_topic_matrix = lsa_model.fit_transform(X_spacy)

# Display the three-dimensional LSA representation of the documents
# (one row per document, one column per topic)
print(lsa_topic_matrix)


[[ 1.14123391 -0.37601529  0.30726399]
 [ 1.00679219  2.65079714 -0.77087247]
 [ 2.93162245 -0.47923397 -0.42757393]
 [ 1.43399476 -0.4368787   0.06409414]
 [ 0.32370801  1.27985684  1.26107408]
 [ 0.09229077  0.35483278  0.78481138]
 [ 0.6049887   2.86698755 -1.45847264]
 [ 0.97357338  0.4686548   0.97272987]
 [ 0.14931312  0.7518458  -0.7047208 ]
 [ 0.29554952  0.9511996   2.29279122]
 [ 0.44416077  1.71919247  1.24418365]
 [ 2.97395453 -0.9092797   0.03908033]
 [ 2.06781449 -0.62815435 -0.09649902]]


# Task.4

In [12]:
# Cosine similarity helper for a pair of vectors
def cosine_angle(v1, v2):
    """Return the cosine similarity between two vectors as a single number.

    Args:
        v1: First vector; must support ``reshape`` (e.g. a NumPy array).
        v2: Second vector, with the same number of elements as ``v1``.

    Returns:
        The single entry of the 1x1 similarity matrix computed by
        scikit-learn's ``cosine_similarity``.
    """
    # cosine_similarity expects 2-D input, so view each vector as one row
    row_a, row_b = (vec.reshape(1, -1) for vec in (v1, v2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# For the words, we need to use the components_ of the LSA model: column j holds
# the weight of vocabulary term j in each of the topics (the "eigenwords" if you will)
word_vectors = lsa_model.components_

# Look up the column index of each word in the vocabulary of the vectorizer that
# produced X_spacy, the matrix the LSA model was fitted on. Defining the indices
# here keeps the cell runnable on a fresh kernel. A KeyError means the lemma is
# not part of the fitted vocabulary.
malice_idx = vectorizer_spacy.vocabulary_["malice"]
vote_idx = vectorizer_spacy.vocabulary_["vote"]

# Get the vector for the word 'malice' and 'vote' from the word_vectors
malice_vector = word_vectors[:, malice_idx].reshape(1, -1)
vote_vector = word_vectors[:, vote_idx].reshape(1, -1)

# Now we can compute the cosine similarity between the vectors of 'malice' and 'vote'
cosine_sim = cosine_angle(malice_vector, vote_vector)
print(f"Cosine similarity between 'malice' and 'vote': {cosine_sim}")


Cosine similarity between 'malice' and 'vote': 0.11076024369466277


# Task.5

In [None]:
# Term-document matrix is already computed and displayed in previous tasks.


# Task.6


In [16]:
# Build the TF-IDF weighted document-term matrix and fit a three-topic LSA model
# on it, so that this cell is runnable on a fresh kernel.
# NOTE(review): this reuses spacy_tokenizer to mirror the count-based LSA above;
# the recorded output below may have come from a different configuration - confirm.
# token_pattern=None avoids scikit-learn's "unused token_pattern" warning.
vectorizer_tfidf = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
X_tfidf = vectorizer_tfidf.fit_transform(corpus)

# random_state pins TruncatedSVD's randomized solver for reproducible results
lsa_tfidf_model = TruncatedSVD(n_components=3, random_state=42)
lsa_tfidf_model.fit(X_tfidf)

# The word vectors are in the components_ of the LSA model
# (column j holds the topic weights of vocabulary term j)
word_vectors_tfidf = lsa_tfidf_model.components_

# Column indices of the two words in the TF-IDF vocabulary
malice_tfidf_idx = vectorizer_tfidf.vocabulary_["malice"]
vote_tfidf_idx = vectorizer_tfidf.vocabulary_["vote"]

# Get the vector for the word 'malice' and 'vote' from the word_vectors using TF-IDF
malice_vector_tfidf = word_vectors_tfidf[:, malice_tfidf_idx].reshape(1, -1)
vote_vector_tfidf = word_vectors_tfidf[:, vote_tfidf_idx].reshape(1, -1)

# Now we can compute the cosine similarity between the vectors of 'malice' and 'vote'
cosine_sim_tfidf = cosine_angle(malice_vector_tfidf, vote_vector_tfidf)
print(f"Cosine similarity between 'malice' and 'vote' using TF-IDF: {cosine_sim_tfidf}")


Cosine similarity between 'malice' and 'vote' using TF-IDF: -0.026296222635368946
