In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Toy corpus: short text snippets keyed by a label naming their source.
c = {
    'Lincoln1865': 'With malice toward none, with charity for all ...',
    'TrumpMay26': 'There is NO WAY (ZERO!) that Mail-In Ballots ...',
    'Wikipedia': 'In 1998, Oregon became the first state in the US ...',
    'FortuneMay26': 'Over the last two decades, about 0.00006% of total ...',
    'TheHillApr07': 'Trump voted by mail in the Florida primary.',
    'KingJamesBible': 'Wherefore laying aside all malice, and all guile, ...',
}

# Plain bag-of-words counts using CountVectorizer's default settings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(c.values())

# Present the sparse counts as a labelled table: one row per document,
# one column per vocabulary term.
matrix = pd.DataFrame(
    data=X.toarray(),
    index=list(c),
    columns=vectorizer.get_feature_names_out(),
)

print(matrix)


                00006  1998  about  all  and  aside  ballots  became  by  \
Lincoln1865         0     0      0    1    0      0        0       0   0   
TrumpMay26          0     0      0    0    0      0        1       0   0   
Wikipedia           0     1      0    0    0      0        0       1   0   
FortuneMay26        1     0      1    0    0      0        0       0   0   
TheHillApr07        0     0      0    0    0      0        0       0   1   
KingJamesBible      0     0      0    2    1      1        0       0   0   

                charity  ...  total  toward  trump  two  us  voted  way  \
Lincoln1865           1  ...      0       1      0    0   0      0    0   
TrumpMay26            0  ...      0       0      0    0   0      0    1   
Wikipedia             0  ...      0       0      0    0   1      0    0   
FortuneMay26          0  ...      1       0      0    1   0      0    0   
TheHillApr07          0  ...      0       0      1    0   0      1    0   
KingJamesBible   

In [None]:
pip install spacy




In [None]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline once at module level;
# spacy_tokenizer calls this shared `nlp` object for every text it receives.
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(text):
    """Tokenize ``text`` with the module-level ``nlp`` pipeline and return lemmas.

    Punctuation and whitespace tokens are discarded; each remaining token is
    represented by its ``lemma_`` string, in document order.

    Args:
        text: The string to run through ``nlp``.

    Returns:
        A list of lemma strings.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip anything spaCy flags as punctuation or whitespace.
        if tok.is_punct or tok.is_space:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Bag-of-words counts again, this time tokenized by spacy_tokenizer so the
# vocabulary is built from lemmas, with scikit-learn's built-in English stop
# words removed.
# token_pattern=None: the regex pattern is never consulted when a custom
# tokenizer is supplied, and leaving it at its default makes scikit-learn emit
# a UserWarning about the unused parameter.
# NOTE(review): scikit-learn may also warn that the 'english' stop list is
# inconsistent with a lemmatizing tokenizer -- confirm it filters what you expect.
vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    stop_words='english',
    token_pattern=None,
)
X = vectorizer.fit_transform(c.values())
matrix = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=c.keys())

print(matrix)




                0.00006  1998  aside  ballot  charity  decade  florida  guile  \
Lincoln1865           0     0      0       0        1       0        0      0   
TrumpMay26            0     0      0       1        0       0        0      0   
Wikipedia             0     1      0       0        0       0        0      0   
FortuneMay26          1     0      0       0        0       1        0      0   
TheHillApr07          0     0      0       0        0       0        1      0   
KingJamesBible        0     0      1       0        0       0        0      1   

                lay  mail  malice  oregon  primary  state  total  trump  vote  \
Lincoln1865       0     0       1       0        0      0      0      0     0   
TrumpMay26        0     1       0       0        0      0      0      0     0   
Wikipedia         0     0       0       1        0      1      0      0     0   
FortuneMay26      0     0       0       0        0      0      1      0     0   
TheHillApr07      0     1  



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the same corpus, tokenized by spacy_tokenizer with
# scikit-learn's built-in English stop words removed.
# token_pattern=None: the regex pattern is never consulted when a custom
# tokenizer is supplied, and leaving it at its default makes scikit-learn emit
# a UserWarning about the unused parameter.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    stop_words='english',
    token_pattern=None,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(c.values())

print(pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=c.keys()))




                0.00006     1998     aside    ballot   charity   decade  \
Lincoln1865     0.00000  0.00000  0.000000  0.000000  0.773262  0.00000   
TrumpMay26      0.00000  0.00000  0.000000  0.521823  0.000000  0.00000   
Wikipedia       0.00000  0.57735  0.000000  0.000000  0.000000  0.00000   
FortuneMay26    0.57735  0.00000  0.000000  0.000000  0.000000  0.57735   
TheHillApr07    0.00000  0.00000  0.000000  0.000000  0.000000  0.00000   
KingJamesBible  0.00000  0.00000  0.462625  0.000000  0.000000  0.00000   

                 florida     guile       lay      mail    malice   oregon  \
Lincoln1865     0.000000  0.000000  0.000000  0.000000  0.634086  0.00000   
TrumpMay26      0.000000  0.000000  0.000000  0.427903  0.000000  0.00000   
Wikipedia       0.000000  0.000000  0.000000  0.000000  0.000000  0.57735   
FortuneMay26    0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
TheHillApr07    0.462625  0.000000  0.000000  0.379359  0.000000  0.00000   
KingJamesBib



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Represent each term by its column of per-document values in `matrix`.
malice_vector = matrix['malice'].to_numpy()
vote_vector = matrix['vote'].to_numpy()
mail_vector = matrix['mail'].to_numpy()

# Each 1-D term vector is reshaped into a single-row 2-D array before being
# passed to cosine_similarity; the 'vote' row is shared by both comparisons.
vote_row = vote_vector.reshape(1, -1)
cosine_malice_vote = cosine_similarity(malice_vector.reshape(1, -1), vote_row)
cosine_mail_vote = cosine_similarity(mail_vector.reshape(1, -1), vote_row)

print("Cosine similarity between 'malice' and 'vote':", cosine_malice_vote[0][0])
print("Cosine similarity between 'mail' and 'vote':", cosine_mail_vote[0][0])


Cosine similarity between 'malice' and 'vote': 0.0
Cosine similarity between 'mail' and 'vote': 0.7071067811865475
