<a href="https://colab.research.google.com/github/Matonice/Advance_NLP/blob/main/doc_2_vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Read in the data, clean it, and split it into train and test sets
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option("display.max_colwidth", 100)

# Load the tab-separated SMS file; it is read without a header row, so the
# two columns are named explicitly afterwards.
messages = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header=None)
messages.columns = ["label", "text"]
# Tokenize every message with gensim's simple_preprocess.
messages["text_clean"] = messages["text"].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing. random_state pins the shuffle so
# that a fresh "Restart & Run All" reproduces the same train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    messages["text_clean"],
    messages["label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the training corpus for doc2vec: each tokenized training message becomes
# a TaggedDocument whose single tag is its position in x_train.
tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(x_train)
]

In [None]:
# Inspect the first tagged document: the message tokens are stored in `words`
# and its integer id (position in x_train) in `tags`
tagged_docs[0]

TaggedDocument(words=['yup'], tags=[0])

In [None]:
# Fit a baseline doc2vec model on the tagged training documents.
d2v_model = gensim.models.Doc2Vec(
    documents=tagged_docs,
    vector_size=100,  # dimensionality of each document vector
    window=5,         # context window size
    min_count=2,      # minimum word frequency kept in the vocabulary
)

In [None]:
# Infer a vector for a single word. Unlike a word2vec lookup, infer_vector
# expects a list of tokens, not a bare string: a str would be iterated character
# by character by older gensim releases and is rejected with a TypeError by
# newer ones, so the word is wrapped in a one-element list.
d2v_model.infer_vector(["text"])

array([ 1.8518474e-03,  2.1497316e-03, -2.4290672e-03,  2.4674453e-03,
       -6.2719220e-04,  2.1419921e-03, -1.7251946e-03,  4.2563681e-03,
        2.2278498e-03,  1.4278871e-03, -2.1014360e-03,  4.7118580e-03,
        2.5756005e-04,  3.6186674e-03, -3.7917393e-04, -1.4625106e-03,
       -2.5535764e-03, -4.6995821e-04,  1.0829445e-04,  2.5812529e-03,
       -2.8033389e-04,  1.8003507e-03,  7.0399925e-04, -3.4532421e-03,
        2.6799222e-03, -2.7733147e-03,  2.7361556e-03,  1.5400619e-03,
       -2.3822826e-03, -1.7998698e-03, -1.6322393e-03,  2.1355383e-03,
        2.0502196e-03, -4.9602226e-03,  4.4020838e-03,  2.3983177e-03,
       -3.8313565e-03, -2.2714795e-03, -3.8779255e-03,  4.3577729e-03,
       -2.8859389e-03,  3.5219230e-03, -4.7811545e-03,  1.1761857e-03,
       -2.9107109e-03, -3.6207342e-03,  1.9478683e-03,  2.7054290e-03,
        1.5073792e-03, -2.1972891e-03, -2.3959656e-03,  3.0994962e-03,
       -2.4647608e-03,  7.8299950e-04,  1.0334685e-03,  3.9156321e-03,
      

In [None]:
# Infer a vector for a short sentence supplied as a list of word tokens
d2v_model.infer_vector("i am learning nlp".split())

array([-0.00104915, -0.0089278 ,  0.0023408 , -0.01063483, -0.00659578,
       -0.00730282, -0.00071384,  0.0120982 , -0.0057387 , -0.00217667,
       -0.00240927, -0.00533625, -0.00453134,  0.00300744,  0.02129553,
        0.01185961,  0.00306265,  0.00728938,  0.00686021,  0.01844876,
       -0.00250026,  0.00410049,  0.0066724 , -0.01158634,  0.00297685,
        0.01375889, -0.00407913, -0.00662036,  0.00317311, -0.0126824 ,
        0.00093943, -0.00568859, -0.01369051, -0.00296576,  0.0026849 ,
       -0.00606743, -0.00844113, -0.003119  , -0.00484257, -0.01022323,
        0.00283848,  0.00800826,  0.00431864,  0.01590278, -0.0064741 ,
        0.0084478 , -0.00752619, -0.00284124, -0.00981975, -0.00931515,
        0.00586541,  0.00274791,  0.00526384,  0.00288979,  0.00912113,
       -0.00607544,  0.00721384, -0.00516292, -0.0119504 ,  0.00741274,
        0.00404627, -0.00786221, -0.00391567, -0.00185335,  0.0094073 ,
        0.00625611, -0.00714791, -0.0074401 ,  0.01368645, -0.00

## How To Prep Document Vectors for Modelling

In [None]:
# Infer one document vector per tokenized test message so the vectors can be
# fed to a downstream machine learning model.
# NOTE(review): each vector is kept wrapped in its own single-element list, so
# vectors[k] is [array] rather than the bare array -- confirm that downstream
# code really wants this extra level of nesting.
vectors = list(map(lambda tokens: [d2v_model.infer_vector(tokens)], x_test))

In [None]:
# Inspect the entry for the first test message: a one-element list holding
# the vector inferred for that message
vectors[0]

[array([ 0.00456323, -0.02025962, -0.00342739, -0.00820782, -0.00820757,
        -0.00357211, -0.00438621,  0.0126257 , -0.012049  ,  0.00484461,
        -0.00420929, -0.00188905, -0.00089585, -0.00198555,  0.03423259,
         0.02560349, -0.00274939,  0.0030708 ,  0.01405196,  0.02541301,
        -0.00404888, -0.00457104,  0.01157917, -0.0272418 ,  0.01058479,
         0.01442741, -0.01086234, -0.01352994,  0.00705355, -0.01441785,
        -0.00094644, -0.00614414, -0.01695027,  0.00157501,  0.00399362,
        -0.01272759, -0.01255522, -0.01054734, -0.0143744 , -0.01803095,
         0.00848334,  0.0118285 ,  0.007517  ,  0.02649814, -0.00415461,
         0.01234586, -0.00925016, -0.01314956, -0.01244926, -0.00967544,
         0.01153818,  0.01436212,  0.00345156,  0.00322442,  0.00705212,
        -0.01480045,  0.01487788, -0.01321113, -0.01580823,  0.0210744 ,
         0.00877969, -0.0213322 , -0.01108041, -0.00013384,  0.01290506,
         0.00207823, -0.00853593, -0.00440719,  0.0