<a href="https://colab.research.google.com/github/Matonice/Advance_NLP/blob/main/Building_model_on_Doc_2_vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Read in data, clean it, and then split it into train and test split
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option("display.max_colwidth", 100)

messages = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header=None)
#messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages.columns = ["label", "text"]
messages["text_clean"] = messages["text"].apply(lambda x: gensim.utils.simple_preprocess(x))

x_train, x_test, y_train, y_test = train_test_split(messages["text_clean"], messages["label"], test_size=0.2)

In [3]:
# Create  tagged document objects to prepare to train the model
tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(x_train)]
tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(x_test)]

In [4]:
# What do these TaggedDocument objects look like?
tagged_docs_train[:10]

[TaggedDocument(words=['long', 'after', 'quit', 'get', 'on', 'only', 'like', 'minutes', 'day', 'as', 'it', 'is'], tags=[0]),
 TaggedDocument(words=['the', 'monthly', 'amount', 'is', 'not', 'that', 'terrible', 'and', 'you', 'will', 'not', 'pay', 'anything', 'till', 'months', 'after', 'finishing', 'school'], tags=[1]),
 TaggedDocument(words=['miss', 'you', 'so', 'much', 'so', 'desparate', 'have', 'recorded', 'the', 'message', 'you', 'left', 'for', 'me', 'the', 'other', 'day', 'and', 'listen', 'to', 'it', 'just', 'to', 'hear', 'the', 'sound', 'of', 'your', 'voice', 'love', 'you'], tags=[2]),
 TaggedDocument(words=['congrats', 'year', 'special', 'cinema', 'pass', 'for', 'is', 'yours', 'call', 'now', 'suprman', 'matrix', 'starwars', 'etc', 'all', 'free', 'bx', 'ip', 'we', 'pm', 'dont', 'miss', 'out'], tags=[3]),
 TaggedDocument(words=['idc', 'get', 'over', 'here', 'you', 'are', 'not', 'weaseling', 'your', 'way', 'out', 'of', 'this', 'shit', 'twice', 'in', 'row'], tags=[4]),
 TaggedDocument(

In [5]:
# Train a basic doc2vec model
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                  vector_size=100,
                                  window=5,
                                  min_count=2)

In [7]:
# Infer the vectors to be used in training and testing
train_vectors = [d2v_model.infer_vector(v.words) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(v.words) for v in tagged_docs_test]

## Fit RandomForestClassifier ON Top Of Document Vectors

In [8]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

rf = RandomForestClassifier()
rf_model = rf.fit(train_vectors, y_train.values.ravel())

y_pred = rf_model.predict(test_vectors)

precision = precision_score(y_test, y_pred, pos_label="spam")
recall = recall_score(y_test, y_pred, pos_label="spam")
print("precision: {} / Recall: {} / Accuracy: {}".format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)
))


precision: 0.9 / Recall: 0.061 / Accuracy: 0.874
