<a href="https://colab.research.google.com/github/Jaybhatt216/EIT/blob/main/tfid_d2v_use.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP)
* TextHero: TF-IDF
* Gensim: Doc2Vec
* TensorFlow2: Universal Sentence Encoder


In [7]:
!pip install texthero



In [8]:
#Import dependencies
import pandas as pd
import sqlite3
import texthero as hero
from texthero import preprocessing
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import tensorflow_hub as hub

In [10]:
# Establish a read-only connection to the sqlite database.
# Opening through a URI with mode=ro makes sqlite3 raise right here when the
# file is missing, instead of silently creating an empty database that only
# fails later with "no such table: cards".
DB_PATH = "/content/AllPrintings.sqlite"
conn = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True)

# Load the data into a pandas DataFrame.
# NOTE(review): this reads every column of every row in `cards`; the following
# cell reassigns df from a narrower query, so this frame is only useful for
# exploration.
df = pd.read_sql("SELECT * FROM cards", conn)

In [11]:
# Query text kept in its own variable so it can be inspected or reused.
# Selects the distinct (name, text, convertedManaCost, power, toughness, keywords)
# rows whose borderColor is 'black' and whose colorIdentity is exactly 'G'.
green_cards_sql = """SELECT DISTINCT name, text, convertedManaCost, power, toughness, keywords
                    FROM cards 
                    WHERE borderColor ='black' and colorIdentity = 'G'"""
df = pd.read_sql(green_cards_sql, con=conn)

In [12]:
# Preview the first three rows of the query result
df.head(n=3)

Unnamed: 0,name,text,convertedManaCost,power,toughness,keywords
0,Abundance,"If you would draw a card, you may instead choo...",4.0,,,
1,Aggressive Urge,Target creature gets +1/+1 until end of turn.\...,2.0,,,
2,Avatar of Might,If an opponent controls at least four more cre...,8.0,8.0,8.0,Trample


## 1. TextHero ~ TF-IDF

In [13]:
# Cleaning pipeline: runs texthero's fillna, remove_whitespace and
# remove_diacritics steps, in that order. The lowercase, remove_digits,
# remove_punctuation, remove_stopwords and remove_brackets steps are not applied.
custom_pipeline = [
    preprocessing.fillna,
    preprocessing.remove_whitespace,
    preprocessing.remove_diacritics,
]

# Translation table that deletes curly braces and parentheses while leaving
# the text between them untouched.
_BRACKET_CHARS = str.maketrans('', '', '{}()')

df['clean_text'] = hero.clean(df['text'], custom_pipeline)
# One pass per string instead of four separate .replace() sweeps over the column.
df['clean_text'] = [card_text.translate(_BRACKET_CHARS) for card_text in df['clean_text']]


In [14]:
# Vectorise the cleaned card text with texthero's built-in TF-IDF
# (max_features=3000), then store the result as a new column.
tfidf_vectors = hero.tfidf(df['clean_text'], max_features=3000)
df['tfidf'] = tfidf_vectors

In [15]:
# Preview the first two rows, now including the clean_text and tfidf columns
df.head(n=2)

Unnamed: 0,name,text,convertedManaCost,power,toughness,keywords,clean_text,tfidf
0,Abundance,"If you would draw a card, you may instead choo...",4.0,,,,"If you would draw a card, you may instead choo...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Aggressive Urge,Target creature gets +1/+1 until end of turn.\...,2.0,,,,Target creature gets +1/+1 until end of turn. ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## 2. GENSIM ~ Doc2Vec

In [16]:
# Tokenize each cleaned card text on single spaces and tag it with its
# position in the column, producing one TaggedDocument per card.
card_docs = []
for tag, card_text in enumerate(df['clean_text']):
    card_docs.append(TaggedDocument(words=card_text.split(' '), tags=[tag]))

# show the first two tagged documents
card_docs[:2]

[TaggedDocument(words=['If', 'you', 'would', 'draw', 'a', 'card,', 'you', 'may', 'instead', 'choose', 'land', 'or', 'nonland', 'and', 'reveal', 'cards', 'from', 'the', 'top', 'of', 'your', 'library', 'until', 'you', 'reveal', 'a', 'card', 'of', 'the', 'chosen', 'kind.', 'Put', 'that', 'card', 'into', 'your', 'hand', 'and', 'put', 'all', 'other', 'cards', 'revealed', 'this', 'way', 'on', 'the', 'bottom', 'of', 'your', 'library', 'in', 'any', 'order.'], tags=[0]),
 TaggedDocument(words=['Target', 'creature', 'gets', '+1/+1', 'until', 'end', 'of', 'turn.', 'Draw', 'a', 'card.'], tags=[1])]

In [17]:
# Doc2Vec hyperparameters, gathered in one place so they are easy to tune.
# NOTE(review): gensim documents that a run is only fully reproducible with
# workers=1 (plus a fixed PYTHONHASHSEED) - confirm whether run-to-run
# variation in the vectors is acceptable here.
doc2vec_params = {
    'vector_size': 64,
    'window': 2,
    'min_count': 1,
    'workers': 8,
    'epochs': 40,
}

# instantiate the (still untrained) model
model = Doc2Vec(**doc2vec_params)

# build the vocabulary from the tagged card documents
model.build_vocab(card_docs)

In [18]:
# Train on the tagged card documents, reusing the document count recorded by
# build_vocab and the epoch count the model was constructed with.
n_docs = model.corpus_count
n_epochs = model.epochs
model.train(card_docs, total_examples=n_docs, epochs=n_epochs)

In [19]:
# Generate one inferred vector per card.
# Iterate over the column values directly rather than looking each row up by
# integer label, so this does not depend on df having a default 0..n-1 index.
card2vec = [model.infer_vector(card_text.split(' '))
            for card_text in df['clean_text']]

# show the first two vectors
card2vec[:2]

[array([-0.02799185, -0.0772426 , -0.36847344,  0.03060058, -0.45323434,
         0.33200637, -0.07691256, -0.21190536,  0.06758084, -0.20038837,
         0.14505054,  0.03811314,  0.24304587,  0.08914188, -0.29768044,
         0.2639503 , -0.4560689 ,  0.24659282,  0.42377472, -0.40364796,
        -0.29376355, -0.4097813 , -0.04198782, -0.20994939, -0.35303277,
        -0.334213  , -0.36243427,  0.24083842, -0.22348206, -0.04240599,
        -0.2914769 ,  0.08412161,  0.21807835,  0.26273862,  0.2569802 ,
         0.08058798,  0.7845506 ,  0.2785888 , -0.47091147, -0.36375743,
        -0.32170734,  0.5529662 , -0.09608271,  0.3226026 ,  0.49067643,
        -0.38274574, -0.37346497,  0.1722648 , -0.01381182, -0.25455004,
        -0.22309446, -0.01512285,  0.16126776,  0.13857964,  0.24432206,
        -0.74309367, -0.2021792 ,  0.40839067, -0.02102263,  0.6340034 ,
        -0.75021434,  0.5642243 ,  0.45907605, -0.17468336], dtype=float32),
 array([-0.19109099, -0.12576127, -0.1066934 , 

In [20]:
# Turn each numpy vector into a plain Python list, giving a list of lists
dtv = [vector.tolist() for vector in card2vec]

In [21]:
# Store the per-card vectors as a new dataframe column, one list per row
df['card2vec'] = pd.Series(dtv, index=df.index)

In [22]:
# Preview the first two rows, now including the card2vec column
df.head(n=2)

Unnamed: 0,name,text,convertedManaCost,power,toughness,keywords,clean_text,tfidf,card2vec
0,Abundance,"If you would draw a card, you may instead choo...",4.0,,,,"If you would draw a card, you may instead choo...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.027991849929094315, -0.07724259793758392, ..."
1,Aggressive Urge,Target creature gets +1/+1 until end of turn.\...,2.0,,,,Target creature gets +1/+1 until end of turn. ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.19109098613262177, -0.1257612705230713, -0..."


## 3. TensorFlow2 ~ Universal Sentence Encoder

# Fetch the Universal Sentence Encoder (large, version 5) from TensorFlow Hub
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(use_model_url)


# Embed the cleaned card text with the loaded encoder
embeddings = embed(df['clean_text'])

# Convert the embeddings to nested Python lists
use = np.asarray(embeddings).tolist()

# Attach a fresh copy of that list as a dataframe column
df['use'] = list(use)

# Preview the first two rows, now including the 'use' column
df.head(n=2)

## 4. TextHero ~ t-SNE

In [23]:
# Project each vector representation with texthero's t-SNE wrapper.
# t-SNE is stochastic, so a fixed random_state is passed to make the
# projections (and the scatter plots built from them) repeatable between runs.
TSNE_SEED = 42
df['tsnetfidf'] = hero.tsne(df['tfidf'], random_state=TSNE_SEED)
df['tsnec2v'] = hero.tsne(df['card2vec'], random_state=TSNE_SEED)
# The Universal Sentence Encoder projection is currently disabled:
#df['tsneuse'] = hero.tsne(df['use'], random_state=TSNE_SEED)

In [24]:
# Draw one scatter plot per t-SNE projection, coloured by convertedManaCost,
# with the card name and text passed as hover data.
tsne_plots = [
    ('tsnetfidf', "TF-IDF"),   # projection of the tfidf vectors
    ('tsnec2v', "Doc2Vec"),    # projection of the doc2vec vectors
]
for tsne_col, plot_title in tsne_plots:
    hero.scatterplot(df, col=tsne_col, color='convertedManaCost',
                     title=plot_title, hover_data=['name', 'text'])
# To also plot the Universal Sentence Encoder projection, append
# ('tsneuse', "U.S.E") to tsne_plots once df['tsneuse'] has been computed.