# Iron Man Word 2 Vec Model

In [183]:
import pandas as pd
import numpy as np
import gensim
import os
import re

In [184]:
df = pd.read_csv('Iron_Man-NLP_Dataset/iron man synopsis.csv')
df

Unnamed: 0,Comic name,Synopsis
0,Ultimate Iron Man isuue 1,Ultimate Iron Man¬†#1 reveals that Tony Stark's...
1,Ultimate Iron Man isuue 2,"In Ultimate Iron Man #2, young Tony Stark is s..."
2,Ultimate Iron Man isuue 3,"In issue #3, Howard arrives with a SWAT team a..."
3,Ultimate Iron Man isuue 4,"Ultimate Iron Man #4 follows Tony Stark, short..."
4,Ultimate Iron Man isuue 5,Ultimate Iron Man¬†#5 shows that shortly after ...
5,Ultimate Iron Man volume 2,Despite surviving being blown up in his protot...
6,Armor Wars: Stark Wars,After Iron Man finishes a training session in ...
7,Armor Wars: Glitch,"The Raiders invade an Air Force plane, drawing..."
8,Armor Wars: The Last Mandroid,Iron Man ruthlessly attacks the Beetle as he t...
9,Armor Wars: Who Guards the Guardsmen?,The Captain (which was an alias Rogers used af...


---
> ##  Text Preprocessing


> #### Removing # from corpus

In [185]:
def preprocess_txt(txt):
    pattern =re.compile("[^a-zA-Z]")
    return pattern.sub(" ",txt)

In [186]:
df['Synopsis']=df['Synopsis'].apply(func=preprocess_txt)
df['Synopsis']

0     Ultimate Iron Man    reveals that Tony Stark s...
1     In Ultimate Iron Man     young Tony Stark is s...
2     In issue     Howard arrives with a SWAT team a...
3     Ultimate Iron Man    follows Tony Stark  short...
4     Ultimate Iron Man    shows that shortly after ...
5     Despite surviving being blown up in his protot...
6     After Iron Man finishes a training session in ...
7     The Raiders invade an Air Force plane  drawing...
8     Iron Man ruthlessly attacks the Beetle as he t...
9     The Captain  which was an alias Rogers used af...
10    The West Coast Avengers arrives at Tony s home...
11    Test pilot Jack Taggert demonstrates a flight ...
12    It is revealed that Tony survived the explosio...
13    While flying under the radar for many fans  th...
14    Written by Dennis O Neil in       this rather ...
15    A pivotal story in the Iron Man mythos  Extrem...
16    Written by David Michelinie  the two issue sto...
17    The saga that would initiate something gre

> ### Because of this ‚¨ÜÔ∏èüëÜ I am getting more space between words So I am using thisüëá‚¨áÔ∏è method to overcome from this issue

In [187]:

df['Synopsis'] = df['Synopsis'].str.replace("       ", " ")
df.head()

Unnamed: 0,Comic name,Synopsis
0,Ultimate Iron Man isuue 1,Ultimate Iron Man reveals that Tony Stark s...
1,Ultimate Iron Man isuue 2,In Ultimate Iron Man young Tony Stark is s...
2,Ultimate Iron Man isuue 3,In issue Howard arrives with a SWAT team a...
3,Ultimate Iron Man isuue 4,Ultimate Iron Man follows Tony Stark short...
4,Ultimate Iron Man isuue 5,Ultimate Iron Man shows that shortly after ...


> #### Lower Casing

In [188]:
df['Synopsis']=df['Synopsis'].str.lower()

In [189]:
df

Unnamed: 0,Comic name,Synopsis
0,Ultimate Iron Man isuue 1,ultimate iron man reveals that tony stark s...
1,Ultimate Iron Man isuue 2,in ultimate iron man young tony stark is s...
2,Ultimate Iron Man isuue 3,in issue howard arrives with a swat team a...
3,Ultimate Iron Man isuue 4,ultimate iron man follows tony stark short...
4,Ultimate Iron Man isuue 5,ultimate iron man shows that shortly after ...
5,Ultimate Iron Man volume 2,despite surviving being blown up in his protot...
6,Armor Wars: Stark Wars,after iron man finishes a training session in ...
7,Armor Wars: Glitch,the raiders invade an air force plane drawing...
8,Armor Wars: The Last Mandroid,iron man ruthlessly attacks the beetle as he t...
9,Armor Wars: Who Guards the Guardsmen?,the captain which was an alias rogers used af...


> #### Removing Punctuations

In [190]:
import string
puncs =string.punctuation
print(f'all the pucntuations that python consider as punctuations - {puncs}')

all the pucntuations that python consider as punctuations - !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [191]:
def remove_punc(text):
    txt = text.translate(str.maketrans('','',puncs))
    return txt

In [192]:
df['Synopsis']=df['Synopsis'].apply(remove_punc)

In [193]:
df

Unnamed: 0,Comic name,Synopsis
0,Ultimate Iron Man isuue 1,ultimate iron man reveals that tony stark s...
1,Ultimate Iron Man isuue 2,in ultimate iron man young tony stark is s...
2,Ultimate Iron Man isuue 3,in issue howard arrives with a swat team a...
3,Ultimate Iron Man isuue 4,ultimate iron man follows tony stark short...
4,Ultimate Iron Man isuue 5,ultimate iron man shows that shortly after ...
5,Ultimate Iron Man volume 2,despite surviving being blown up in his protot...
6,Armor Wars: Stark Wars,after iron man finishes a training session in ...
7,Armor Wars: Glitch,the raiders invade an air force plane drawing...
8,Armor Wars: The Last Mandroid,iron man ruthlessly attacks the beetle as he t...
9,Armor Wars: Who Guards the Guardsmen?,the captain which was an alias rogers used af...


> #### Removing Stopwords

In [194]:
from nltk.corpus import stopwords

In [195]:
stopwords_lst = stopwords.words('english')

In [196]:
stopwords_lst

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [197]:
df['Synopsis']

0     ultimate iron man    reveals that tony stark s...
1     in ultimate iron man     young tony stark is s...
2     in issue     howard arrives with a swat team a...
3     ultimate iron man    follows tony stark  short...
4     ultimate iron man    shows that shortly after ...
5     despite surviving being blown up in his protot...
6     after iron man finishes a training session in ...
7     the raiders invade an air force plane  drawing...
8     iron man ruthlessly attacks the beetle as he t...
9     the captain  which was an alias rogers used af...
10    the west coast avengers arrives at tony s home...
11    test pilot jack taggert demonstrates a flight ...
12    it is revealed that tony survived the explosio...
13    while flying under the radar for many fans  th...
14    written by dennis o neil in this rather bleak ...
15    a pivotal story in the iron man mythos  extrem...
16    written by david michelinie  the two issue sto...
17    the saga that would initiate something gre

In [198]:
for i in df['Synopsis']:
    sentence = i.split()
    print(sentence)

['ultimate', 'iron', 'man', 'reveals', 'that', 'tony', 'stark', 's', 'genius', 'is', 'the', 'result', 'of', 'an', 'accident', 'his', 'mother', 'brilliant', 'scientist', 'maria', 'cerrera', 'who', 'is', 'the', 'second', 'wife', 'of', 'tony', 'stark', 's', 'father', 'howard', 'and', 'works', 'for', 'him', 'in', 'research', 'and', 'development', 'before', 'tony', 'was', 'born', 'suffered', 'while', 'she', 'was', 'carrying', 'him', 'in', 'her', 'womb', 'the', 'accident', 'changed', 'the', 'genetic', 'structure', 'of', 'both', 'her', 'and', 'her', 'unborn', 'child', 'but', 'culminated', 'in', 'her', 'death', 'during', 'childbirth', 'the', 'child', 'named', 'antonio', 'tony', 'for', 'short', 'developed', 'neural', 'tissue', 'normally', 'found', 'only', 'in', 'the', 'brain', 'all', 'throughout', 'his', 'body', 'causing', 'his', 'entire', 'body', 'to', 'act', 'as', 'one', 'massive', 'brain', 'giving', 'him', 'tremendous', 'mental', 'capacity', 'however', 'one', 'of', 'the', 'side', 'effects', 

In [199]:
for i in df['Synopsis']:
    sentence = i.split()
    for word in sentence:
        if word not in stopwords_lst:
            print(word)

            

ultimate
iron
man
reveals
tony
stark
genius
result
accident
mother
brilliant
scientist
maria
cerrera
second
wife
tony
stark
father
howard
works
research
development
tony
born
suffered
carrying
womb
accident
changed
genetic
structure
unborn
child
culminated
death
childbirth
child
named
antonio
tony
short
developed
neural
tissue
normally
found
brain
throughout
body
causing
entire
body
act
one
massive
brain
giving
tremendous
mental
capacity
however
one
side
effects
accident
minute
born
extreme
dermal
sensitivity
making
even
sensation
air
skin
feel
like
severe
burns
due
overstimulation
neural
cells
skin
howard
stark
master
inventor
owner
ceo
billion
dollar
tech
company
used
newly
invented
liquid
biological
armour
ease
tony
agony
buffer
allowing
interact
world
normally
one
would
wear
rest
life
despite
pain
endowed
regenerative
capabilities
due
mutation
caused
neural
cells
differentiate
body
allowing
completely
regenerate
whole
body
parts
necessary
biotechnology
armor
wears
constructed
genet

In [200]:
corpus = []

for sentence in df['Synopsis']:
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in stopwords_lst]
    filtered_sentence = ' '.join(filtered_words)
    corpus.append(filtered_sentence)


In [201]:
corpus

['ultimate iron man reveals tony stark genius result accident mother brilliant scientist maria cerrera second wife tony stark father howard works research development tony born suffered carrying womb accident changed genetic structure unborn child culminated death childbirth child named antonio tony short developed neural tissue normally found brain throughout body causing entire body act one massive brain giving tremendous mental capacity however one side effects accident minute born extreme dermal sensitivity making even sensation air skin feel like severe burns due overstimulation neural cells skin howard stark master inventor owner ceo billion dollar tech company used newly invented liquid biological armour ease tony agony buffer allowing interact world normally one would wear rest life despite pain endowed regenerative capabilities due mutation caused neural cells differentiate body allowing completely regenerate whole body parts necessary biotechnology armor wears constructed gen

### Simple Preprocessing

In [202]:
for list_num,corpus_ in enumerate(corpus):
    print(list_num,corpus_)

0 ultimate iron man reveals tony stark genius result accident mother brilliant scientist maria cerrera second wife tony stark father howard works research development tony born suffered carrying womb accident changed genetic structure unborn child culminated death childbirth child named antonio tony short developed neural tissue normally found brain throughout body causing entire body act one massive brain giving tremendous mental capacity however one side effects accident minute born extreme dermal sensitivity making even sensation air skin feel like severe burns due overstimulation neural cells skin howard stark master inventor owner ceo billion dollar tech company used newly invented liquid biological armour ease tony agony buffer allowing interact world normally one would wear rest life despite pain endowed regenerative capabilities due mutation caused neural cells differentiate body allowing completely regenerate whole body parts necessary biotechnology armor wears constructed gen

In [203]:
from gensim.utils import simple_preprocess
corpus_train = []
for doc in corpus:
    # print(doc)
    corpus_train.append(simple_preprocess(doc=doc))


In [204]:
corpus_train

[['ultimate',
  'iron',
  'man',
  'reveals',
  'tony',
  'stark',
  'genius',
  'result',
  'accident',
  'mother',
  'brilliant',
  'scientist',
  'maria',
  'cerrera',
  'second',
  'wife',
  'tony',
  'stark',
  'father',
  'howard',
  'works',
  'research',
  'development',
  'tony',
  'born',
  'suffered',
  'carrying',
  'womb',
  'accident',
  'changed',
  'genetic',
  'structure',
  'unborn',
  'child',
  'culminated',
  'death',
  'childbirth',
  'child',
  'named',
  'antonio',
  'tony',
  'short',
  'developed',
  'neural',
  'tissue',
  'normally',
  'found',
  'brain',
  'throughout',
  'body',
  'causing',
  'entire',
  'body',
  'act',
  'one',
  'massive',
  'brain',
  'giving',
  'tremendous',
  'mental',
  'capacity',
  'however',
  'one',
  'side',
  'effects',
  'accident',
  'minute',
  'born',
  'extreme',
  'dermal',
  'sensitivity',
  'making',
  'even',
  'sensation',
  'air',
  'skin',
  'feel',
  'like',
  'severe',
  'burns',
  'due',
  'overstimulation',
 

> ## *Model Training*

In [205]:
model = gensim.models.Word2Vec(
    window=20, # this means that in the target word I will have both side 15-15 words
    min_count=2, # my model will include only those words which have 2 letters
    vector_size=150 # this is output vector (the Features size that my model will create)
)

In [206]:
# Building vocabulary of unique words
model.build_vocab(corpus_train)

In [208]:
model.corpus_count # I have 22 lists

22

In [209]:
model.train(corpus,total_examples=model.corpus_count,epochs=500)

(0, 8614500)

In [210]:
model.wv.most_similar('tony')

[('bullies', 0.21625253558158875),
 ('strike', 0.2128761261701584),
 ('brain', 0.21076259016990662),
 ('robot', 0.19113215804100037),
 ('leads', 0.172602578997612),
 ('eventually', 0.16330061852931976),
 ('issue', 0.16205435991287231),
 ('poison', 0.16162188351154327),
 ('machine', 0.1587306708097458),
 ('general', 0.15527398884296417)]

In [211]:
model.wv.most_similar('stark')

[('sale', 0.249570831656456),
 ('traditional', 0.2346697598695755),
 ('student', 0.22436776757240295),
 ('maria', 0.22279521822929382),
 ('pain', 0.20276905596256256),
 ('pierce', 0.19829636812210083),
 ('two', 0.19731368124485016),
 ('stilt', 0.18197056651115417),
 ('also', 0.17066535353660583),
 ('dolores', 0.16753804683685303)]

In [212]:
model.wv.most_similar('iron')

[('home', 0.20444750785827637),
 ('forced', 0.20412953197956085),
 ('world', 0.2040136307477951),
 ('crowd', 0.19493207335472107),
 ('maria', 0.17859335243701935),
 ('blue', 0.1736636757850647),
 ('escapes', 0.17100517451763153),
 ('behind', 0.16551002860069275),
 ('technology', 0.15989360213279724),
 ('power', 0.15775124728679657)]

In [214]:
model.wv.most_similar('brain')

[('marvel', 0.2270006537437439),
 ('tissue', 0.21982981264591217),
 ('tony', 0.21076257526874542),
 ('washed', 0.20363619923591614),
 ('still', 0.1979152411222458),
 ('agony', 0.18144947290420532),
 ('follow', 0.17367137968540192),
 ('eventually', 0.17326225340366364),
 ('stories', 0.16857750713825226),
 ('imprisoned', 0.16826623678207397)]