In [1]:
# Lowercasing: normalise case so that "Dublin" and "dublin" become the same token.
# (The stray doubled comma after "Dublin" was removed so the cell reproduces
# the output recorded beneath it.)
sentence = "My favourite cities in Ireland are Dublin, Galway and Belfast"
sentence = sentence.lower()
print(sentence)

my favourite cities in ireland are dublin, galway and belfast


In [2]:
# Lowercase every entry of a word list by mapping str.lower over it.
words = ['HellO','My','NAMe','is','BOB']
words = list(map(str.lower, words))
print(words)

['hello', 'my', 'name', 'is', 'bob']


In [3]:
# Noise removal
import re
def clean_words(text):
  """Strip HTML tags, punctuation and digits from ``text``.

  Every ``<...>`` tag is deleted. Each remaining non-word character and each
  run of digits is then replaced by one space, and leading/trailing
  whitespace is trimmed. Interior runs of spaces are left untouched.

  Args:
    text: The raw string to clean.

  Returns:
    The cleaned string.
  """
  # Remove HTML markup; the non-greedy match stops at the first '>'.
  text = re.sub(r"(<.*?>)", "", text)
  # Replace non-word characters and digit runs with a space. Note that \W
  # keeps letters of any script (and '_'), so this is punctuation removal,
  # not non-ASCII removal. The pattern is a raw string because "\W" and "\d"
  # inside a plain literal are invalid escape sequences.
  text = re.sub(r"(\W|\d+)", " ", text)
  # Trim leading and trailing whitespace.
  text = text.strip()
  return text

In [4]:
# Run the cleaner over several noisy variants of the same word.
raw_text = ['...Hello','Hello!!','#Hello','>>>>Hello>>>','<a>Hello</a>']
clean_text = list(map(clean_words, raw_text))
print(clean_text)

['Hello', 'Hello', 'Hello', 'Hello', 'Hello']


In [5]:
# Stemming
import nltk
import pandas as pd
from nltk.stem import PorterStemmer as ps

# Reduce each inflected form to its Porter stem.
stemmer = ps()
words = ['troubling','troubled','troubles']
stems = list(map(lambda w: stemmer.stem(word=w), words))

# Show the raw words next to their stems.
df = pd.DataFrame(data={'Raw Word':words,'Stem':stems})
df

Unnamed: 0,Raw Word,Stem
0,troubling,troubl
1,troubled,troubl
2,troubles,troubl


In [6]:
# Lemmatization
from nltk.stem import WordNetLemmatizer as wnl
# Fetch the WordNet corpus that the lemmatizer looks words up in.
nltk.download('wordnet')

# Lemmatize each word as a verb (pos='v').
# NOTE(review): `words` is not defined in this cell; it is whatever list an
# earlier cell left behind (the stemming cell when run top to bottom).
lemma = wnl()
lemmatized = list(map(lambda w: lemma.lemmatize(word = w, pos = 'v'), words))

# Tabulate raw words next to their lemmas, pinning the column order.
df = pd.DataFrame({'Raw Word':words,'Lemma':lemmatized}).loc[:, ['Raw Word','Lemma']]
df

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,Raw Word,Lemma
0,troubling,trouble
1,troubled,trouble
2,troubles,trouble


In [7]:
# Word tokenisation
nltk.download('punkt')
from nltk import word_tokenize

sentence = "Hi! my name is John."
# Replace punctuation and digit runs with spaces before tokenising.
# The pattern is a raw string because "\W" and "\d" inside a plain literal
# are invalid escape sequences.
sentence_re = re.sub(r"(\W|\d+)"," ",sentence)
tokens = word_tokenize(sentence_re)
tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Hi', 'my', 'name', 'is', 'John']

In [8]:
# Tokenise into sentences
from nltk import sent_tokenize
# `sentence` still carries its punctuation here, which the sentence splitter needs.
tokens = sent_tokenize(text=sentence)
tokens

['Hi!', 'my name is John.']

In [9]:
# Remove stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

sentence = 'the weather today is really hot and i went for a walk'
tokens = word_tokenize(sentence)
# Compare in lowercase so that capitalised stopwords ("The", "And") are
# filtered too; the kept tokens retain their original casing.
tokens = [word for word in tokens if word.lower() not in stop_words]
print(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['weather', 'today', 'really', 'hot', 'went', 'walk']


In [10]:
# Word Embedding with Word2Vec
# Word2Vec is a shallow neural network (2 layers)
from gensim.models import Word2Vec as wtv

s1 = 'Bob Marley was a Regae singer'
s2 = 'He has been dead for several years'
s3 = 'Bob was a great artist'

# Tokenise each sentence into a list of words
sentences = [word_tokenize(s) for s in (s1, s2, s3)]
# Train the model. min_count is a frequency threshold: words occurring fewer
# than min_count times are dropped, so min_count=1 keeps every word.
model = wtv(sentences,min_count=1)
# Summarise model (vocab: number of distinct words kept)
print('Model Summary:')
print(model)
print('\n')

# Words in the model vocabulary
# NOTE(review): `wv.vocab` is the gensim 3.x API (the recorded summary prints
# `size=100`); newer gensim releases expose `wv.key_to_index` instead - confirm
# against the installed version.
# NOTE(review): this rebinds `words`, which the lemmatization cell reads;
# re-running that cell after this one would lemmatize the vocabulary instead.
words = list(model.wv.vocab)
print('Corpus Vocabulary')
print(words)
print('\n')
print('Vector for word singer')
# Look the vector up through model.wv; indexing the model object directly is
# deprecated.
print(model.wv['singer'])
print('\n')

# The 6 vocabulary words most similar to 'singer'
test = ['singer']
model.wv.most_similar(positive=test,topn=6)

### Loading and Saving ###
# model.save('whatever')
# model = wtv.load('whatever')   # load returns the model; assign the result

Model Summary:
Word2Vec(vocab=15, size=100, alpha=0.025)


Corpus Vocabulary
['Bob', 'Marley', 'was', 'a', 'Regae', 'singer', 'He', 'has', 'been', 'dead', 'for', 'several', 'years', 'great', 'artist']


Vector for word singer
[-5.1021599e-04  1.3014454e-03  1.6370032e-03 -2.9160082e-03
  1.4067261e-03  6.6956511e-04 -3.0644675e-04 -1.3621356e-03
 -2.5179225e-03  2.1225144e-03 -3.6723423e-03  4.5849211e-03
 -5.4021366e-05  4.7829011e-03  1.0308600e-03  1.7682988e-03
 -1.6815581e-03 -8.0540677e-04  4.8965057e-03  3.9657457e-03
  1.2069247e-03  8.8147691e-04 -4.4387332e-03  4.6534979e-04
 -8.4981561e-04  1.1791082e-03 -3.9849677e-03 -3.1178207e-03
  2.9399076e-03  2.3299740e-03  2.8398195e-05  3.2586372e-03
 -2.0716120e-03 -4.6158214e-03  1.3616442e-03  2.1065008e-03
 -2.5901163e-03  1.3368434e-03  3.1266096e-03  2.8227719e-03
  2.3954795e-03  4.2638988e-03 -4.1238433e-03  3.3572069e-03
 -2.8746082e-03 -4.8822318e-03  2.5473782e-03  1.2847480e-03
  8.7296830e-05 -3.1431606e-03 -1.5215476e

  if np.issubdtype(vec.dtype, np.int):


[('Bob', 0.17550605535507202),
 ('Marley', 0.09529964625835419),
 ('was', 0.07691202312707901),
 ('Regae', 0.05354451388120651),
 ('He', 0.043564509600400925),
 ('great', 0.038243331015110016)]

In [11]:
# Word Embedding with GloVe 

In [12]:
# Word Embedding Example with Word2Vec...
# The text8 file took a long time to load