# Imports

In [63]:
# nltk is a usefull library for text processing practice
import nltk                             
from nltk.corpus import twitter_samples

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer 

In [64]:
import pandas as pd
import numpy as np
import pickle
import pickle
import re        

In [65]:

from plotly.offline import iplot
import plotly.graph_objects as go

In [66]:
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

In [67]:
# Download examples we'll use in our excercise
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\mklos\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [68]:
# select the set of positive and negative tweets
tweets_p = twitter_samples.strings('positive_tweets.json')
tweets_n = twitter_samples.strings('negative_tweets.json')

In [69]:
## Join negative and positive tweets into one list + check sample size
all_tweets = tweets_p + tweets_n
len(all_tweets)

10000

# Text processing

## Remove unwanted characters

- Remove Twitter specific characters such as hashtags and hyperlinks
- Strip any html present (not always the best solutions, but overall recommended to start with)
- Lower case only (especially important in simple word tokenization)

In [70]:
tweet_with_hash_an_url = "FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week, see more at https://www.somerandompage.com"

In [71]:
# remove hyperlinks
tweet_mod = re.sub(r'https?://[^\s\n\r]+', '', tweet_with_hash_an_url)

In [72]:
tweet_mod 

'FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week, see more at '

In [73]:
# remove hashtag
tweet_mod = re.sub(r'#', '', tweet_mod)

In [74]:
tweet_mod

'FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week, see more at '

## Apply same process to our tweets

In [75]:
def clean_tweet(tweet):
    #Create a function, which will remove urls, hashtags and uppercase
    # remove hyperlinks
    tweet_mod = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtag
    tweet_mod = re.sub(r'#', '', tweet_mod)
    # remove uppercase
    tweet_mod = tweet_mod.lower()
    return tweet_mod

In [76]:
clean_tweet(tweet_with_hash_an_url)

'followfriday @france_inte @pkuchly57 @milipol_paris for being top engaged members in my community this week, see more at '

In [77]:
# Create list of cleaned twits
all_tweets_clean = [clean_tweet(tweets) for tweets in all_tweets]

In [78]:
all_tweets_clean[2]

'@despiteofficial we had a listen last night :) as you bleed is an amazing track. when are you in scotland?!'

## Regex practice

- Extract all names preceeded by "@"
- Extract numeric values only
- extract smily and sad faces

Use any online materials - recommmended REGEX 101 or even CHATGPT

In [79]:
re_text1 = " @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members"

In [80]:
re_text1 = re.findall(r'@\w+', re_text1)
re_text1

['@France_Inte', '@PKuchly57', '@Milipol_Paris']

In [81]:
re_text2 = '@lamb2ja hey james! how odd :/ please call our contact centre on 02392441234 and we will be able to assist you :) many thanks! within 24 hours'

In [82]:
print(re.findall(r'@\w+', re_text2))
# extract numeric values only
print(re.findall(r'\d+', re_text2))
# extract smily and sad faces
print(re.findall(r'[:;=8][\'\-]?[)dDpP/\\]', re_text2))

['@lamb2ja']
['2', '02392441234', '24']
[':/', ':)']


In [83]:
re_text3 = '@despiteofficial we had a listen last night :) as you bleed is an amazing track. why is the weather so bad:('

In [84]:
print(re.findall(r'@\w+', re_text3))
# extract numeric values only
print(re.findall(r'\d+', re_text3))
# extract smily and sad faces
print(re.findall(r'[:;=8][\'\-]?[())dDpP/\\]', re_text3))

['@despiteofficial']
[]
[':)', ':(']


# Basic tokenization

In [85]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

In [86]:
tweet_tokens = tokenizer.tokenize(tweet_mod)

In [87]:
print(tweet_tokens)

['followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ',', 'see', 'more', 'at']


## Stemming

Generalize words to reduce vocabulary size.

Consider the words with same root: 
 * **learn**
 * **learn**ing
 * **learn**ed
 * **learn**t
 

In [88]:
# Let's use build in PorterStemmer
stemmer = PorterStemmer()

In [89]:
tweet_stemmed = [] 
for word in tweet_tokens:
    stem_word = stemmer.stem(word)
    tweet_stemmed.append(stem_word)

In [90]:
print(tweet_stemmed)

['followfriday', 'for', 'be', 'top', 'engag', 'member', 'in', 'my', 'commun', 'thi', 'week', ',', 'see', 'more', 'at']


# Create function converting cleaned tweets to tokenized

In [91]:
def tokenize_tweet(clean_tweet):
    # Function to tokenize and stemm a tweet
    tweet_tokens = tokenizer.tokenize(clean_tweet)
    
    tweet_stemmed = [] 
    for word in tweet_tokens:
        stem_word = stemmer.stem(word)
        tweet_stemmed.append(stem_word)
        
    return tweet_stemmed    

In [92]:
tokenize_tweet(all_tweets_clean[0])

['followfriday',
 'for',
 'be',
 'top',
 'engag',
 'member',
 'in',
 'my',
 'commun',
 'thi',
 'week',
 ':)']

# More advanced cases of tokenization

Let's experiment with tokenizer used by BERT

In [93]:
from transformers import BertConfig, BertModel

# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()

# Initializing a model (with random weights) from the bert-base-uncased style configuration
model = BertModel(configuration)

# Accessing the model configuration
configuration = model.config

In [94]:
from transformers import BertTokenizer, BertModel


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [95]:
tweet_mod

'FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week, see more at '

In [96]:
text = tweet_mod

In [97]:
encoded_input = tokenizer(text, return_tensors='pt')

In [98]:
encoded_input

{'input_ids': tensor([[  101,  3582, 27439,  4710,  1030,  2605,  1035, 20014,  2063,  1030,
          1052,  5283,  2818,  2135, 28311,  1030, 23689, 11514,  4747,  1035,
          3000,  2005,  2108,  2327,  5117,  2372,  1999,  2026,  2451,  2023,
          2733,  1010,  2156,  2062,  2012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Chceck tokenizer vocal size

Analize vocab values and tokens, reverse vocab dictionary to match tokens to words

We can find all word-token pairs in tokenizer.vocab.items() stored as tupples. Let's create a reversed dict, where we will be able to backengineer each token to corresponding characters

In [99]:
tokenizer_vocab_inverted = {v: k for k, v in tokenizer.vocab.items()}
tokenizer_vocab_inverted

{0: '[PAD]',
 1: '[unused0]',
 2: '[unused1]',
 3: '[unused2]',
 4: '[unused3]',
 5: '[unused4]',
 6: '[unused5]',
 7: '[unused6]',
 8: '[unused7]',
 9: '[unused8]',
 10: '[unused9]',
 11: '[unused10]',
 12: '[unused11]',
 13: '[unused12]',
 14: '[unused13]',
 15: '[unused14]',
 16: '[unused15]',
 17: '[unused16]',
 18: '[unused17]',
 19: '[unused18]',
 20: '[unused19]',
 21: '[unused20]',
 22: '[unused21]',
 23: '[unused22]',
 24: '[unused23]',
 25: '[unused24]',
 26: '[unused25]',
 27: '[unused26]',
 28: '[unused27]',
 29: '[unused28]',
 30: '[unused29]',
 31: '[unused30]',
 32: '[unused31]',
 33: '[unused32]',
 34: '[unused33]',
 35: '[unused34]',
 36: '[unused35]',
 37: '[unused36]',
 38: '[unused37]',
 39: '[unused38]',
 40: '[unused39]',
 41: '[unused40]',
 42: '[unused41]',
 43: '[unused42]',
 44: '[unused43]',
 45: '[unused44]',
 46: '[unused45]',
 47: '[unused46]',
 48: '[unused47]',
 49: '[unused48]',
 50: '[unused49]',
 51: '[unused50]',
 52: '[unused51]',
 53: '[unused52]',

You can also use tokenizer.decode([token]) but I wanted to show a use case of reversed dictionary

In [100]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")['input_ids']

In [101]:
# To easily access token index you need to convert tensor to a numpy value
inputs[0].detach().numpy()

array([  101,  7592,  1010,  2026,  3899,  2003, 10140,   102],
      dtype=int64)

## Backengineer tokenization process|

In [102]:
def tokenization_words(text, tokenizer):
    # Prepare function, which tokenizes text and then returns corresponding words
    inputs = tokenizer(text, return_tensors="pt")['input_ids']
    # To easily access token index you need to convert tensor to a numpy value
    inputs = inputs[0].detach().numpy()
    token_strings = []
    for token in inputs:
        token_strings.append(tokenizer_vocab_inverted[token])        
    return token_strings

In [103]:
text = 'Hello, my dog is cute'

In [104]:
tokenization_words(text, tokenizer)

['[CLS]', 'hello', ',', 'my', 'dog', 'is', 'cute', '[SEP]']

In [105]:
text2 = "I misspelled my dog while at the vet"

In [106]:
tokenization_words(text2, tokenizer)

['[CLS]',
 'i',
 'miss',
 '##pel',
 '##led',
 'my',
 'dog',
 'while',
 'at',
 'the',
 'vet',
 '[SEP]']

## Experiment with tokenizing a few sentences

- Find a word, which is is tokenized only as syllables
- Find 5 words, which are tokenized as 2+ tokens, do you see any pattern here?

In [107]:
# find a word which is tokenized only as syllables
text3 = "I'm going to the gym"

tokenization_words(text3, tokenizer)

['[CLS]', 'i', "'", 'm', 'going', 'to', 'the', 'gym', '[SEP]']

In [108]:
# find 5 words which are tokenized as 2+ tokens
text4 = "I'm going to the gym and I'm going to the gym and I'm going to the gym and I'm going to the gym and I'm going to the gym"

# Word vectorization

In [109]:
word_embeddings = pickle.load(open( "../data/word_embeddings_subset.p", "rb" ) )

Check how many word embeddings are present in our sample

In [110]:
len(word_embeddings.keys() )

243

What is the vectors dimension?

In [111]:
word_embeddings["Poland"].shape

(300,)

## Dimensionality reduction - TSNE 

We would like to see these vectors in 3D Space, let's experiment with TSNE.

- First we need to create an array containing all vectors
- Then reduce dimensionality with TSNE
- And proceed to visualization

In [112]:
# Convert embedding values to an array
X = np.array(list(word_embeddings.values()))

In [113]:
## Use sklearn documentation to use TSNE and create X_red with only 3 dimensions
X_red = TSNE(n_components=3).fit_transform(X)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [114]:
X_red.shape

(243, 3)

Create df_vec contatinig x,y,z cols from X_red and label column from word_embeddings 
key corresponding to each vector

In [115]:
df_vec = pd.DataFrame(X_red, columns=["x", "y", "z"])
df_vec["label"] = list(word_embeddings.keys())

In [116]:
df_vec.describe()

Unnamed: 0,x,y,z
count,243.0,243.0,243.0
mean,1.453487,0.398368,2.953462
std,80.681328,76.183403,79.88504
min,-179.722076,-176.132294,-219.288666
25%,-60.668325,-56.491772,-56.454943
50%,-0.511608,2.805398,6.262187
75%,65.944195,53.766659,64.209078
max,180.603394,210.655243,209.873138


In [118]:
df_vec

Unnamed: 0,x,y,z,label
0,23.170794,-72.723221,-16.859102,country
1,47.041634,-42.537167,-62.741261,city
2,-44.388664,55.519707,52.716415,China
3,-109.540710,-34.910995,6.262187,Iraq
4,117.644707,-2.166332,-219.288666,oil
...,...,...,...,...
238,44.178104,149.039810,82.158379,Belmopan
239,-2.589705,-13.427938,-135.881256,Vaduz
240,108.017075,77.422714,-86.838181,Paramaribo
241,14.591696,-142.675858,-85.018692,Nuuk


Use provided wireframe to create a 3D plot

In [119]:

trace0=go.Scatter3d(
        x = df_vec.x,
        y = df_vec.y,
        z = df_vec.z,
        mode="markers",
        text = df_vec.label,
        )



datas=[trace0]

figure=go.Figure(
    data=datas,
    layout=go.Layout(
      
        scene=dict(
            xaxis=dict(title="x"),
            yaxis=dict(title="y"),
            zaxis=dict(title="z"),           
                  ),
    ))


name = '3D word embeddings'

camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=0, y=-1, z=1)
)

figure.update_layout(scene_camera=camera, title=name)#

iplot(figure)

## Show coutry - capital vectors

Let's explore relations between a set of country-capital pairs

In [120]:
country_capitals_paris = [("France", "Paris"), ("England", "London"),("Mali", "Bamako"), 
                          ("Poland", "Warsaw"), ("Italy", "Madrid"), ("Kenya", "Nairobi")]

In [121]:
data=[]
for pair in country_capitals_paris:
    capital = pair[0]
    country = pair[1]
    
    df_pair = df_vec.loc[df_vec.label.apply(lambda x: x in [capital, country])]
                                            
    trace=go.Scatter3d(
        x =df_pair.x,
        y=df_pair.y,
        z=df_pair.z,
        mode="markers+lines",
        text = df_pair.label
        )
    
    data.append(trace)

In [122]:
df_pair

Unnamed: 0,x,y,z,label
47,127.978882,-61.731838,6.300955,Kenya
119,75.184807,-85.827286,-98.619896,Nairobi


In [124]:
figure=go.Figure(
    data=data,
    layout=go.Layout(
      
        scene=dict(
            xaxis=dict(title="x"),
            yaxis=dict(title="y"),
            zaxis=dict(title="z")
            

           
                  ),

    ))


name = ''
# Default parameters which are used when `layout.scene.camera` is not provided
camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=0, y=-1, z=1)
)

figure.update_layout(scene_camera=camera, title=name)#

iplot(figure)

## Predict capital of Spain

No we will use words vectorizations to predict country's capital

Let's go to the original 300D X so that we don't loose information and see if we can predict the capital of any country

In [125]:
# Create country capital vector by substracting two vectors e.g. France and Paris
country_capital_vector = word_embeddings["France"] - word_embeddings["Paris"]

In [126]:
country_capital_vector

array([ 0.06713867, -0.05322266,  0.3305359 , -0.16333008, -0.14941406,
       -0.06420898,  0.16503906, -0.10351562,  0.02050781, -0.14660645,
        0.16796875, -0.18652344, -0.1796875 ,  0.03125   , -0.05566406,
        0.17138672, -0.01757812,  0.11474609, -0.00292969,  0.04443359,
        0.2680664 ,  0.09423828, -0.08349609, -0.17480469, -0.15344238,
       -0.01171875, -0.12109375, -0.2618103 , -0.0135498 ,  0.1895752 ,
       -0.09082031,  0.08300781,  0.12695312, -0.03466797,  0.28051758,
       -0.04589844, -0.07275391, -0.02441406,  0.10644531,  0.00256348,
        0.12664795,  0.03417969,  0.00585938, -0.05664062,  0.09716797,
        0.01367188, -0.07421875,  0.13110352, -0.07397461, -0.23925781,
       -0.18359375, -0.11157227, -0.13232422,  0.2841797 ,  0.13671875,
       -0.01269531, -0.26367188,  0.05566406,  0.03320312,  0.0987854 ,
       -0.03271484,  0.02832031,  0.01867676, -0.14746094, -0.00878906,
       -0.02520752,  0.24090576, -0.14257812,  0.02935791,  0.02

In [127]:
## Calculate expected vector for England's capital based on previously calculated country_capital_vector
expected_capital_vector = word_embeddings["England"] - country_capital_vector

## Leverage nearest neighbors search to find corresponding vector

Searching similar neighbors in 300D vector space can be tricky and brute-force calculations will cost a lot of computation. We can use sklearn NearestNeighbors algorithm

In [135]:
# Fit nearest neighbors based on X, set n_neighbors to 3
nearest_neighbors = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)

In [136]:
# Find distance and idx of 3 nearest neighbors for expected capital vector
dist, idx = nearest_neighbors.kneighbors([expected_capital_vector])

In [137]:
idx[0]

array([ 8,  7, 26], dtype=int64)

In [138]:
dist

array([[2.36614335, 2.49261504, 3.01510667]])

In [139]:
#Let's see if any of nearest neighbors are correct
df_vec.label.loc[idx[0]]

8     England
7      London
26      Paris
Name: label, dtype: object

Create a function, which will predict given countrys capital

In [140]:
def get_capital(country, df_vec):
    """Create a function, which returns predicted capital for any given country
    Word embeddings and country_capital_vector can be treated as global params and do not need 
    to be included in function args for simplicity
    """
    expected_capital_vector = word_embeddings[country] - country_capital_vector
    nearest_neighbors = NearestNeighbors(n_neighbors=3).fit(X)
    dist, idx = nearest_neighbors.kneighbors([expected_capital_vector])
    return df_vec.loc[idx[0][0]]

In [145]:
# Print country capitals for the following list
countries = ["Russia", "Tuvalu", "Honduras", "Egypt"]

for country in countries:
    print(country, get_capital(country, df_vec))

Russia x        -39.324348
y       -176.132294
z        -59.247547
label        Moscow
Name: 35, dtype: object
Tuvalu x         -92.35936
y          19.34063
z        147.291367
label        Tuvalu
Name: 222, dtype: object
Honduras x        16.052261
y        67.986992
z         11.28756
label     Honduras
Name: 112, dtype: object
Egypt x         1.728504
y       -73.493057
z        88.992996
label        Cairo
Name: 77, dtype: object
