# NORMALIZACJA, TOKENIZACJA I KODOWANIE TEKSTU

<img src="tokenization.jpg"/>
źródłó grafiki: https://www.shutterstock.com/pl/image-photo/tokenization-word-concept-on-building-blocks-2327475429
autor: SergioVas

źródło zbioru danych: https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset

### Pobranie bibliotek i datasetu

In [None]:
# import bibliotek
import pandas as pd
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
# import zbioru danych z tweetami
train = pd.read_csv('Tweets.csv')
train.dropna()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [None]:
# wyświetl pierwszych 10 rezultatów
train.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive


## Analiza danych tekstowych

### Sprawdź liczbę słów i liter

In [None]:
# oblicz liczbę słów w każdym tekście
train['word_count'] = train['text'].apply(lambda x: len(str(x).split(" ")))
print(train[['text','word_count']].head())
print(f'Mean: {train.word_count.mean()}')

                                                text  word_count
0                I`d have responded, if I were going           8
1      Sooo SAD I will miss you here in San Diego!!!          11
2                          my boss is bullying me...           5
3                     what interview! leave me alone           6
4   Sons of ****, why couldn`t they put them on t...          15
Mean: 13.7794476183545


In [None]:
#oblicz liczbę liter w każdym tekście
train['char_count'] = train['text'].str.len()
print(train[['text','char_count']].head())
print(f'Mean: {train.char_count.mean()}')

                                                text  char_count
0                I`d have responded, if I were going        36.0
1      Sooo SAD I will miss you here in San Diego!!!        46.0
2                          my boss is bullying me...        25.0
3                     what interview! leave me alone        31.0
4   Sons of ****, why couldn`t they put them on t...        75.0
Mean: 68.33002183406114


In [None]:
# średnia ilość liter w słowie
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['text'] = train['text'].astype('str')
train['avg_word'] = train['text'].apply(lambda x: avg_word(x))
print(train[['text','avg_word']].head())
print(f'Mean: {train.avg_word.mean()}')

                                                text  avg_word
0                I`d have responded, if I were going  4.142857
1      Sooo SAD I will miss you here in San Diego!!!  3.600000
2                          my boss is bullying me...  4.200000
3                     what interview! leave me alone  5.200000
4   Sons of ****, why couldn`t they put them on t...  4.357143
Mean: 4.464102382945475


### Stop-słowa

In [None]:
nltk.download('stopwords')
stop = stopwords.words('english')

print(stop[:10])

train['stopwords'] = train['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
print(train[['text','stopwords']].head())
print(f'Mean: {train.stopwords.mean()}')

[nltk_data] Downloading package stopwords to /home/michal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
                                                text  stopwords
0                I`d have responded, if I were going          3
1      Sooo SAD I will miss you here in San Diego!!!          4
2                          my boss is bullying me...          2
3                     what interview! leave me alone          2
4   Sons of ****, why couldn`t they put them on t...          7
Mean: 4.358829736909137


### Hashtagi

In [None]:
train['hashtags'] = train['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
print(train[['text','hashtags']].iloc[166:].head(20))
print(f'Mean: {train.hashtags.mean()}')

                                                  text  hashtags
166    #lichfield #tweetup sounds like fun  Hope to...         2
167  Big booming thunder storm almost here.  Maybe ...         0
168      Few Bevvies 2day in twn..great on a day off!!         0
169  first night in myers. just not the same w/out ...         0
170                                       good morning         0
171                            its the best show EVER!         0
172  URL in previous post (to timer job) should be ...         0
173  i think iv hurt my tooth  and eilish and cassi...         0
174   I want to know when the auditions are Mander!...         0
175   or even NOOOOO NOT THE SECRET NAMEREBECCA PLEASE         0
176   I miss my neice  can`t wait to see her bad n ...         0
177                    i need to get my computer fixed         0
178  really hopes her car`s illness is not terminal...         0
179  All the cool people I want to find for followi...         1
180   no sir...i woulda p

### Dane numeryczne

In [None]:
train['numerics'] = train['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
print(train[['text','numerics']].iloc[183:].head(20))
print(f'Mean: {train.numerics.mean()}')

                                                  text  numerics
183         I`m sad that I missed you guys last night!         0
184  Finally got a call for marriage counseling 3 d...         1
185                                            ok then         0
186                                     _420 why baby?         0
187  today was the last day of high school for me a...         0
188  We`re having an impromptu pool party... Except...         0
189   lost my tooth 2day whilst i was eating gum...oww         0
190                                   happy 1 year! <3         1
191  Oh, I HELLA forgot to say my official good mor...         0
192   *phew*  Will make a note in case anyone else ...         0
193    WHAT ABOUT ME ??  I VOTE EVERY DAY FOR YOU !...         0
194  I`m starving!! This diet is killing me but I c...         0
195                                      i talk to you         0
196  im soo bored...im deffo missing my music channels         0
197           nite nite b

### Wyrazy zaczynające się wielka literą

In [None]:
train['upper'] = train['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['text','upper']].head()

Unnamed: 0,text,upper
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,2
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0


Możemy przeanalizować także części mowy, części zdania, nazwane encje oraz wzajemne zależnosci między słowami w zdaniu.

In [None]:
import nltk
import textblob
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger')
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

train['noun_count'] = train['text'].apply(lambda x: check_pos_tag(x, 'noun'))
train['verb_count'] = train['text'].apply(lambda x: check_pos_tag(x, 'verb'))
train['adj_count'] = train['text'].apply(lambda x: check_pos_tag(x, 'adj'))
train['adv_count'] = train['text'].apply(lambda x: check_pos_tag(x, 'adv'))
train['pron_count'] = train['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Normalizacja danych tekstowych

### Ujednolicenie pisowni - zmiana na małe litery

In [None]:
train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['text'].head()

0                  i`d have responded, if i were going
1        sooo sad i will miss you here in san diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4    sons of ****, why couldn`t they put them on th...
Name: text, dtype: object

### Usunięcie znaków specjalnych oraz liczb

In [None]:
train['text'] = train['text'].str.replace('[^\w\s]','')
train['text'].head()

0                    id have responded if i were going
1           sooo sad i will miss you here in san diego
2                               my boss is bullying me
3                        what interview leave me alone
4    sons of  why couldnt they put them on the rele...
Name: text, dtype: object

## Usunięcie stop-słów

In [None]:
stop = stopwords.words('english')
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['text'].head()

0                          id responded going
1                     sooo sad miss san diego
2                               boss bullying
3                       interview leave alone
4    sons couldnt put releases already bought
Name: text, dtype: object

### Usunięcie często występujacych słów

In [None]:
freq = pd.Series(' '.join(train['text']).split()).value_counts()
freq = freq[freq > 500]
print(freq[:10])

im       3024
day      2044
good     1549
get      1426
like     1346
go       1266
dont     1200
love     1122
work     1112
going    1096
dtype: int64


In [None]:
freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['text'].head()

0                                id responded
1                          sooo sad san diego
2                               boss bullying
3                       interview leave alone
4    sons couldnt put releases already bought
Name: text, dtype: object

### Usunięcie rzadko występujacych słów

In [None]:
freq = pd.Series(' '.join(train['text']).split()).value_counts()
freq = freq[freq < 5]
print(freq[:10])

nano           4
newest         4
blocking       4
nonetheless    4
argument       4
srry           4
alike          4
invitation     4
930            4
jays           4
dtype: int64


In [None]:
freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['text'].head()

0                                 id
1                 sooo sad san diego
2                               boss
3              interview leave alone
4    sons couldnt put already bought
Name: text, dtype: object

### Stemming - usunięcie końcówek dla różnych form

In [None]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['text'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0                                id
1                sooo sad san diego
2                              boss
3               interview leav alon
4    son couldnt put alreadi bought
Name: text, dtype: object

### Lematyzacja - zamiana słów ich podstawową formą

In [None]:
from textblob import Word
nltk.download('omw-1.4')
nltk.download('wordnet')
train['text'] = train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['text'].head()

[nltk_data] Downloading package omw-1.4 to /home/michal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/michal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0                                id
1                sooo sad san diego
2                               bos
3             interview leave alone
4    son couldnt put already bought
Name: text, dtype: object

## Tokenizacja tekstu (kodowanie do postaci numerycznej)

### N-gramy

In [None]:
TextBlob(train['text'][4]).ngrams(2)

[WordList(['son', 'couldnt']),
 WordList(['couldnt', 'put']),
 WordList(['put', 'already']),
 WordList(['already', 'bought'])]

### TF-IDF

In [None]:
#TF - ile razy słowo występuje w danym tekście
#IDF - logarytm dziesiętny z liczby wszystkich tekstów w zbiorze podzielonej przez liczbę tekstów w których wystepuje dane słowo
tf1 = (train['text'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,sooo,1
1,sad,1
2,i,1
3,will,1
4,miss,1
5,you,1
6,here,1
7,in,1
8,san,1
9,diego!!!,1


In [None]:
import numpy as np
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['text'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,sooo,1,4.791905
1,sad,1,3.935252
2,san,1,5.972755
3,diego,1,7.823355


In [None]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,sooo,1,4.791905,4.791905
1,sad,1,3.935252,3.935252
2,san,1,5.972755,5.972755
3,diego,1,7.823355,7.823355


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['text'])

train_vect

<27481x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 81354 stored elements in Compressed Sparse Row format>

### Worek słów

In [None]:
#utwórz słownik
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,2),analyzer = "word")
train_bow = bow.fit_transform(train['text'])
train_bow

<27481x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 91577 stored elements in Compressed Sparse Row format>

In [None]:
bow.vocabulary_.keys()

dict_keys(['id', 'sooo', 'sad', 'interview', 'leave', 'alone', 'son', 'couldnt', 'put', 'already', 'bought', 'best', 'baby', 'smile', 'soooo', 'high', 'wow', 'hehe', 'chance', 'never', 'gonna', 'cake', 'stuff', 'song', 'story', 'taylor', 'running', 'low', 'music', 'tonight', 'lost', 'voice', 'test', 'trying', 'sigh', 'ive', 'sick', 'past', 'day', 'hair', 'look', 'didnt', 'every', 'he', 'sorry', 'find', 'soon', 'playing', 'online', 'interesting', 'update', 'job', 'wait', 'cleaning', 'house', 'family', 'later', 'gotta', 'computer', 'thought', 'supposed', 'end', 'mean', 'bout', 'called', 'friday', 'free', 'app', 'ipod', 'way', 'internet', 'came', 'omg', 'havent', 'minute', 'went', 'sleep', 'power', 'cut', 'working', 'seen', 'make', 'hahaha', 'say', '10', 'funny', 'cute', 'kid', 'ahhh', 'slept', 'game', 'try', 'watch', 'tomorrow', 'though', 'play', 'tear', 'nyc', 'living', 'year', 'case', 'wonder', 'busy', 'coming', 'ton', 'stay', 'school', 'little', 'wine', 'ok', 'care', 'car', 'big', 'ho

In [None]:
len(bow.vocabulary_.keys())

1000

In [None]:
from scipy import sparse
text = train['text'][1]
print(text)
decoded = sparse.csr_matrix(bow.transform([text]))
print(decoded)

sooo sad san diego
  (0, 708)	1
  (0, 778)	1


### Word embedding

In [None]:
# przed uruchomieniem pobierz:
# http://nlp.stanford.edu/data/glove.6B.zip

from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [None]:
from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [None]:
model['go']

array([-0.078894,  0.4616  ,  0.57779 , -0.71637 , -0.13121 ,  0.4186  ,
       -0.29156 ,  0.52006 ,  0.089986, -0.35062 ,  0.51755 ,  0.51998 ,
        0.15218 ,  0.41485 , -0.12377 , -0.37222 ,  0.0273  ,  0.75673 ,
       -0.8739  ,  0.58935 ,  0.46662 ,  0.62918 ,  0.092603, -0.012868,
       -0.015169,  0.25567 , -0.43025 , -0.77668 ,  0.71449 , -0.3834  ,
       -0.69638 ,  0.23522 ,  0.11396 ,  0.02778 ,  0.071357,  0.87409 ,
       -0.1281  ,  0.063576,  0.067867, -0.50181 , -0.28523 , -0.072536,
       -0.50738 , -0.6914  , -0.53579 , -0.11361 , -0.38234 , -0.12414 ,
        0.011214, -1.1622  ,  0.037057, -0.18495 ,  0.01416 ,  0.87193 ,
       -0.097309, -2.3565  , -0.14554 ,  0.28275 ,  2.0053  ,  0.23439 ,
       -0.38298 ,  0.69539 , -0.44916 , -0.094157,  0.90527 ,  0.65764 ,
        0.27628 ,  0.30688 , -0.57781 , -0.22987 , -0.083043, -0.57236 ,
       -0.299   , -0.81112 ,  0.039752, -0.05681 , -0.48879 , -0.18091 ,
       -0.28152 , -0.20559 ,  0.4932  , -0.033999, 

In [None]:
len(model['go'])

100

In [None]:
model['away']

array([-0.10379 , -0.014792,  0.59933 , -0.51316 , -0.036463,  0.6588  ,
       -0.57906 ,  0.17819 ,  0.23663 , -0.21384 ,  0.55339 ,  0.53597 ,
        0.041444,  0.16095 ,  0.017093, -0.37242 ,  0.017974,  0.39268 ,
       -0.23265 ,  0.1818  ,  0.66405 ,  0.98163 ,  0.42339 ,  0.030581,
        0.35015 ,  0.25519 , -0.71182 , -0.42184 ,  0.13068 , -0.47452 ,
       -0.08175 ,  0.1574  , -0.13262 ,  0.22679 , -0.16885 , -0.11122 ,
       -0.32272 , -0.020978, -0.43345 ,  0.172   , -0.67366 , -0.79052 ,
        0.10556 , -0.4219  , -0.12385 , -0.063486, -0.17843 ,  0.56359 ,
        0.16986 , -0.17804 ,  0.13956 , -0.20169 ,  0.078985,  1.4497  ,
        0.23556 , -2.6014  , -0.5286  , -0.11636 ,  1.7184  ,  0.33254 ,
        0.12136 ,  1.1602  , -0.2914  ,  0.47125 ,  0.41869 ,  0.35271 ,
        0.47869 , -0.042281, -0.18294 ,  0.1796  , -0.24431 , -0.34042 ,
        0.20337 , -0.93676 ,  0.013077,  0.080339, -0.36604 , -0.44005 ,
       -0.35393 ,  0.15907 ,  0.55807 ,  0.1492  , 

In [None]:
(model['go'] + model['away'])/2

array([-0.091342  ,  0.223404  ,  0.58856   , -0.614765  , -0.0838365 ,
        0.5387    , -0.43531   ,  0.349125  ,  0.163308  , -0.28223   ,
        0.53547   ,  0.52797496,  0.096812  ,  0.2879    , -0.0533385 ,
       -0.37232   ,  0.022637  ,  0.574705  , -0.553275  ,  0.385575  ,
        0.565335  ,  0.805405  ,  0.2579965 ,  0.0088565 ,  0.1674905 ,
        0.25543   , -0.571035  , -0.59926   ,  0.422585  , -0.42896   ,
       -0.389065  ,  0.19631   , -0.00933   ,  0.127285  , -0.0487465 ,
        0.381435  , -0.22540998,  0.021299  , -0.1827915 , -0.16490501,
       -0.47944498, -0.431528  , -0.20091   , -0.55665   , -0.32982   ,
       -0.088548  , -0.28038502,  0.219725  ,  0.090537  , -0.67012   ,
        0.0883085 , -0.19332   ,  0.0465725 ,  1.160815  ,  0.0691255 ,
       -2.47895   , -0.33707   ,  0.083195  ,  1.86185   ,  0.283465  ,
       -0.13081   ,  0.927795  , -0.37028   ,  0.1885465 ,  0.66198   ,
        0.505175  ,  0.37748498,  0.1322995 , -0.380375  , -0.02