# Sentiment Analysis

### Supervised Learning

In [1]:
import os
import re
import string
from unidecode import unidecode
import pandas as pd
import torch
import sys

In [2]:
data_path = os.path.join('dati', 'betsentiment-IT-tweets-sentiment-players.csv')
dati = pd.read_csv(data_path, encoding='latin-1')

In [3]:
dati.columns

Index(['tweet_date_created', 'tweet_id', 'tweet_text', 'language', 'sentiment',
       'sentiment_score'],
      dtype='object')

In [4]:
dati.head()

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score
0,2018-12-10T10:45:40.865000,1072079899224100865,"@juventusfc @G_Higuain Auguri pipita,sempre co...",it,NEUTRAL,"{""Neutral"":0.858726024627685546875,""Negative"":..."
1,2018-06-30T18:41:02.817000,1013130303454498817,@realvarriale @massimozampini @KMbappe @G_Higu...,it,NEUTRAL,"{""Neutral"":0.973993778228759765625,""Negative"":..."
2,2018-12-26T19:40:44.308000,1078012758069858308,@OfficialASRoma @D_10Perotti @Hyundai_Italia S...,it,NEUTRAL,"{""Neutral"":0.9844334125518798828125,""Negative""..."
3,2018-07-12T11:26:37.766000,1017369635816181766,@CarloRienzi @Cristiano @juventusfc Populismo ...,it,NEUTRAL,"{""Neutral"":0.993849217891693115234375,""Negativ..."
4,2018-06-12T19:20:36,1006617282779451392,@OfficialASRoma @OfficialRadja @leonsfdo non c...,it,NEUTRAL,"{""Neutral"":0.87245881557464599609375,""Negative..."


In [5]:
dati['sentiment_score'][0]

'{"Neutral":0.858726024627685546875,"Negative":0.00167082459665834903717041015625,"Positive":0.13944680988788604736328125,"Mixed":0.000156317240907810628414154052734375}'

In [6]:
dati['sentiment']

0         NEUTRAL
1         NEUTRAL
2         NEUTRAL
3         NEUTRAL
4         NEUTRAL
           ...   
165810    NEUTRAL
165811    NEUTRAL
165812    NEUTRAL
165813    NEUTRAL
165814    NEUTRAL
Name: sentiment, Length: 165815, dtype: object

In [7]:
set(dati['sentiment'])

{'MIXED', 'NEGATIVE', 'NEUTRAL', 'POSITIVE'}

In [8]:
dati['tweet_text'][0]

'@juventusfc @G_Higuain Auguri pipita,sempre con noi'

In [9]:
found = re.search('[a-z]', dati['tweet_text'][0])

In [10]:
if found:
    print(found.group(0))

j


In [11]:
print('b\n', r'b\n')

b
 b\n


In [12]:
# Proviamo solo minuscole per ora, per efficienza.
# Essendo tweet, tutto maiuscolo potrebbe essere significativo.
clean_text = re.sub(r'[^a-zàáèéíìòóùú0-9]', ' ', dati['tweet_text'][4876].lower())
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

In [13]:
clean_text

'narkos92 g apicella vannaboba sscnapoli ecavaniofficial ieri era fatta oggi si deve ridursi l ingaggio bah la vedo strana come cosa'

In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
re.sub(r'[^a-z]', ' ', 'à')

' '

In [16]:
for idx, tweet in enumerate(dati['tweet_text']):
    if any(char in tweet for char in 'àáèéíìòóùú'):
        print(idx, tweet)
        break

4 @OfficialASRoma @OfficialRadja @leonsfdo non ce posso pensa non giochiamo #Radjanonsitocca non facciamo paragoni con altri giocatori andati lui per Roma è un altra cosa https://t.co/rgscxanuqh


In [17]:
clean_text = re.sub(r'@[a-z0-9]+\b', ' ', dati['tweet_text'][4].lower())
clean_text = re.sub(r'[^a-zàáèéíìòóùú0-9]', ' ', clean_text)
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
clean_text = unidecode(clean_text)

In [18]:
clean_text

'non ce posso pensa non giochiamo radjanonsitocca non facciamo paragoni con altri giocatori andati lui per roma e un altra cosa https t co rgscxanuqh'

In [19]:
def clean_text(text: str):
    clean_text = re.sub(r'@[a-z0-9]+\b', ' ', text.lower())
    clean_text = re.sub(r'[^a-zàáèéíìòóùú0-9]', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    clean_text = unidecode(clean_text)
    return clean_text

### Tokenisation

In [20]:
vocab = {'0': 0}
for tweet in dati['tweet_text']:
    pulito = clean_text(tweet).split()
    for word in pulito:
        if word not in vocab:
            vocab[word] = len(vocab)

In [21]:
vocab

{'0': 0,
 'g': 1,
 'higuain': 2,
 'auguri': 3,
 'pipita': 4,
 'sempre': 5,
 'con': 6,
 'noi': 7,
 'paudybala': 8,
 'jr': 9,
 'nano': 10,
 'puttano': 11,
 'd': 12,
 '10perotti': 13,
 'hyundai': 14,
 'italia': 15,
 'stasera': 16,
 'bravissimi': 17,
 'tutti': 18,
 'una': 19,
 'menzione': 20,
 'per': 21,
 'il': 22,
 'capitano': 23,
 'lucido': 24,
 'ed': 25,
 'onnipresente': 26,
 'populismo': 27,
 'di': 28,
 'basso': 29,
 'spessore': 30,
 'non': 31,
 'ce': 32,
 'posso': 33,
 'pensa': 34,
 'giochiamo': 35,
 'radjanonsitocca': 36,
 'facciamo': 37,
 'paragoni': 38,
 'altri': 39,
 'giocatori': 40,
 'andati': 41,
 'lui': 42,
 'roma': 43,
 'e': 44,
 'un': 45,
 'altra': 46,
 'cosa': 47,
 'https': 48,
 't': 49,
 'co': 50,
 'rgscxanuqh': 51,
 'stop': 52,
 'altrimenti': 53,
 'tanto': 54,
 'vale': 55,
 'prendere': 56,
 'immobikeno': 57,
 'belotti': 58,
 'gli': 59,
 '50': 60,
 'dove': 61,
 'li': 62,
 'prendi': 63,
 'sei': 64,
 'cane': 65,
 'piccola': 66,
 'cagna': 67,
 'spaventata': 68,
 'vorrei': 69,


### One-hot encoding

In [22]:
clean_text = re.sub(r'@[a-z0-9]+\b', ' ', dati['tweet_text'][4].lower())
clean_text = re.sub(r'[^a-zàáèéíìòóùú0-9]', ' ', clean_text)
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
clean_text = unidecode(clean_text)

In [23]:
clean_text

'non ce posso pensa non giochiamo radjanonsitocca non facciamo paragoni con altri giocatori andati lui per roma e un altra cosa https t co rgscxanuqh'

In [24]:
splitted = clean_text.split()
token_text = []
for word in splitted:
    token_text.append(vocab[word])

In [25]:
vocab['terrapiatta']

KeyError: 'terrapiatta'

In [28]:
vocab.get('terrapiatta') == None

True

In [29]:
splitted = clean_text.split()
token_text = []
for word in splitted:
    token_text.append(vocab.get(word, 0))

In [30]:
' '.join(str(token) for token in token_text)

'31 32 33 34 31 35 36 31 37 38 6 39 40 41 42 21 43 44 45 46 47 48 49 50 51'

In [31]:
token_text = [vocab.get(word, 0) for word in clean_text.split()]

In [32]:
' '.join(str(token) for token in token_text)

'31 32 33 34 31 35 36 31 37 38 6 39 40 41 42 21 43 44 45 46 47 48 49 50 51'

In [33]:
torch_text = torch.zeros((len(token_text), len(vocab)))

In [34]:
torch_text.shape

torch.Size([25, 111053])

In [35]:
for row, token in zip(torch_text, token_text):
    row[token] = 1.0

In [36]:
torch_text[0, 30:40]

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [37]:
testi = ['Il gatto e nero', 'Il cane mangia','Davide mangia il porco con il pane']

In [38]:
v = {}
for testo in testi:
    for parola in testo.split():
        if parola not in v:
            v.update({parola: len(v)})

In [39]:
v

{'Il': 0,
 'gatto': 1,
 'e': 2,
 'nero': 3,
 'cane': 4,
 'mangia': 5,
 'Davide': 6,
 'il': 7,
 'porco': 8,
 'con': 9,
 'pane': 10}

In [40]:
token_testo = [[v[parola] for parola in testo.split()] for testo in testi]

In [41]:
token_testo

[[0, 1, 2, 3], [0, 4, 5], [6, 5, 7, 8, 9, 7, 10]]

In [42]:
torch_testo = torch.zeros((len(token_testo[0]), len(v)))
torch_testo

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [43]:
for riga, token in zip(torch_testo, token_testo[0]):
    riga[token] = 1.0
torch_testo

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

In [44]:
sys.getsizeof(torch_text.storage())

11105356

### Bag of Words

Invece di usare un indice per ogni parola, si usa il conteggio della parola nel testo. Esempio <a>https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer</a>

### Embedding

In [45]:
torch.nn.Embedding

torch.nn.modules.sparse.Embedding