# Sentiment Analysis con SVM
Vamos a realizar un análisis de sentimiento sobre tweets utilizando un modelo Bag of Words con máquina de vectores de soporte.

Usaremos el dataset de tweets de ejemplo de [NLTK](http://www.nltk.org) y, más adelante, las herramientas de NLTK para preprocesar los tweets.

Probaremos a contar tokens con CountVectorizer de [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) y a calcular coeficientes TF-IDF con TfidfVectorizer de [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

Finalmente construiremos un modelo de máquina de vectores de soporte y lo utilizaremos para hacer análisis de sentimiento sobre los tweets.

## Dataset

In [3]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np

In [4]:
# Fetch the NLTK `twitter_samples` corpus into the local nltk_data directory.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Marti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [5]:
# Load the raw tweet texts for each sentiment class from the NLTK corpus.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
# Positives come first, then negatives; `labels` follows the same order.
tweets = all_positive_tweets + all_negative_tweets
# 1.0 marks a positive tweet, 0.0 a negative one (1-D float array).
labels = np.append(np.ones((len(all_positive_tweets))), np.zeros((len(all_negative_tweets))))

## Preprocesamiento

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words demo on the first 10 tweets only: word-level tokens, unigrams.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
# Learn the vocabulary and build the document-term count matrix in one step.
X = vectorizer.fit_transform(tweets[:10])
# Last expression of the cell, so the learned vocabulary is displayed.
vectorizer.get_feature_names_out()

array(['02392441234', '15', '97sides', 'able', 'accnt', 'act', 'am',
       'amazing', 'an', 'and', 'app', 'are', 'as', 'assist', 'bayan',
       'be', 'being', 'bhaktisbanter', 'bleed', 'blue', 'but', 'buying',
       'bye', 'call', 'calling', 'centre', 'co', 'community', 'congrats',
       'contact', 'customers', 'days', 'dd', 'despiteofficial', 'don',
       'ebz0l2venm', 'engaged', 'enjoy', 'enough', 'entering', 'etl',
       'fb', 'flipkartfashionfriday', 'followfriday', 'for',
       'france_inte', 'friday', 'go', 'got', 'had', 'happy', 'has',
       'have', 'hey', 'hope', 'house', 'how', 'http', 'https',
       'impatientraider', 'implies', 'in', 'irresistible', 'is', 'james',
       'jgh', 'just', 'katamari', 'keep', 'lamb2ja', 'last', 'layer',
       'like', 'listen', 'long', 'lovely', 'lwwf', 'many', 'mark',
       'members', 'milipol_paris', 'mischievousness', 'must', 'my',
       'name', 'new', 'night', 'not', 'odd', 'of', 'on', 'one', 'our',
       'pallaviruhail', 'pkuchl

In [11]:
# Convert the count matrix to a dense array and print it (one row per tweet).
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 2]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Same demo with TF-IDF weights instead of raw counts (default settings).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tweets[:10])
# NOTE(review): the return value of this call is discarded; it is not the last
# expression of the cell, so nothing is displayed for it.
vectorizer.get_feature_names_out()
# (number of tweets, vocabulary size)
print(X.shape)

(10, 126)


In [14]:
# Display the TF-IDF matrix as a dense array (one row per tweet).
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.2271999 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.16897527],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.368937  ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Entrenamiento y evaluación del modelo

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [16]:
# Inputs are the raw tweet strings; vectorisation happens inside the pipelines
# below, so it is fitted on the training split only.
X = tweets
y = labels
# Hold out 20% for testing while preserving the class ratio (stratify=y).
# NOTE(review): no random_state is passed, so the split (and the scores that
# follow) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [17]:
# Unigram + bigram counts fed into an SVC with default hyperparameters.
estimator = Pipeline([("cv", CountVectorizer(ngram_range=(1, 2))), ("svm", SVC())])
estimator.fit(X_train, y_train)
# Evaluate on the held-out tweets and print the resulting score.
score = estimator.score(X_test, y_test)
print(score)

0.7825


In [18]:
# Same pipeline as before, but with TF-IDF weights over unigrams and bigrams.
estimator = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1, 2))), ("svm", SVC())])
estimator.fit(X_train, y_train)
# Evaluate on the held-out tweets and print the resulting score.
score = estimator.score(X_test, y_test)
print(score)

0.804


## Con el preprocesamiento completo (con stemming)

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import numpy as np
import re
import string

In [20]:
# Fetch the NLTK stopword lists read by `stopwords.words(...)` in process_tweet.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
def process_tweet(tweet, lang='english'):
    """Clean, tokenize and stem a single tweet.

    The text is stripped of stock tickers (``$GE``), a leading ``RT`` marker,
    http(s) URLs and ``#`` signs. It is then tokenized with NLTK's
    ``TweetTokenizer`` (lower-cased, @handles removed, elongated words
    shortened). Stopwords and punctuation tokens are dropped and every
    remaining token is stemmed with the Porter stemmer.

    Input:
        tweet: a string containing a tweet
        lang: name of the NLTK stopword list to use, default='english'.
            Only the stopword list depends on it; the stemmer is always
            ``PorterStemmer``.
    Output:
        A list of stemmed tokens for the processed tweet
    """
    # A set gives O(1) membership tests; the plain list returned by NLTK would
    # be scanned once per token.
    stopword_set = set(stopwords.words(lang))
    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    tweet = re.sub(r'\$\w*', '', tweet)  # stock-market tickers such as $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # old-style retweet prefix
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)  # keep the hashtag word, drop only the '#'

    # `token not in string.punctuation` is a substring test against the
    # punctuation string: single punctuation characters (and contiguous runs
    # such as "()") are removed, while emoticons like ":)" are kept.
    return [
        stemmer.stem(token)
        for token in tokenizer.tokenize(tweet)
        if token not in stopword_set and token not in string.punctuation
    ]

In [22]:
def build_freqs(tweets, ys, process_fn=None):
    """Count how often each processed word occurs under each sentiment label.

    Input:
        tweets: a list of tweets
        ys: the sentiment label of each tweet (either 0 or 1); any array-like
            of m labels is accepted, e.g. shape (m,), (m, 1) or a plain list
        process_fn: callable turning one tweet into a list of words.
            Default: the module-level ``process_tweet``
    Output:
        A dictionary mapping each (word, sentiment) pair to its frequency
    """
    if process_fn is None:
        # Resolved at call time so the default always follows process_tweet.
        process_fn = process_tweet

    # ravel (rather than squeeze) always yields a 1-D sequence, so a single
    # tweet/label pair works too; squeeze would collapse it to a scalar that
    # cannot be zipped.
    labels_flat = np.ravel(ys).tolist()

    freqs = {}
    for label, tweet in zip(labels_flat, tweets):
        for word in process_fn(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [23]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    '''
    Turn one tweet into its two frequency-based features.

    Input:
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        process_tweet: process tweet function. Default: process_tweet
    Output:
        A feature vector of dimension (1, 2): the summed positive-label counts
        and the summed negative-label counts of the tweet's processed words
    '''
    positive_total = 0.0
    negative_total = 0.0
    # Repeated words contribute once per occurrence; unseen pairs count as 0.
    for token in process_tweet(tweet):
        positive_total += freqs.get((token, 1), 0)
        negative_total += freqs.get((token, 0), 0)
    # Single-row 2-D array so it can be assigned straight into a feature matrix.
    return np.array([[positive_total, negative_total]])

In [24]:
freqs = build_freqs(tweets, labels)
X = np.zeros((len(tweets), 2))
for i in range(len(tweets)):
    X[i, :]= extract_features(tweets[i], freqs)
y = labels

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Standardise the two frequency features, then fit an SVC with default
# hyperparameters.
estimator = Pipeline([("std", StandardScaler()), ("svm", SVC())])
estimator.fit(X_train, y_train)
# Evaluate on the held-out rows and print the resulting score.
score = estimator.score(X_test, y_test)
print(score)

0.991
