In [1]:
# importing necessary libraries

import nltk, re, string
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pickle

# Data preprocessing

In [2]:


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order:
      1. remove ``$`` together with any word characters that follow it,
      2. remove a leading ``RT`` marker,
      3. remove hyperlinks,
      4. remove ``#`` characters (the tagged word itself is kept),
      5. tokenize with NLTK's ``TweetTokenizer`` (lower-cased, @handles
         stripped, ``reduce_len=True``),
      6. drop English stop words and punctuation, Porter-stem the rest.

    Args:
        tweet: Raw tweet text.

    Returns:
        list[str]: Stemmed tokens, in their original order.
    """
    stemmer = nltk.PorterStemmer()
    # The corpus reader returns a list; a set makes the per-token
    # membership test below constant-time instead of a linear scan.
    stopwords_english = set(stopwords.words('english'))

    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # NOTE(review): `.*` is greedy, so this drops everything from the URL to
    # the end of the line, not only the URL. Pattern left unchanged so that
    # previously computed frequency tables remain reproducible.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `string.punctuation` is a str, so the second test is a substring check.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]


# Créer la fonction fréquence

On va définir une fonction qui compte, pour chaque couple (mot, étiquette), le nombre d'occurrences du mot dans le jeu de données. Cette étape est cruciale, car le modèle sera entraîné sur les fréquences qui en ressortiront.

In [3]:
def build_freqs(tweets, ys):
    """Count occurrences of every (stemmed word, label) pair.

    Args:
        tweets: Iterable of raw tweet strings.
        ys: Labels aligned with ``tweets``: a list, or an array that
            squeezes to one label per tweet (e.g. shape ``(m,)`` or
            ``(m, 1)``).

    Returns:
        dict: ``{(word, label): count}`` where ``count`` is how many times
        ``word`` (after ``process_tweet``) appears in tweets with ``label``.
    """
    # np.squeeze collapses a single label to a 0-d array whose .tolist() is a
    # bare scalar that zip() cannot iterate; atleast_1d keeps it a 1-item list.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [4]:
# Fetch the NLTK stop-word corpus read by process_tweet via
# stopwords.words('english'). nltk is already imported in the first cell,
# so the import below is redundant but harmless.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Ci-dessous, un exemple du fonctionnement de la fonction :

In [5]:


# Toy corpus: one positive tweet (label 1) followed by four negative ones
# (label 0); 'i am tired' is repeated so one pair reaches a count of 2.
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
# Keys of the result are (stemmed word, label) pairs; 'i' and 'am' are
# filtered out by process_tweet and do not appear.
res = build_freqs(tweets, ys)
print(res)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [6]:
# Fetch the NLTK `twitter_samples` corpus that is loaded further down with
# twitter_samples.strings(...). nltk is already imported in the first cell,
# so the import below is redundant but harmless.
import nltk
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

On charge le jeu de données dans deux variables : l'une pour les tweets positifs, l'autre pour les tweets négatifs.

In [7]:

# Load the raw tweet texts of each class from the twitter_samples corpus.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [8]:

# The first TRAIN_PER_CLASS tweets of each class are used for training;
# whatever remains in each class is held out for testing.
TRAIN_PER_CLASS = 4000

train_pos, test_pos = all_positive_tweets[:TRAIN_PER_CLASS], all_positive_tweets[TRAIN_PER_CLASS:]
train_neg, test_neg = all_negative_tweets[:TRAIN_PER_CLASS], all_negative_tweets[TRAIN_PER_CLASS:]

In [9]:
# Positive tweets come first, negatives second; the label vectors built in
# the next cell rely on this same ordering.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [10]:

# Column vectors of labels: a block of ones (positive) stacked on top of a
# block of zeros (negative), one row per tweet in the matching split.
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))),
    axis=0,
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))),
    axis=0,
)

In [11]:

# (word, label) -> count table, built from the training split only.
freqs = build_freqs(train_x, train_y)

In [12]:
# Sanity check: container type and number of distinct (word, label) keys.

print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 11337


In [13]:
# Try out process_tweet, which performs the tokenization and stemming,
# on one training tweet and show the text before and after.

print('This is an example of a positive tweet: \n', train_x[22])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[22]))

This is an example of a positive tweet: 
 @gculloty87 Yeah I suppose she was lol! Chat in a bit just off out x :))

This is an example of the processed version of the tweet: 
 ['yeah', 'suppos', 'lol', 'chat', 'bit', 'x', ':)']


In [14]:

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of ``z``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow, whereas
    # exp(-z) does (with a RuntimeWarning) once z is below roughly -709.
    e = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook formula; for z < 0 it is the
    # algebraically identical form exp(z) / (1 + exp(z)).
    h = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # [()] unwraps a 0-d result to a NumPy scalar and leaves arrays as they are.
    return h[()]

In [15]:

def _logistic_cost(h, y, m):
    """Binary cross-entropy of predictions ``h`` against ``y``, averaged over ``m`` rows."""
    return -1. / m * (np.dot(y.transpose(), np.log(h)) + np.dot((1 - y).transpose(), np.log(1 - h)))


def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: Feature matrix with one row per example.
        y: Labels aligned with the rows of ``x``.
        theta: Initial weight vector.
        alpha: Learning rate.
        num_iters: Number of update steps to perform.

    Returns:
        tuple: ``(cost, theta)``. ``theta`` holds the weights after all
        updates. ``cost`` is a Python float: the cost computed in the final
        iteration, i.e. with the weights as they were just before the last
        update. If ``num_iters`` is not positive, it is the cost of the
        initial ``theta``.
    """
    m = x.shape[0]
    cost = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        # Cost is evaluated before the update, as the loop has always done.
        cost = _logistic_cost(h, y, m)
        theta = theta - (alpha / m) * np.dot(x.transpose(), (h - y))

    if cost is None:
        # No iteration ran: report the cost of the untouched initial weights
        # rather than failing on an unbound local.
        cost = _logistic_cost(sigmoid(np.dot(x, theta)), y, m)

    # Squeeze first: float() on an array with ndim > 0 is deprecated in NumPy.
    cost = float(np.squeeze(cost))
    return cost, theta


In [16]:


def extract_features(tweet, freqs):
    """Turn a tweet into the 3-feature row used by the classifier.

    Args:
        tweet: Raw tweet text.
        freqs: Mapping ``(word, label) -> count``.

    Returns:
        numpy.ndarray of shape ``(1, 3)``: ``[bias, positive_total,
        negative_total]`` where ``bias`` is 1 and each total sums, over the
        processed tokens of the tweet, the count stored under label ``1.0``
        or ``0.0`` respectively (0 for unseen pairs).
    """
    tokens = process_tweet(tweet)

    positive_total = sum(freqs.get((token, 1.0), 0) for token in tokens)
    negative_total = sum(freqs.get((token, 0.0), 0) for token in tokens)

    x = np.array([[1, positive_total, negative_total]], dtype=float)

    assert (x.shape == (1, 3))
    return x

In [17]:


# Feature row for one training tweet: [bias, positive count sum, negative count sum].
tmp1 = extract_features(train_x[22], freqs)
print(tmp1)

[[1.000e+00 3.006e+03 1.240e+02]]


In [18]:

# Design matrix: one 3-feature row (see extract_features) per training tweet.
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs)


Y = train_y

# Start from all-zero weights; learning rate 1e-9, 1500 update steps.
# J is the cost reported by gradientDescent, theta the fitted weights.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

In [19]:
def predict_tweet(tweet, freqs, theta):
    """Score a tweet with the logistic-regression model.

    Args:
        tweet: Raw tweet text.
        freqs: Mapping ``(word, label) -> count`` used to build the features.
        theta: Weight vector of the model.

    Returns:
        The sigmoid of the dot product between the tweet's feature row and
        ``theta``.
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)


In [20]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Measure classification accuracy on a labelled set of tweets.

    Args:
        test_x: Sequence of raw tweet strings.
        test_y: Labels aligned with ``test_x``.
        freqs: Mapping ``(word, label) -> count`` used to build the features.
        theta: Weight vector of the model.

    Returns:
        Fraction of tweets whose thresholded prediction (1 when the score
        exceeds 0.5, otherwise 0) equals the corresponding label.
    """
    y_hat = [
        1 if predict_tweet(tweet, freqs, theta) > 0.5 else 0
        for tweet in test_x
    ]

    n_correct = (y_hat == np.squeeze(test_y)).sum()
    return n_correct / len(test_x)

In [21]:
# Accuracy of the fitted weights on the held-out test split.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [22]:


def pre(sentence):
    """Label a sentence as positive, neutral or negative.

    Uses the notebook-level ``freqs`` and ``theta`` to score the sentence
    with ``predict_tweet``.

    Args:
        sentence: Raw text to classify.

    Returns:
        str: ``'Positive sentiment'`` when the score is above 0.5,
        ``'Neutral sentiment'`` when it sits exactly on the 0.5 decision
        boundary, and ``'Negative sentiment'`` otherwise.
    """
    # Reduce the prediction to a plain float so the comparisons below are
    # unambiguous scalar comparisons.
    yhat = float(np.squeeze(predict_tweet(sentence, freqs, theta)))
    if yhat > 0.5:
        return 'Positive sentiment'
    # The neutral point of a sigmoid score is 0.5. A score of 0 can only
    # arise from underflow on a strongly negative input, so it belongs in
    # the negative branch rather than being reported as neutral.
    elif yhat == 0.5:
        return 'Neutral sentiment'
    else:
        return 'Negative sentiment'

In [23]:
# Classify a free-form sentence with the trained model.
my_tweet = 'It is so hot today but it is the perfect day for a beach party'

# Note: this rebinds `res`, which earlier held the build_freqs demo dict.
res = pre(my_tweet)
print(res)

Positive sentiment
