### Ass01: Logistic Regression on sentiment analysis



In [1]:
### Import functions and data

import nltk
from os import getcwd
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import twitter_samples
import numpy as np

%matplotlib Inline
%config InlineBackend.figure_format='svg'

# download data
nltk.download('twitter_samples')
nltk.download('stopwords')

# prepare the data 
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data for test and trianing
train_pos=all_positive_tweets[:4000]
test_pos=all_positive_tweets[4000:]
train_neg=all_negative_tweets[:4000]
test_neg=all_negative_tweets[4000:]

train_x=train_pos+train_neg
test_x=test_pos+test_neg

# create numpy array
train_y = np.append(np.ones((len(train_pos),1)), np.zeros((len(train_neg),1)), axis=0)
test_y = np.append(np.ones((len(test_pos),1)), np.zeros((len(test_neg),1)), axis=0)
print("train_y.shape: " + str(train_y.shape))
print("test_y.shape: " + str(test_y.shape))

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\JZ\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JZ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


train_y.shape: (8000, 1)
test_y.shape: (2000, 1)


In [2]:
### create frequency dictionary

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, 
                              reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    # remove stopwords
    # remove punctuation
    for word in tweet_tokens:
        if word not in stopwords_english and\
            word not in string.punctuation:
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean

In [3]:
def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
                
    return freqs
                    

In [4]:
freqs = build_freqs(train_x, train_y)

print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11339


In [5]:
### apply function process_tweet (remove stop words and applies stemming)


# test the function below
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))


This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


### Part1: logistic regression 

In [6]:
### sigmoid

def sigmoid(z):
    return 1/(1+np.exp(-z))

In [7]:
### gradient descent function

def gradient(x,y,theta,alpha,num_iters):
    m=x.shape[0]
    
    for i in range(num_iters):
        z=np.dot(x,theta)
        h=sigmoid(z)
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))                                                    
        theta=theta-(alpha/m)*np.dot(x.transpose(),(h-y))
        
    J=float(J)
    
    return J,theta

In [8]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradient(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


### Part2: Extracting the features

In [9]:
def extract_features (tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1,3))
    
    x[0,0] = 1
    
    for word in word_l:
        x[0,1] += freqs.get((word, 1.),0)
        x[0,2] += freqs.get((word, 0.),0)
        
    assert(x.shape == (1,3))
    
    return x

In [10]:
# check the functions
tmpl = extract_features(train_x[0], freqs)
print(tmpl)

[[1.00e+00 3.02e+03 6.10e+01]]


In [11]:
# Check for when the words are not in the freqs dictionary
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


### Part3:  Training your model 

In [12]:
X=np.zeros((len(train_x),3))
for i in range(len(train_x)):
    X[i,:]=extract_features(train_x[i],freqs)
    
Y=train_y

J,theta=gradient(X,Y,np.zeros((3,1)),1e-9,1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24215613.
The resulting vector of weights is [7e-08, 0.00052391, -0.00055517]


### Part4: Test the logistic regression

In [13]:
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet, freqs)
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [14]:
def sentiment(x):
    return str('Good') if  x > 0.5 else str('Bed')

In [15]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.',\
              'great', 'not bad', 'worse', 'no']:
    print( '{} -> {} -> {}' .format(tweet,predict_tweet(tweet,freqs,theta)[0][0] ,sentiment(predict_tweet(tweet, freqs, theta)[0][0])))

I am happy -> 0.5185805809400451 -> Good
I am bad -> 0.49433913020077935 -> Bed
this movie should have been great. -> 0.5153314567145229 -> Good
great -> 0.5154641905857459 -> Good
not bad -> 0.49433913020077935 -> Bed
worse -> 0.49915944626344694 -> Bed
no -> 0.500000018129435 -> Good


In [16]:
### check performance using the test set

def test_lr(test_x, test_y, freqs, theta):
    y_hat = []
    
    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        y_hat.append(1) if y_pred > 0.5 else y_hat.append(0)
        
    accuracy = (y_hat == np.squeeze(test_y)).sum()/len(test_x)
    
    return accuracy
        

In [17]:
tmp_accuracy = test_lr(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


### Part5: Error Analysis

In [18]:
# Some error analysis done for you
print('Label Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = predict_tweet(x, freqs, theta)

    if np.abs(y - (y_hat > 0.5)) > 0:
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', process_tweet(x))
        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))

Label Predicted Tweet
THE TWEET IS: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
THE PROCESSED TWEET IS: ['truli', 'later', 'move', 'know', 'queen', 'bee', 'upward', 'bound', 'movingonup']
1	0.49996933	b'truli later move know queen bee upward bound movingonup'
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48663882	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370724	b"i'm play brain dot braindot"
THE TWEET IS: I'm p

In [19]:
# the meaning of the sentence is contextual 

### Part6: Predict with your own tweet

In [25]:
# Feel free to change the tweet below
my_tweet = 'I cried, but anyway, I laughed in the end. So this is a good moive!'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['cri', 'anyway', 'laugh', 'end', 'good', 'moiv']
[[0.50611339]]
Positive sentiment


In [8]:
### Another try out

from nltk.stem import PorterStemmer

words = ['health']

def process_word(words):
    stemmer = PorterStemmer()
    
    word_clean = []
    for word in words:
        stem_word = stemmer.stem(word)
        word_clean.append(stem_word)
        
    return word_clean

In [9]:
print(process_word(words))

['health']


In [5]:
conda install pyahocorasick

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.

Note: you may need to restart the kernel to use updated packages.



PackagesNotFoundError: The following packages are not available from current channels:

  - pyahocorasick

Current channels:

  - https://repo.anaconda.com/pkgs/main/win-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/win-64
  - https://repo.anaconda.com/pkgs/r/noarch
  - https://repo.anaconda.com/pkgs/msys2/win-64
  - https://repo.anaconda.com/pkgs/msys2/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [6]:
pip install pyahocorasick


Collecting pyahocorasick
  Using cached pyahocorasick-1.4.2.tar.gz (321 kB)
Building wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py): started
  Building wheel for pyahocorasick (setup.py): finished with status 'done'
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp38-cp38-win_amd64.whl size=38927 sha256=9fad550834f1098edf3e96683c9696bd29b7923924c5c650fd798e0a10a1c884
  Stored in directory: c:\users\jz\appdata\local\pip\cache\wheels\74\bc\b8\e5f739a84005620cfe66d3fcb8bb182e309d6056bc6700b60e
Successfully built pyahocorasick
Installing collected packages: pyahocorasick
Successfully installed pyahocorasick-1.4.2
Note: you may need to restart the kernel to use updated packages.
