# Experiment for Master Thesis: A Emperical Study of Online Sentiment Analysis on Twitter Streams
Dataset: 【Sentiment140】with 1.6 million labeled tweets

In [17]:
# necessary package
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load Dataset

In [18]:
dataset = pd.read_csv('train_data.csv', encoding='latin-1')
dataset

Unnamed: 0,polarity,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
9993,4,Good Morning Tweople...
9994,4,really should sleep earlier...tonight's bed ti...
9995,4,@clyp1976 what you got??
9996,4,@hairdryerhewson Drink plenty of cranberry jui...


# Sentiment Analysis Step1： Data preprocessing

In [19]:
#clean the data
def clean(text, stem_words=True):
    import re    # for regular expressions
    from string import punctuation 
    from nltk.stem import SnowballStemmer    
    from nltk.corpus import stopwords      #if you want to remove stopwords
    from nltk.tokenize import word_tokenize 
    #remove stopwords UNCOMMENT TO SEE IT'S IMPACT ON BAG OF WORDS
    #stop = re.compile(r'\b('+ r'|'.join(stopwords.words('english'))+r')\b\s*')
    #text = stop.sub('', text)
    
    if type(text) != str or text=='':
        return ''
    # Clean the text (here i have 2-3 cases of pre-processing by sampling the data. You might need more)
    text = text.lower() #change to lower case
    text = re.sub("@\w+ ","", text) #removes all usernames in text
    text = re.sub("\'s", " ", text) 
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub('[^a-zA-Z]', ' ', text)  #remove numbers
    text = re.sub(r'\s+', ' ', text)      
    text = re.sub(r'[^\w\s]','',text)     #remove commas
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text) # remove comma between numbers, i.e. 15,000 -> 15000
    text = re.sub("[!~#$+%*:()'?-]", ' ', text)   # remove characters stated below
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    text = re.sub("[!~#$+%*:()'?-]", ' ', text) 
       
    ##Return a list of words
    return text

In [20]:
X = dataset['tweet']

In [1]:
import sys
sys.path

['/usr/lib/python36.zip',
 '/usr/lib/python3.6',
 '/usr/lib/python3.6/lib-dynload',
 '',
 '/home/huilin/.local/lib/python3.6/site-packages',
 '/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/home/huilin/.local/lib/python3.6/site-packages/IPython/extensions',
 '/home/huilin/.ipython']

In [21]:
corpus = []

import re
for i in range(0,9998):
    clean_text = re.sub(r'\W', ' ', str(X[i]))
    clean_text = re.sub(r'^br$', ' ', clean_text)
    clean_text = re.sub(r'\s+^br$\s+', ' ', clean_text)
    clean_text = re.sub(r'\s+[a-z]\s+', ' ', clean_text)
    clean_text = re.sub(r'^b\s+', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text) 
    clean_text = clean_text.lower()    
    corpus.append(clean_text)    
corpus[1]

' kenichan i dived many times for the ball managed to save 50 the rest go out of bounds'

# Sentiment Analysis Step2： Text Vectorization

In [22]:
y = dataset['polarity'].values
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
countvector    = CountVectorizer() 
tfidftransform = TfidfTransformer()
X = countvector.fit_transform(corpus).toarray() 
X = tfidftransform.fit_transform(X).toarray()
type(X)

numpy.ndarray

[0.0, 0.0, 0.0, Ellipsis, 0.0, 0.0, 0.0]

# Sentiment Analysis Step3： Model training

In [23]:
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(X,y, test_size=0.2, random_state = 0)
X_test_text[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [9]:
sourcedata = []
for i in range(len(X_train_text)):
    sourcedata.append((list(X_train_text[i]),int(y_train_text[i])))

In [9]:
with open("traindata", 'w') as abc:  #写入numpy.ndarray数据
    np.savetxt(abc, X_train_text, delimiter="\n")     #使用numpy.savetxt()写入数据，Data为要存的变量因为numpy.ndarray数                                    #据无法用write()写入,数据间用'，'相隔。


In [None]:
# 1. online learning strategy

In [71]:
import time
def stream(Xdata,Ydata,Xtest,Ytest,model):
    #time.sleep(2)
    begin = time.time()
    acc = []
    cal_time = []
    model.fit(Xdata[:300],Ydata[:300])
    y_initial = model.predict(X_test_text)
    #print(f"The initial accuracy of model {str(model)}  is: ", accuracy_score(y_initial,y_test_text))
    #print('===================================================')
    for i in range(1,1201):
        model.partial_fit(Xdata[300+(i-1)*100:300+i*100],Ydata[300+(i-1)*100:300+i*100])
        y_pred = model.predict(X_test_text)
        ACC = float(accuracy_score(y_pred,Ytest))
        end = time.time()
        cal_time.append(round(end-begin,2))
        acc.append(ACC)
    return acc,cal_time

In [72]:
# 2. offline learning strategy

In [73]:
import time
def batch(Xdata,Ydata,Xtest,Ytest,model):
    begin = time.time()
    ACC = []
    cal_time = []
    for i in range(0,1201):
        #time.sleep(2)
        model.fit(Xdata[:300+i*100],Ydata[:300+i*100])
        #print(f'offline SA calculation time: {end-begin} seconds')
        y_pred = model.predict(X_test_text)
        ac = float(accuracy_score(y_pred,Ytest))
        end = time.time()
        cal_time.append(round(end-begin,2))
        ACC.append(ac)
    return ACC,cal_time

In [25]:
# choose model
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
BNB = BernoulliNB()

In [26]:
BNB.fit(X_train_text[:100],y_train_text[:100])

BernoulliNB()

In [28]:
BNB.partial_fit([X_train_text[101]],[y_train_text[101]])

BernoulliNB()

In [32]:
X_train_text.shape

(7998, 17926)

In [29]:
import pickle
import redis
import logging
import _pickle as cPickle

r = redis.StrictRedis(host='localhost', port=6379, db=0)
try:
    r.set('osamodel', pickle.dumps(BNB, protocol=pickle.HIGHEST_PROTOCOL))
except (redis.exceptions.RedisError, TypeError, Exception):
    logging.warning('无法连接 Redis 以存储模型数据')


In [30]:
try:
    called_model = pickle.loads(r.get('osamodel'))
except TypeError:
    logging.info('Redis 内没有指定名称的模型，因此初始化一个新模型')
except (redis.exceptions.RedisError, TypeError, Exception):
    logging.warning('Redis 出现异常，因此初始化一个新模型')
finally:
    called_model = called_model or BernoulliNB

In [31]:
y_pre = called_model.predict(X_test_text[:500])
acc = float(accuracy_score(y_pre,y_test_text[:500]))
acc

0.73

In [53]:
def test(a,b):
    return (a,b)
type(test(1,2))

tuple

In [75]:
print('==========Experiment with 120,000 tweets===========')
print('Every second comes 100 tweets')
print('Algorithm: Bernoulli Naive Bayes')
print('==========   Experiment results  ===========')
start = time.time()
stream(X_train_text,y_train_text,X_test_text,y_test_text,BNB)
end = time.time()
print(f'Online Sentiment Analysis spent: {end-start} seconds')
start = time.time()
batch(X_train_text,y_train_text,X_test_text,y_test_text,BNB)
end = time.time()
print(f'Offline Sentiment Analysis spent: {end-start} seconds')

Every second comes 100 tweets
Algorithm: Bernoulli Naive Bayes


ValueError: Found array with 0 sample(s) (shape=(0, 17926)) while a minimum of 1 is required.