In [117]:
################
## Backend Code to Classify Tweets
## Using Support Vector Machine Classification
################

In [None]:
## Save Classifier 
## http://scikit-learn.org/stable/modules/model_persistence.html

In [1]:
# Import all the necessary libararies
import pandas as pd
import numpy as np
import re
from string import punctuation
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import twitter



In [2]:
############
# Import Training Corpus
############
# Get the data-set
data_set = pd.read_csv("processedCorpus.csv", names=["Tweet", "Sentiment"])
# This is our independent variable 'X'- The tweet data
# Using Pandas library get the tweet text from the first column of csv file
# [row,column]
X = data_set.iloc[:, :-1]
# This is our dependent variable 'y'- Positive Negative
# Use Pandas library to get the tweet from the last column
y = data_set.iloc[:, -1]

In [3]:
# Convert the Categroical data (negative and positive) to numerical data 0 and 1
# Support Vector CLassifier needs floats to classify data, not strings.
# LabelEncoder does encodes the categorical data here two categories(negative, positive)
# to numerical data 0 and 1. The categorical data is converted into numerical data alphabetically.
# There are three common methods used here: fit, fit_transfrom, transform in this LabelEncoder class
# method: fit = > will not modify the data but will convert them into numerical value into memory.
# method: transform => will use numerical values from the memory to convert the text data to numerical data.
# method: fit_transform  => does both fit and transform
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [4]:
corpus = list(X["Tweet"]) # convert data to list

In [5]:
# Count vectorizer creates the bag of words model
# 1500 most common words is taken, takes in to account term frequency
cv = CountVectorizer(max_features=1000)

In [6]:
# All the text data is converted into bag of words model
# bag of words -> https://machinelearningmastery.com/gentle-introduction-bag-words-model/
# represents training data as the absence or occurance of words in feature vector, 1's and 0's
# Use sklearn's libaray to create a bag of words, term document matrix
# use numPy to convert bag of words to ndimensional array
X = cv.fit_transform(corpus).toarray()
# Scale the model. Transform data so that the mean value is 0, standard dev of 1
# fit -> Calculate mean and standard deviation
# transform -> use the values to scale the data as above
# fit_transform -> combine the two above steps
sc = StandardScaler()
X = sc.fit_transform(X)



In [7]:
# Splits ndimensional array into random train and test subsets.
# Split 80% to training, 20% for testing.
# X_train, Y_train, 80% of training data and their values.
# X_test, Y_test, remaining 20% of training data and their values.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
# Support Vector Classification library, scikitlearn.
SVMClassifier=SVC()
# Train the classifier with trainingdata
SVMClassifier.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# Predict whether the test data is positive or negative
Y_pred = SVMClassifier.predict(X_test)

In [10]:
# Compare prediction of the test data with the actual labels.
# Accuracy = amount that is correct/total amount
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred) 

0.6862745098039216

In [11]:
# Confusion matrix to analyse accuracy
# Actual 0,1 being columns
# Predicted 0,1 being rows
# See https://en.wikipedia.org/wiki/Confusion_matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_pred)

array([[658, 325],
       [299, 707]], dtype=int64)

In [13]:
# Test prediction
print("Predict The sentiment")
data = input("Enter your data to get the sentiment: ")
data = [data]
# Convert data to bag of words model, put that
# in ndimensional numPy array
array = cv.transform(data).toarray()

r = SVMClassifier.predict(array)
print(r)
print(data)

Predict The sentiment
Enter your data to get the sentiment: 'unhappy sad kill'
[1]
['unhappy sad kill']


In [43]:
#############
## Get Test Data
#############

In [14]:
# Twitter API keys are needed to access tweets - both from feature vector and 
# tweets for classification.
api = twitter.Api(consumer_key='6b9ebwaNU4DDa9G9xF1FhrZQt',
                 consumer_secret='2prgddykMj2b7b9zTeN78BBhdrgdaNxjtSyyoo8iNRzKAZhzMX',
                 access_token_key='817521154847969280-bc6J796tc0cRjlhigiRZIoQVIzeW2Hf',
                 access_token_secret='WAZy2gZ9Ok8NdP3W8TOMNliSUUGrLjesudvqA3nEEh9wH')

In [22]:
## Function accepts search term and then fetches the tweets for that term
def createTestData(search_string):
    try:
        tweets_fetched=api.GetSearch(search_string, count=500)
        # This will return a list with twitter.Status objects. These include
        # text, hashtags etc of the tweets that are fetched. 
        print("num tweets: "+str(len(tweets_fetched))+" term: "+search_string)
        # Since these tweets don't have sentiment labels yet we will 
        # keep the label empty
        return [status.text for status in tweets_fetched]
    except:
        print("Error")
        return None
    
search_string=input("Search for: ")
testData=createTestData(search_string)

Search for: '@HillaryClinton'
num tweets: 100 term: @HillaryClinton


In [23]:
# Process tweets, remove non-dictionary words, punctuation, links etc.

class PreProcessTweets:
    def __init__(self):
        self._stopwords=set(stopwords.words('english')+list(punctuation)+['rt', "'s", 'i'])
        
    def processTweets(self, list_of_tweets):
        # The list of tweets is a list of dictionaries which has the keys, "text" and "label"
        processedTweets=[]
        # Each tuple is a list of words + label.
        for tweet in list_of_tweets:
            processedTweet=self._processTweet(tweet)
            if len(processedTweet) > 0:
                processedTweets.append(processedTweet)  
        return processedTweets
    
    def _processTweet(self, tweet):
        # Convert to lowercase
        tweet=tweet.lower()
        # RemoveLinks
        tweet=re.sub('https?://[^\s]+','',tweet)
        # Remove '@' 
        tweet=re.sub(r'@[^\s]+','',tweet)
        # Replace #word with word
        tweet=re.sub(r'#([^\s]+)',r'\1',tweet)
        # Remove non-letters - unicode, random numbers
        tweet=re.sub("[^a-z]", " ",tweet)
        # Converts tweet to list of words
        tweet=word_tokenize(tweet)
        # Stem the words
        # Stemming is the process of converting words into their root form
        # For example: loving, loved will be converted to love
        stemmer=PorterStemmer()
        tweet=[stemmer.stem(word) for word in tweet]
        stripper = lambda word: word.strip()
        tweet = list(map(stripper, tweet))
        tweet = filter(None, tweet)
        # Remove stopwords
        tweet=[word for word in tweet if word not in self._stopwords]
        tweet= " ".join(tweet)
        return tweet
        '''
        # If word is not in wordnet, remove it.    
        for index, word in enumerate(tweet):
            if len(word) < 3:
                tweet[index] = 'i'
            var = wn.synsets(word)[:1] 
            if len(var) < 1:
                tweet[index] = 'i'
        # Rerun stopwords check as words that weren't in the dict were replaced
        # with 'i'... part of stopwords list
        return [word for word in tweet if word not in self._stopwords] '''

In [24]:
# Call Preprocessor
tweetProcessor=PreProcessTweets()
ppTestData=tweetProcessor.processTweets(testData)  

In [25]:
print(ppTestData)

[u'trump declar abov law today respons like see democrat dem congress sta', u'probabl becaus like hi favorit candid friend', u'say impeach hear would begun alreadi democrat right', u'thank hi team stakehold join us terrif meet thi', u'trump declar abov law today respons like see democrat dem congress stage walk', u'thi good depress', u'rememb time peopl said go away say aga', u'nachoqueen wash russianvodka', u'kahn wa lawyer kuru queen fukum', u'feliz de que ya sea miembro de voce vital maia', u'say fact', u'democrat rig elect favor sinc begin great republ republican alway fo', u'qanon post click pic pedovor pedogateisr houseofcard ticktock', u'ever fire lie dishonest uneth law violat lawyer ha grate sh', u'excel pardon ex navi sailor sue', u'excel pardon ex navi sailor sue', u'ever fire lie dishonest uneth law violat lawyer ha grate sh', u'surpris america dear involv thi sick mess read surpris g', u'thank hi team stakehold join us terrif meet thi', u'want say thank time appreci done c

In [26]:
# Run the classifier on downloaded tweets
ResultLabels=[]
ppTestData = set(ppTestData)
for tweet in ppTestData:
    Features=cv.transform([tweet]).toarray()
    Features = sc.transform(Features)
    ResultLabels.append(SVMClassifier.predict(Features)[0])
    
# Get sentiment positivity
if ResultLabels.count(1)>ResultLabels.count(0):
    print("Result Positive Sentiment: " + str(100*ResultLabels.count(1)/len(ResultLabels)))
else:
     print("Result Negative Sentiment: " + str(100*ResultLabels.count(0)/len(ResultLabels)))

Result Positive Sentiment: 69


In [21]:
print ResultLabels[:10]

[0, 0, 0, 1, 1, 0, 1, 0, 1, 1]
