# Backend Code to Classify Tweets
# Using Support Vector Machine Classification

#### SciKit Learn Library: http://scikit-learn.org/stable/index.html

### Future Features:
Save Classifier: http://scikit-learn.org/stable/modules/model_persistence.html

In [None]:
# Import all the necessary libraries
import pandas as pd
import numpy as np
import re
from string import punctuation
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import twitter

### Import Training Corpus:
This corpus has already been preprocessed by create_trainingCorpus

In [None]:
# Load the preprocessed training corpus. The column names are supplied
# explicitly, so the frame always has exactly the columns "Tweet" and
# "Sentiment".
data_set = pd.read_csv("processedCorpus.csv", names=["Tweet", "Sentiment"])
# Independent variable 'X': the tweet text, kept as a one-column DataFrame.
X = data_set[["Tweet"]]
# Dependent variable 'y': the sentiment label (positive / negative) as a Series.
y = data_set["Sentiment"]

In [None]:
# The labels arrive as text ("negative" / "positive"), but the Support Vector
# Classifier needs numbers, so LabelEncoder maps each category to an integer.
# Categories are numbered in alphabetical order: negative -> 0, positive -> 1.
#   fit           -> learns the category-to-number mapping; data is untouched
#   transform     -> applies the learned mapping to the data
#   fit_transform -> performs both steps in a single call
encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

In [None]:
# Plain Python list of the tweet strings, ready to be vectorized.
corpus = X["Tweet"].tolist()

###  Support Vector Machine Classifier

In [None]:
# CountVectorizer builds the bag-of-words model.
# max_features=1000 keeps only the 1000 terms with the highest term frequency
# across the corpus.
cv = CountVectorizer(max_features=1000)

In [None]:
# Bag of words -> https://machinelearningmastery.com/gentle-introduction-bag-words-model/
# cv.fit_transform learns the vocabulary from the corpus and returns the
# term-document matrix: one row per tweet, one column per vocabulary word,
# each entry holding how often that word occurs in that tweet.
# toarray() converts that matrix into a dense NumPy array.
# StandardScaler then rescales every column to mean 0 and standard deviation 1:
#   fit           -> compute each column's mean and standard deviation
#   transform     -> scale the data using those values
#   fit_transform -> both steps in one call
sc = StandardScaler()
X = sc.fit_transform(cv.fit_transform(corpus).toarray())

In [None]:
# Randomly split the feature array and the labels into train and test subsets.
# test_size=0.2 -> 80% is used for training, 20% is held back for testing.
# random_state=0 seeds the shuffle so the same split is produced on every run.
# X_train, Y_train: the training features and their labels.
# X_test, Y_test: the held-back features and their labels.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Support Vector Classifier from scikit-learn, created with default settings.
SVMClassifier=SVC()
# Train the classifier on the training features and their labels.
SVMClassifier.fit(X_train, Y_train)

#### Classifier Accuracy
For help understanding the confusion matrix - a tool for analysing accuracy:
https://en.wikipedia.org/wiki/Confusion_matrix

In [None]:
# Predict a label (0 = negative, 1 = positive) for every held-back test row.
Y_pred = SVMClassifier.predict(X_test)

In [None]:
# Compare the predictions for the test data with the actual labels.
# Accuracy = number of correct predictions / total number of predictions
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred) 

In [None]:
# Confusion matrix to analyse accuracy.
# scikit-learn puts the actual labels (0, 1) on the rows and the
# predicted labels (0, 1) on the columns.
# See https://en.wikipedia.org/wiki/Confusion_matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_pred)

### Test Classifier
Data entered here is not preprocessed, so may not be accurate.

In [None]:
# Test prediction on a single piece of text typed in by the user.
print("Predict The sentiment")
data = input("Enter your data to get the sentiment: ")
data = [data]
# Convert the text to the bag-of-words model as a dense NumPy array, then
# apply the same scaling that was fitted on the training data. The classifier
# was trained on scaled features, so feeding it raw counts would give
# unreliable predictions.
array = sc.transform(cv.transform(data).toarray())

r = SVMClassifier.predict(array)
# 0 is negative, 1 is positive
print(r)

### Get Test Data
Using python-twitter library to grab tweets - this avoids the annoyance of processing .JSON files.

In [None]:
# Twitter API keys are needed to download the tweets that get classified.
# The credentials are read from environment variables so that secrets are not
# stored in the source file; a missing variable raises KeyError.
# NOTE(review): the keys that used to be hard-coded here have been exposed in
# version history and should be revoked and regenerated.
import os

api = twitter.Api(consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
                  consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
                  access_token_key=os.environ['TWITTER_ACCESS_TOKEN_KEY'],
                  access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])

In [None]:
# Raw query strings for searching tweets about each subject.
# see: http://python-twitter.readthedocs.io/en/latest/searching.html
# They are kept in a bare string literal for reference only; this cell does
# not assign them to a name.
'''
Donald Trump = "l=&q=Donald%20Trump%20since%3A2018-06-05%20until%3A2018-06-06&count=100"
Hillary C = "l=&q=hillary%20clinton%20since%3A2018-06-05%20until%3A2018-06-06&count=100"
Barack Obama = "l=&q=barack%20obama%20since%3A2018-06-05%20until%3A2018-06-06&count=100"
Pope Francis = "l=&q=pope%20francis%20pontifex%20since%3A2018-06-05%20until%3A2018-06-06&count=100"
'''

In [None]:
# Process tweets, remove non-dictionary words, punctuation, links etc.

class PreProcessTweets:
    """Clean raw tweet text into space-separated, stemmed word strings."""

    def __init__(self):
        # English stopwords, punctuation marks and a few extra tokens
        # ('rt', "'s", 'i'); all of these are dropped from every tweet.
        self._stopwords=set(stopwords.words('english')+list(punctuation)+['rt', "'s", 'i'])
        # One stemmer is shared by all tweets instead of being rebuilt per tweet.
        self._stemmer=PorterStemmer()

    def processTweets(self, list_of_tweets):
        """Clean every tweet and drop the ones that end up empty.

        list_of_tweets: iterable of raw tweet strings.
        Returns a list of cleaned tweet strings, in input order.
        """
        cleaned=(self._processTweet(tweet) for tweet in list_of_tweets)
        return [tweet for tweet in cleaned if tweet]

    def _processTweet(self, tweet):
        """Return one tweet as a string of stemmed, lowercase words.

        tweet: the raw tweet text.
        Returns the remaining words joined by single spaces; the result is
        an empty string when nothing survives the cleaning.
        """
        # Convert to lowercase
        tweet=tweet.lower()
        # Remove links (raw strings keep the regex escapes valid)
        tweet=re.sub(r'https?://[^\s]+','',tweet)
        # Remove @mentions
        tweet=re.sub(r'@[^\s]+','',tweet)
        # Replace #word with word
        tweet=re.sub(r'#([^\s]+)',r'\1',tweet)
        # Replace everything that is not a lowercase letter (digits,
        # punctuation, unicode) with a space
        tweet=re.sub(r'[^a-z]',' ',tweet)
        # Split into words and stem each one. Stemming reduces a word to its
        # root form, for example: loving, loved -> love
        stems=(self._stemmer.stem(word).strip() for word in word_tokenize(tweet))
        # Drop empty tokens and stopwords; the check is made on the stemmed
        # form of each word.
        # Words that are not in WordNet are deliberately kept: filtering them
        # out could remove informal but emotional words such as "haha".
        return " ".join(stem for stem in stems if stem and stem not in self._stopwords)

In [None]:
def getSentiment():
    """Collect the sentiment result for each day in a fixed list of dates.

    The dates are hard-coded 'MM-DD' strings, newest first. Every pair of
    neighbouring dates is passed to _determineSentiment as
    (newer date, older date).

    Returns a list of [date, sentiment] pairs, one per pair of neighbouring
    dates, each labelled with the newer date.
    """
    dates = ['06-06', '06-05', '06-04', '06-03', '06-02', '06-01', '05-31', '05-30', '05-29', '05-28', '05-27']
    # zip(dates, dates[1:]) yields each date together with the one before it.
    return [
        [dateCurrent, _determineSentiment(dateCurrent, datePrev)]
        for dateCurrent, datePrev in zip(dates, dates[1:])
    ]
    

def _determineSentiment(dateCurrent, datePrev, query="pope%20francis%20pontifex"):
    """Return the percentage of fetched tweets that are classified as positive.

    Searches Twitter for tweets matching the search term, restricted to the
    window since 2018-<datePrev> until 2018-<dateCurrent>, preprocesses them,
    and classifies each distinct preprocessed tweet with the trained SVM.

    dateCurrent: end of the window as an 'MM-DD' string.
    datePrev: start of the window as an 'MM-DD' string.
    query: URL-encoded search term; defaults to the original Pope Francis search.

    Returns a float between 0 and 100, or None when there is nothing to
    classify (the search failed, returned no tweets, or every tweet was empty
    after preprocessing).
    """
    raw_query = ("l=&q=" + query + "%20since%3A2018-" + datePrev
                 + "%20until%3A2018-" + dateCurrent + "&count=100")

    tweetList = []
    # Best effort: a failed search is reported and treated as "no tweets", so
    # one bad day does not abort the whole run.
    try:
        # GetSearch returns twitter.Status objects; only the text is needed.
        for status in api.GetSearch(raw_query=raw_query):
            tweetList.append(status.text)
    except Exception as err:
        print("Error: {}".format(err))

    # Preprocess the tweets and drop duplicates.
    ppTestData = set(PreProcessTweets().processTweets(tweetList))
    if not ppTestData:
        # Nothing to classify; avoids a division by zero below.
        return None

    # Same pipeline as training: bag of words -> dense NumPy array -> scaling
    # to mean 0 / standard deviation 1, then one prediction call for all tweets.
    Features = sc.transform(cv.transform(list(ppTestData)).toarray())
    ResultLabels = list(SVMClassifier.predict(Features))

    # Share of positive (label 1) tweets; 100.0 forces float division on
    # Python 2 as well.
    return 100.0 * ResultLabels.count(1) / len(ResultLabels)

In [None]:
sentimentData=getSentiment()
# print() with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(sentimentData)

#### Sentiment CSV files for each file are made manually... 

#### Return the indices of the SVM labels where the sentiment is positive. These indices are then looked up in the testData to observe the accuracy against the testData

In [None]:
# Positions of the labels that were classified as positive (label 1).
# The earlier while loop never advanced its counter and so never terminated;
# enumerate walks the list exactly once.
# NOTE(review): in the code visible here ResultLabels is only assigned inside
# _determineSentiment - it must exist at module level for this cell to run.
indexElements = [index for index, label in enumerate(ResultLabels) if label == 1]

print(indexElements)