In [41]:
# Import all necessary libraries
from sklearn.preprocessing import LabelEncoder
import re
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [47]:
# Uses the Sentiment140 training corpus: 1,600,000 tweets, each labelled
# according to whether the tweet contained a :) or :( emoticon.
def createTrainingCorpus(corpusFile, perLabel=5000, labels=(0, 4)):
    """Build a class-balanced training sample from a Sentiment140-style CSV.

    Each usable row must have at least six columns: column 0 is the integer
    class label and column 5 is the tweet text. The file is streamed once and
    reading stops as soon as every requested label has ``perLabel`` tweets, so
    the full corpus is never held in memory.

    Args:
        corpusFile: Path to the CSV file.
        perLabel: Maximum number of tweets to keep for each label. The default
            (5000) gives at most 10,000 tweets for the two default labels;
            training on the whole corpus would take too long.
        labels: Labels to keep, in the order their tweets should appear in
            the result. Rows with any other label are ignored.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts: the first ``perLabel``
        tweets of ``labels[0]`` in file order, then those of ``labels[1]``, etc.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a row's first column is not an integer.
    """
    import csv
    import sys

    # The csv module needs a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3. latin-1 maps every byte to a
    # character, so decoding can never fail - the closest equivalent to the
    # raw bytes Python 2 hands back.
    if sys.version_info[0] >= 3:
        csvfile = open(corpusFile, 'r', newline='', encoding='latin-1')
    else:
        csvfile = open(corpusFile, 'rb')

    buckets = dict((label, []) for label in labels)
    with csvfile:
        lineReader = csv.reader(csvfile, delimiter=',', quotechar="\"")
        for row in lineReader:
            # Blank or truncated lines have no text column; skip them.
            if len(row) < 6:
                continue
            label = int(row[0])
            bucket = buckets.get(label)
            if bucket is None or len(bucket) >= perLabel:
                continue
            bucket.append({"text": row[5], "label": label})
            # Stop reading once every label has reached its quota.
            if all(len(kept) >= perLabel for kept in buckets.values()):
                break

    # Emit label by label so the result is grouped in the order of `labels`.
    trainingData = []
    for label in labels:
        trainingData.extend(buckets[label])
    return trainingData

# Location of the raw Sentiment140 CSV, and the balanced sample drawn from it.
corpusFile = "/Users/Josh/Desktop/Software Project/trainingCorpus/trainingData1600000.csv"
trainingData = createTrainingCorpus(corpusFile)

In [48]:
# Process tweets, remove non-dictionary words, punctuation, links etc.

class PreProcessTweets:
    """Turns raw tweet text into lists of stemmed, stopword-free tokens."""

    def __init__(self):
        # English stopwords, every punctuation character, and a few
        # tweet-specific tokens that carry no sentiment.
        self._stopwords=set(stopwords.words('english')+list(punctuation)+['rt', "'s", 'i'])
        # One stemmer is enough; it is reused for every tweet.
        self._stemmer=PorterStemmer()

    def processTweets(self, list_of_tweets):
        """Pre-process a batch of labelled tweets.

        Args:
            list_of_tweets: A list of dicts with the keys "text" (the raw
                tweet string) and "label".

        Returns:
            A list of dicts with the same keys, where "text" is now a list of
            processed tokens. Tweets left with no tokens are dropped.
        """
        processedTweets=[]
        for tweet in list_of_tweets:
            words=self._processTweet(tweet["text"])
            if len(words) > 0:
                processedTweets.append({"text":words,"label":tweet["label"]})
        return processedTweets

    def _processTweet(self, tweet):
        """Clean one tweet and return its list of stemmed tokens.

        Args:
            tweet: The raw tweet text.

        Returns:
            A list of lowercase, stemmed tokens with stopwords removed.
        """
        # Convert to lowercase
        tweet=tweet.lower()
        # Remove links
        tweet=re.sub(r'https?://[^\s]+','',tweet)
        # Remove @mentions
        tweet=re.sub(r'@[^\s]+','',tweet)
        # Replace #word with word
        tweet=re.sub(r'#([^\s]+)',r'\1',tweet)
        # Replace everything that is not a lowercase letter with a space
        tweet=re.sub("[^a-z]", " ",tweet)
        # Convert the tweet to a list of words
        words=word_tokenize(tweet)
        # Remove stopwords BEFORE stemming: the stopword list holds unstemmed
        # words, and stemming rewrites several of them (e.g. "this" -> "thi",
        # "was" -> "wa"), which would let them slip past a later check.
        words=[word for word in words if word not in self._stopwords]
        # Stem the words, i.e. reduce them to their root form.
        # For example: loving, loved will be converted to love
        stems=[self._stemmer.stem(word).strip() for word in words]
        # Drop empty tokens and anything whose stem is itself a stopword.
        return [stem for stem in stems if stem and stem not in self._stopwords]

In [49]:
# Run the preprocessor over the sampled training tweets.
tweetProcessor = PreProcessTweets()
ppTrainingData = tweetProcessor.processTweets(trainingData)

In [50]:
# Store the processed tweets in a CSV file. Processing the tweets takes
# considerable time, so saving the result means we only have to do it once.
def storePPTrainingData(ppTrainingData,tweetDataFile):
    """Write processed tweets to a two-column CSV file (text, sentiment).

    Args:
        ppTrainingData: A list of dicts with the keys "text" (a list of
            tokens) and "label" (4 for positive, 0 for negative).
        tweetDataFile: Path of the CSV file to create or overwrite.

    Each row holds the tokens joined into one string by ``_convertList`` and
    the label translated to "positive" or "negative". Writing is best-effort:
    a tweet that cannot be written (for example one with an unknown label) is
    reported with ``print`` and skipped, and the remaining tweets are still
    written.
    """
    import csv
    import sys

    # The csv module needs a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(tweetDataFile, 'w', newline='', encoding='utf-8')
    else:
        csvfile = open(tweetDataFile, 'wb')

    convertLabels={4:"positive",0:"negative"}
    with csvfile:
        linewriter=csv.writer(csvfile,delimiter=',',quotechar="\"")
        for tweet in ppTrainingData:
            try:
                linewriter.writerow([_convertList(tweet["text"]), convertLabels[tweet["label"]]])
            except Exception as e:
                # Deliberately best-effort: report the problem and carry on.
                print(e)
                
def _convertList(tweetList):
    sentence=''
    for word in tweetList:
        sentence += word + ' '
    return sentence
                
tweetDataFile="/Users/Josh/Desktop/Software Project/trainingCorpus/processedCorpus.csv"
storePPTrainingData(ppTrainingData,tweetDataFile)