In [6]:
import re
import pandas as pd
import nltk
nltk.download()

In [55]:
RE_SPACE = re.compile("\s+")
RE_HASHTAG = re.compile("[@#][_a-z0-9]+")
RE_EMOTICON = re.compile("(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))")
RE_HTTP = re.compile("http(s)?://[/\.a-z0-9]+")

In [44]:
def normalize(text):
    text = text.strip().lower()
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    text = text.replace('&pound;', u'£')
    text = text.replace('&euro;', u'€')
    text = text.replace('&copy;', u'©')
    text = text.replace('&reg;', u'®')
    return text

In [68]:
def tokenize(text):
    tokenizer = nltk.tokenize.TweetTokenizer()
    text = tokenizer.tokenize(text)
    return text

In [78]:
def preStemClean(text):
    words = []
    for word in text:
        word = word.replace('#', '')
        word = word.replace('@', '')
        if RE_HTTP.match(word) == None:
            words.append(word)
    return words

In [72]:
def stem(text):
    stemmer = nltk.PorterStemmer()
    words = []
    for word in text:
        words.append(stemmer.stem(word))
    return words

In [101]:
def postStemClean(text):
    #stopwords = nltk.corpus.stopwords.words('english')
    stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him",
            "himself", "his", "how", "i", "in", "into", "is", "it", "its", "itself", "let", "me", "more", "most", "my",
            "myself", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "sha",
            "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves",
            "then", "there", "there's", "these", "they", "this", "those", "through", "to", "until", "up", "very",
            "was", "we", "were", "what", "when", "where", "which", "while", "who","whom", "with", "would", "you",
            "your", "yours", "yourself", "yourselves",
            "n't", "'s", "'ll", "'re", "'d", "'m", "'ve",
            "above", "again", "against", "below", "but", 
                 #"cannot", 
            "down", "few", "if", "no", "nor", 
                 #"not", 
            "off", "out", "over", "same", "too", "under", 
                 #"why"
            "may"
                ]
    punctuation = ['.', ',', ';', ':', '&', '-', '->', '/', '\\']
    words = []
    for word in text:
        if word not in stopwords and word not in punctuation:
            words.append(word)
    return words

In [112]:
from collections import Counter
def prepareTweets(documents):
    wordCounter = Counter()
    tweets = []
    for i in documents.index:
        tweet = normalize(documents[i])
        tweet = tokenize(tweet)
        tweet = preStemClean(tweet)
        tweet = stem(tweet)
        # moze lematyzacja?
        tweet = postStemClean(tweet)
        wordCounter.update(tweet)
        tweets.append(tweet)
    return tweets, wordCounter

In [95]:
def getFeatureDictionary(wordCounter, minWordCount):
    commonWords = list([k for k, v in wordCounter.most_common() if v > minWordCount])
    dictionary = {}
    for word in commonWords:
        dictionary[word] = len(dictionary)
    return dictionary

In [135]:
from scipy.sparse import csr_matrix
def createBagOfWords(tweets, features, inputLabels):
    row = []
    col = []
    data = []
    outputLabels = []
    for i in range(len(tweets)):
        if not inputLabels.empty:
            outputLabels.append(inputLabels[i])
        tokens = tweets[i]
        
        for token in set(tokens):
            if token not in features:
                continue
            row.append(i)
            col.append(features[token])
            data.append(1)
    return csr_matrix((data, (row, col)), shape=(len(tweets), len(features))), outputLabels

In [152]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

trainFile = pd.read_csv('train.csv', sep=',')
preparedTweets, wordsCounter = prepareTweets(trainFile.iloc[:, 2])
featureDictionary = getFeatureDictionary(wordsCounter, 1)
xTrain, yTrain = createBagOfWords(preparedTweets, featureDictionary, trainFile.iloc[:, 1])
labels = list(set(yTrain))
classifier = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=13)
classifier.fit(xTrain, yTrain)
predicted = classifier.predict(xTrain)

print("=================== Results ===================")
print("            Positive    Neutral     Negative   ")
print("F1       ", f1_score(yTrain, predicted, average=None, pos_label=None, labels=labels))
print("Precision", precision_score(yTrain, predicted, average=None, pos_label=None, labels=labels))
print("Recall   ", recall_score(yTrain, predicted, average=None, pos_label=None, labels=labels))

testFile = pd.read_csv('test.csv', sep=',')
preparedTweets, x = prepareTweets(testFile.iloc[:, 1])
xTest, yTest = createBagOfWords(preparedTweets, featureDictionary, pd.DataFrame())
predicted = classifier.predict(xTest)

submission = pd.concat([testFile.iloc[:,  0], pd.DataFrame({'Category': predicted})], axis=1)
submission.to_csv('submission.csv', sep=',', index = False, index_label = False)

            Positive    Neutral     Negative   
F1        [ 0.95686275  0.95060373  0.95659377  1.        ]
Precision [ 0.99846547  1.          0.91709022  1.        ]
Recall    [ 0.91858824  0.90585774  0.99965374  1.        ]
