In [51]:
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import pandas as pd

from collections import Counter

import re

import string

In [2]:
# Location of the tab-separated training tweets, relative to the notebook.
location = './twitter_data/train2017.tsv'

df = pd.read_csv(
    location,
    sep="\t",      # fields are separated by tabs
    header=None,   # do not treat the first line as column names;
                   # pandas labels the columns 0, 1, 2, ... instead
)

In [54]:
# Clean and tokenize a sample of tweets, collecting the surviving tokens
# per tweet (`tokens`), overall (`fusedTokens`) and per sentiment label
# (`positives`, `negatives`, `neutrals`).

# Develop against a small sample; raise the row count to process more tweets.
ndf = df.head(10)

dl = ndf.values.tolist()
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# The stop-word set and the punctuation table are the same for every tweet,
# so they are built once here instead of on every loop iteration.
newStopWords = ["i'm", "he's", "she's", "we're", "you're", "they're"]
stop_words = set(stopwords.words('english'))
stop_words.update(newStopWords)
table = str.maketrans('', '', string.punctuation)

tokens = []       # cleaned tokens, one list per tweet
fusedTokens = []  # cleaned tokens of all tweets in one flat list
positives = []
negatives = []
neutrals = []

# Column 2 of each row is matched against these sentiment labels; rows with
# any other value only contribute to `tokens` and `fusedTokens`.
tokens_by_sentiment = {
    "positive": positives,
    "negative": negatives,
    "neutral": neutrals,
}

for item in dl:
    tweet = item[3]  # column 3 is the text that gets cleaned and tokenized

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks; match only the URL itself (up to the next
    # whitespace) so that the words following a link are kept
    tweet = re.sub(r'https?://\S+', '', tweet)

    # keep hashtag words, dropping only the leading '#' sign
    tweet = re.sub(r'#', '', tweet)

    # the tokenizer is configured with preserve_case=False, so no separate
    # lower-casing step is needed before the stop-word comparison
    temp = tknzr.tokenize(tweet)

    print(temp)  # raw tokens, before any filtering

    # remove stop words
    temp = [w for w in temp if w not in stop_words]

    # strip punctuation characters from every token (e.g. "can't" -> "cant")
    temp = [w.translate(table) for w in temp]

    # drop what is left that is not purely alphabetic (numbers, empty strings)
    temp = [word for word in temp if word.isalpha()]

    # Stemming with PorterStemmer was tried here and judged not that useful.

    # TODO: give the words positive and negative weight so that the most
    # common word in positive posts is not something like "tomorrow".
    bucket = tokens_by_sentiment.get(item[2])
    if bucket is not None:
        bucket.extend(temp)

    fusedTokens.extend(temp)
    tokens.append(temp)

print("\n")
print(tokens)
print("\n")
print(fusedTokens)

['gas', 'by', 'my', 'house', 'hit', '.', '39', '!', '!', '!', "i'm", 'going', 'to', 'chapel', 'hill', 'on', 'sat', '.', ':)']
['theo', 'walcott', 'is', 'still', 'shit', ',', 'watch', 'rafa', 'and', 'johnny', 'deal', 'with', 'him', 'on', 'saturday', '.']
['its', 'not', 'that', "i'm", 'a', 'gsp', 'fan', ',', 'i', 'just', 'hate', 'nick', 'diaz', '.', "can't", 'wait', 'for', 'february', '.']
['iranian', 'general', 'says', "israel's", 'iron', 'dome', "can't", 'deal', 'with', 'their', 'missiles', '(', 'keep', 'talking', 'like', 'that', 'and', 'we', 'may', 'end', 'up', 'finding', 'out', ')']
['tehran', ',', 'mon', 'amour', ':', 'obama', 'tried', 'to', 'establish', 'ties', 'with', 'the', 'mullahs']
['i', 'sat', 'through', 'this', 'whole', 'movie', 'just', 'for', 'harry', 'and', 'ron', 'at', 'christmas', '.', 'ohlawd']
['with', 'j', 'davlar', '11th', '.', 'main', 'rivals', 'are', 'team', 'poland', '.', 'hopefully', 'we', 'an', 'make', 'it', 'a', 'successful', 'end', 'to', 'a', 'tough', 'week', 

In [56]:
# Show the ten most frequent tokens overall and for each sentiment label.
# Each entry pairs the heading to print with the token list to count.
for heading, word_list in (
    ("Generally most common words : ", fusedTokens),
    ("Most common words found in positive posts : ", positives),
    ("Most common words found in negative posts : ", negatives),
    ("Most common words found in neutral posts : ", neutrals),
):
    count = Counter(word_list)
    print(heading, count.most_common(10))

Generally most common words :  [('sat', 2), ('deal', 2), ('cant', 2), ('february', 2), ('talking', 2), ('may', 2), ('end', 2), ('college', 2), ('superbowl', 2), ('dallas', 2)]
Most common words found in positive posts :  [('gas', 1), ('house', 1), ('hit', 1), ('going', 1), ('chapel', 1), ('hill', 1), ('sat', 1), ('j', 1), ('davlar', 1), ('main', 1)]
Most common words found in negative posts :  [('deal', 2), ('cant', 2), ('talking', 2), ('may', 2), ('college', 2), ('superbowl', 2), ('dallas', 2), ('theo', 1), ('walcott', 1), ('still', 1)]
Most common words found in neutral posts :  [('tehran', 1), ('mon', 1), ('amour', 1), ('obama', 1), ('tried', 1), ('establish', 1), ('ties', 1), ('mullahs', 1), ('sat', 1), ('whole', 1)]
