In [1]:
import pandas as pd
import nltk 

## Data pre-processing

### 1.  Delete characters that will not be meaningful to machine learning algorithms

In [2]:
import re 
def preprocessTweet(tweet):
    """Reduce a raw tweet to lower-case ASCII letters and whitespace.

    The steps run in this order:

    1. remove ``@mentions``;
    2. remove ``http://`` / ``https://`` URLs, together with one whitespace
       character that directly follows the URL;
    3. drop every character that is not an ASCII letter or whitespace
       (digits, punctuation, hashtag signs, emoji, ...);
    4. lower-case what is left.

    Whitespace is neither collapsed nor trimmed, so the result may contain
    leading, trailing or repeated spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example ``None`` or the
        NaN that pandas produces for an empty CSV cell) is treated as an
        empty tweet instead of raising inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text; ``''`` when ``tweet`` is not a string.
    """
    if not isinstance(tweet, str):
        return ''

    # Delete mentions:
    text = re.sub(r'@\w+', '', tweet)

    # Delete URLs. Only the schemes "http" and "https" are matched.
    text = re.sub(r'https?://\S+\s?', '', text)

    # Delete digits and symbols such as points, hashtags, commas, etc.
    # The pattern is a raw string so that "\s" reaches the regex engine
    # instead of being parsed as an (invalid) string escape.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Make text lower-case
    return text.lower()

### 2. Lexicon Normalization

In [3]:
# Delete stop words
from nltk.corpus import stopwords

# English stop-word set, filled on the first call to deleteStopWords so that
# the stop-word list is loaded once instead of once per tweet.
_ENGLISH_STOP_WORDS = None


def deleteStopWords(tokenized_text):
    """Return the tokens that are not English stop words.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of a single tweet. The comparison is case-sensitive, exactly
        as the tokens are given.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order. An empty input yields an empty list.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    return [word for word in tokenized_text if word not in _ENGLISH_STOP_WORDS]

In [4]:
# Stemming: reduce each word to its root form by chopping off derivational affixes
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def stemmingText(tokenized_text):
    """Stem every token with NLTK's ``PorterStemmer``.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        One stemmed token per input token, in the same order.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokenized_text]

In [5]:
# Lemmatization: reduce each word to its base form, i.e. a linguistically correct lemma
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatizationText(tokenized_text):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Each token is lemmatized with the part-of-speech argument ``"v"``
    (i.e. treated as a verb).

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, "v") for token in tokenized_text]

### 3. Tweet Tokenization

In [6]:
# Create a list of tweet words
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Turn cleaned tweet text into a list of normalized tokens.

    The text is split with NLTK's ``word_tokenize`` and the tokens are then
    passed, in this order, through ``deleteStopWords``, ``stemmingText`` and
    ``lemmatizationText``.

    Parameters
    ----------
    text : str
        Tweet text, normally the output of ``preprocessTweet``.

    Returns
    -------
    list of str
        The normalized tokens.
    """
    # split the text into word tokens
    tokens = word_tokenize(text)

    # delete stop words
    without_stop_words = deleteStopWords(tokens)

    # stemming
    stems = stemmingText(without_stop_words)

    # lemmatization (applied to the stems)
    return lemmatizationText(stems)

In [7]:
def createTokenizedList(df):
    """Convert a labeled tweet DataFrame into ``(tokens, label)`` pairs.

    Every row's ``'tweet'`` value is cleaned with ``preprocessTweet`` and
    tokenized with ``tokenize``; the row's ``'label'`` value is passed
    through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'tweet'`` and ``'label'``.

    Returns
    -------
    list of tuple
        One ``(list_of_tokens, label)`` tuple per row, in row order.
    """
    return [
        (tokenize(preprocessTweet(row['tweet'])), row['label'])
        for _, row in df.iterrows()
    ]

# The Sentiment Analysis: Text Classification with Naive Bayes Classifier
This section uses the classifiers of the Natural Language Toolkit (NLTK): https://www.nltk.org/api/nltk.classify.html

### Create Classifier

In [8]:
def get_words_in_tweets(tweets):
    """Flatten ``(words, sentiment)`` pairs into a single list of words.

    Parameters
    ----------
    tweets : iterable of (iterable of str, object)
        Pairs of a tweet's tokens and its sentiment label. The label is
        ignored.

    Returns
    -------
    list of str
        All words of all tweets, in order of appearance, duplicates kept.
    """
    return [word for words, _sentiment in tweets for word in words]

def get_word_features(wordlist):
    """Return the distinct words of ``wordlist``.

    Parameters
    ----------
    wordlist : iterable of str
        Words, possibly with repetitions.

    Returns
    -------
    The ``keys()`` of an ``nltk.FreqDist`` built from ``wordlist``: one entry
    per distinct word.
    """
    return nltk.FreqDist(wordlist).keys()

def extract_features(document, vocabulary=None):
    """Build the bag-of-words feature dict for one tokenized tweet.

    Parameters
    ----------
    document : iterable of str
        Tokens of the tweet.
    vocabulary : iterable of str, optional
        Words to create features for. When omitted, the notebook-level
        ``word_features`` is used, so the one-argument call keeps working
        (``word_features`` must have been assigned by then).

    Returns
    -------
    dict
        Maps ``'contains(<word>)'`` to ``True`` or ``False`` for every word
        of the vocabulary, depending on whether it occurs in ``document``.
    """
    if vocabulary is None:
        # Fall back to the global vocabulary built in the training cell.
        vocabulary = word_features

    # A set makes each membership test independent of the tweet length.
    document_words = set(document)
    return {
        'contains(%s)' % word: (word in document_words)
        for word in vocabulary
    }

# The Sentiment Analysis: Process Data

### 1. Load pre-classified data

In [9]:
# Manually classified tweets retrieved from here: https://datahack.analyticsvidhya.com
# Current model:
# 0 - positive
# 1 - negative
# TODO: find a more suitable 'hate' database
# The file path is relative; createTokenizedList reads the 'tweet' and 'label' columns.
df_labeled_tweets = pd.read_csv('labeled_tweets.csv')

### Create tokenized list of tweets

In [10]:
# Only the first 2000 labeled tweets are preprocessed and tokenized.
# Despite the df_ prefix, the result is a plain list of (tokens, label) tuples.
df_token_tweets = createTokenizedList(df_labeled_tweets.head(2000))

### Obtain tweets that should be classified

In [23]:
# TODO: replace this CSV load with functions that crawl Twitter accounts.
# Test tweets retrieved from here: https://datahack.analyticsvidhya.com
# The classification loop reads the 'tweet' column of this frame.
df_tweets = pd.read_csv('test_tweets.csv')

### Train set

In [12]:
# Vocabulary: the distinct tokens of the tokenized training tweets.
# extract_features reads this module-level name, so it must be assigned
# before the feature sets are built or any tweet is classified.
word_features = get_word_features(get_words_in_tweets(df_token_tweets))
# Feature sets for training: extract_features applied to the (tokens, label) pairs via NLTK.
training_set = nltk.classify.apply_features(extract_features, df_token_tweets)
# Train NLTK's Naive Bayes classifier on those feature sets.
classifier = nltk.NaiveBayesClassifier.train(training_set)

### Classify tweet

In [26]:
# Single-tweet example, kept for reference (disabled):
# tweet = "@user @user lumpy says i am a . prove it lumpy."
# tweet = preprocessTweet(tweet)

# label = classifier.classify(extract_features(tweet.split()))
# print(label)

# Classify the first 1000 test tweets, applying the same preprocessing and
# tokenization as for the training data, and print one predicted label per
# tweet (label meanings: 0 - positive, 1 - negative).
for index, row in df_tweets.head(1000).iterrows():
    text = preprocessTweet(row["tweet"])
    tokenized_text = tokenize(text)
    print(classifier.classify(extract_features(tokenized_text)))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
