### Naive Bayesian Classifier (Sadie's Version)
Sadie Crawford and Jonah Kornberg
#### References:
- https://www.datacamp.com/community/tutorials/simplifying-sentiment-analysis-python
- Specific ones are scattered throughout the code next to revelant locations

#### Importing Training and Testing Data

In [1]:
import pandas as pd

train = pd.read_csv('data/train.csv')
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [2]:
test = pd.read_csv('data/test.csv')
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


#### Tokenizing Tweets, Removing Stopwords, Lowercase Stemming

In [3]:
# https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk
# NoneType Errors Fix -> https://stackoverflow.com/questions/1207406/how-to-remove-items-from-a-list-while-iterating
# https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
# http://www.nltk.org/api/nltk.tag.html
# http://www.nltk.org/api/nltk.stem.html
# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
import string

def convert_pos(pos):
    if pos[0] == 'C':
        return wordnet.ADJ
    elif pos[0] == 'J':
        return wordnet.ADJ
    elif pos[0] == 'A':
        return wordnet.ADJ
    elif pos[0] == 'V':
        return wordnet.VERB
    elif pos[0] == 'R':
        return wordnet.ADV
    else:
        return wordnet.NOUN

stop_words = stopwords.words('english')
for punct in string.punctuation:
    stop_words.append(punct)
    
tokenized = []
for i in range(0, len(train['text'])):
    words = word_tokenize(train['text'][i])
    words = nltk.pos_tag(words)
    #important_words = [PorterStemmer().stem(word) for word in words if word not in stop_words]
    important_words = [lemmatizer.lemmatize(word.lower(), convert_pos(pos))
                       for (word, pos) in words
                       if word not in stop_words]
    tokenized.append(important_words)
train['text'] = tokenized

tokenized = []
for i in range(0, len(test['text'])):
    words = word_tokenize(test['text'][i])
    words = nltk.pos_tag(words)
    #important_words = [PorterStemmer().stem(word) for word in words if word not in stop_words]
    important_words = [lemmatizer.lemmatize(word.lower(), convert_pos(pos))
                       for (word, pos) in words
                       if word not in stop_words]
    tokenized.append(important_words)
test['text'] = tokenized

In [4]:
train['text']

0       [our, deed, reason, #, earthquake, may, allah,...
1        [forest, fire, near, la, ronge, sask, ., canada]
2       [all, resident, ask, 'shelter, place, ', notif...
3       [13,000, people, receive, #, wildfire, evacuat...
4       [just, get, send, photo, ruby, #, alaska, smok...
                              ...                        
7608    [two, giant, crane, hold, bridge, collapse, ne...
7609    [@, aria_ahrary, @, thetawniest, the, control,...
7610    [m1.94, [, 01:04, utc, ], ?, 5km, s, volcano, ...
7611    [police, investigating, e-bike, collided, car,...
7612    [the, late, :, more, home, raze, northern, cal...
Name: text, Length: 7613, dtype: object

#### Finding Important Words

In [5]:
# https://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis
# https://kite.com/python/docs/nltk.FreqDist
import nltk
list_of_words = []
for words in train['text']:
    for word in words:
        list_of_words.append(word)
common_words = nltk.FreqDist(word.lower() for word in list_of_words)

list_of_words = []
for words in test['text']:
    for word in words:
        list_of_words.append(word)
common_words = nltk.FreqDist(word.lower() for word in list_of_words)

In [6]:
common_words = common_words.most_common(1500)
popular_words = []
for word in common_words:
    popular_words.append(word[0])
popular_words[0:10]

[':', 'http', '#', '?', '.', '@', 'i', '!', '...', "'s"]

#### Formatting Classifier Input

In [7]:
# defining function to be used below
def word_check(tweet):
    tweet = set(tweet)
    word_check_dict = {} # creating a dictionary of whether tweet has a word or not
    for word in popular_words:
        if word in tweet:
            word_check_dict[word] = 1
        else:
            word_check_dict[word] = 0
    return word_check_dict

In [8]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.itertuples.html#pandas.DataFrame.itertuples
training_tuples = []
for row in train.itertuples():
    training_tuples.append((word_check(row[4]), row[5]))
print(training_tuples[0])

testing_tuples = []
for row in test.itertuples():
    testing_tuples.append(word_check(row[4]))
print(testing_tuples[0])





#### Training Classifier

In [9]:
classifier = nltk.NaiveBayesClassifier.train(training_tuples)

In [10]:
classifier.show_most_informative_features(10)

Most Informative Features
               hiroshima = 1                   1 : 0      =     74.8 : 1.0
                  bomber = 1                   1 : 0      =     51.8 : 1.0
                malaysia = 1                   1 : 0      =     42.0 : 1.0
                wildfire = 1                   1 : 0      =     41.1 : 1.0
                   spill = 1                   1 : 0      =     37.6 : 1.0
                wreckage = 1                   1 : 0      =     35.8 : 1.0
                outbreak = 1                   1 : 0      =     35.8 : 1.0
                   saudi = 1                   1 : 0      =     31.4 : 1.0
                  atomic = 1                   1 : 0      =     29.5 : 1.0


#### Classifying and Saving Test Tuples

In [11]:
# https://docs.python.org/3/library/csv.html
# https://stackoverflow.com/questions/3348460/csv-file-written-with-python-has-blank-lines-between-each-row
import csv

with open('data/submission_test.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(['id', 'target'])
    for i in range(0, len(test['id'])):
        writer.writerow([test['id'][i], classifier.classify(testing_tuples[i])])

#### Out of Curiosity, Looking at Accuracy on Reclassifying Training Tuples

In [12]:
nltk.classify.accuracy(classifier, training_tuples)

0.8112439248653619