# NLP Sentiment Analysis Test

In [1]:
import nltk

In [2]:
# Tokenisation
from nltk.tokenize import word_tokenize,sent_tokenize

#tokenized = word_tokenize(text)

# Stop Words
import string
from nltk.corpus import stopwords
def stop_words(tokenized):
    punctuation = list(string.punctuation) #['.',',',':',';','?','!']
    stop_words = list(stopwords.words("english"))
    #stop_words = ['a','as','at','they','the','his','her','so','and','were','from',
    #              'that','of','in','only','with','to']
    included_punctuation = [word for word in tokenized 
                            if word.lower() in punctuation]
    included_stop_words = [word for word in tokenized 
                           if word.lower() in stop_words]
    everything_else = [word for word in tokenized 
                       if word.lower() not in set(stop_words + punctuation)]
    return everything_else,included_stop_words,included_punctuation

# Stemming and Lemming
from nltk.stem import PorterStemmer, LancasterStemmer
def stemlem(tokenized):
    stemmer = PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    return [ [w,stemmer.stem(w),lemmatizer.lemmatize(w)] 
    for w in list(set(tokenized))]
    
# POS Tagging
def pos_tagging(tokenized):
    tagged =  nltk.pos_tag(list(set(tokenized)))
    tagged = [(t[0],t[1]) for t in tagged]
    return tagged

# Bi/Trigrams
from nltk import bigrams, trigrams
def grams(tokenized):
    return list(bigrams(tokenized)), list(trigrams(stop_words(tokenized)[0]))
    
# Frequency related statistics
from collections import Counter
import math
def frequency_statistics(text):
    words = word_tokenize(text)
    # Organisation
    words = list(set(stop_words(words)[0])) +\
    list(set(stop_words(words)[1])) +\
    list(set(stop_words(words)[2]))
    sents = sent_tokenize(text) 
    #filtered = stop_words(words)
    tf= {word:Counter(word_tokenize(text))[word] for word in words} #Term Frequency
    df = {}
    for word in words:
        df[word] = 0
        for sent in sents:
            if word in sent: df[word]+=1
    idf = {word: math.log(len(sent)/df,10) for (word, df) in  df.items()}
    tfidf = {word: tf[word]*idf[word] for word in words}
    #Processing
    stats = []
    for word in words:
        stats += [[word,tf[word],df[word],idf[word],tfidf[word]]]
    return stats

In [40]:
from nltk.corpus import movie_reviews
lemmatizer = nltk.WordNetLemmatizer()
'''
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
'''
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

import random
random.shuffle(documents)

In [41]:
all_words = []
for w in movie_reviews.words():
  all_words.append(w.lower())

Import Data

In [179]:
import pandas as pd
df = pd.read_csv("NewsSentiments.csv")

from nltk.corpus import stopwords
stop_words = list(stopwords.words("english"))
import re
def lower(words):
    #print(words)
    return [word.lower() for word in words 
            if #not bool(re.match(r"0-9",word)) or 
            word not in stop_words ]
from nltk import word_tokenize
documents = [(lower(word_tokenize(df.iloc[i]["Text"])),df.iloc[i]["Sentiments"])
            for i in df.index.values]
import random
random.shuffle(documents)

In [180]:
import re
bool(re.match("\d","500,000"))
documents

[(['foote',
   'mineral',
   '&',
   'lt',
   ';',
   'fte',
   '>',
   'sells',
   'cambridge',
   'plant',
   'foote',
   'mineral',
   'co',
   'said',
   'signed',
   'letter',
   'intent',
   'shieldalloy',
   'corp',
   ',',
   'wholly-owned',
   'subsidiary',
   '&',
   'lt',
   ';',
   'metallurg',
   'inc',
   '>',
   ',',
   'sale',
   'cambridge',
   ',',
   'ohio',
   ',',
   'business',
   '.',
   'the',
   'company',
   'said',
   'sale',
   ',',
   'explained',
   'greater',
   'detail',
   'definitive',
   'agreement',
   'signed',
   ',',
   'part',
   'foote',
   "'s",
   'previously-announced',
   'plan',
   'sell',
   'entire',
   'company',
   '.'],
  1),
 (['cantrex',
   'unit',
   'to',
   'merge',
   'with',
   'ontario',
   'group',
   '(',
   'groupe',
   'cantrex',
   'inc',
   ')',
   'said',
   'plans',
   'merge',
   'new',
   'wholly-owned',
   'subsidiary',
   'merger',
   'agreement',
   '(',
   'cap',
   'appliance',
   'purchasers',
   'inc',
   ')',


In [181]:
all_words = []
for i in documents:
    all_words+=i[0]

In [182]:
import nltk
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

print(all_words)
print(word_features)

def feature_extractor(review):
  words = set(review)
  features = {}
  for w in word_features:
    features[w] = (w in words)
  return features

featureset = [(feature_extractor(review),sentiment) 
              for (review, sentiment) in documents]

<FreqDist with 5141 samples and 25984 outcomes>
['foote', 'mineral', '&', 'lt', ';', 'fte', '>', 'sells', 'cambridge', 'plant', 'co', 'said', 'signed', 'letter', 'intent', 'shieldalloy', 'corp', ',', 'wholly-owned', 'subsidiary', 'metallurg', 'inc', 'sale', 'ohio', 'business', '.', 'the', 'company', 'explained', 'greater', 'detail', 'definitive', 'agreement', 'part', "'s", 'previously-announced', 'plan', 'sell', 'entire', 'cantrex', 'unit', 'to', 'merge', 'with', 'ontario', 'group', '(', 'groupe', ')', 'plans', 'new', 'merger', 'cap', 'appliance', 'purchasers', 'woodstock', '400', 'electronics', 'retailers', 'it', 'shareholders', 'receive', '140,700', 'first', 'preferred', 'shares', 'entitling', 'holders', '6.05', 'dlrs', 'per', 'share', 'equivilant', 'class', 'a', 'subordinate', 'voting', 'effective', 'april', 'one', 'subject', 'shareholder', 'approval', 'handy', 'and', 'harman', 'hnh', '4th', 'qtr', 'loss', 'shr', '51', 'cts', 'vs', 'three', 'net', '7,041,000', '467,000', 'rev', '138

In [183]:
training_set, testing_set = featureset[:1900],featureset[1900:]
training_set, testing_set = featureset[:200],featureset[200:]


In [184]:
classifier_challenge = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy:", nltk.classify.accuracy(classifier_challenge, testing_set))
classifier_challenge.show_most_informative_features(15)

Classifier accuracy: 0.64
Most Informative Features
                   crude = True               -1 : 0      =     15.9 : 1.0
                     net = True                0 : -1     =     12.8 : 1.0
                 billion = True                1 : -1     =     11.6 : 1.0
                     shr = True                0 : -1     =     11.5 : 1.0
                    revs = True                0 : -1     =      9.5 : 1.0
                     4th = True                0 : -1     =      8.9 : 1.0
                     qtr = True                0 : -1     =      8.9 : 1.0
                    rose = True                1 : 0      =      7.5 : 1.0
                    week = True               -1 : 1      =      7.4 : 1.0
                december = True                1 : -1     =      7.2 : 1.0
                    base = True               -1 : 0      =      7.1 : 1.0
                     div = True                0 : -1     =      6.9 : 1.0
                   prior = True                0

https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

In [170]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
#lower(word_tokenize(df.iloc[i]["Text"]))
text_counts= cv.fit_transform(df['Text'])

In [171]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['Sentiments'], test_size=0.3, random_state=1)

In [172]:
from sklearn.naive_bayes import MultinomialNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.6


In [188]:
X_test

<90x4166 sparse matrix of type '<class 'numpy.int64'>'
	with 4068 stored elements in Compressed Sparse Row format>

In [186]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
#lower(word_tokenize(df.iloc[i]["Text"]))
text= cv.fit_transform(["Trump has once against caused a major dispute among countries"])
text#clf.predict(text)

<1x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

https://towardsdatascience.com/natural-language-processing-count-vectorization-with-scikit-learn-e7804269bb5e

In [189]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Create our vectorizer
vectorizer = CountVectorizer()

# All data
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'))

# Get the training vectors
vectors = vectorizer.fit_transform(newsgroups_train.data)

# Build the classifier
clf = MultinomialNB(alpha=.01)

#  Train the classifier
clf.fit(vectors, newsgroups_train.target)

# Get the test vectors
vectors_test = vectorizer.transform(newsgroups_test.data)

# Predict and score the vectors
pred = clf.predict(vectors_test)
acc_score = metrics.accuracy_score(newsgroups_test.target, pred)
f1_score = metrics.f1_score(newsgroups_test.target, pred, average='macro')

print('Total accuracy classification score: {}'.format(acc_score))
print('Total F1 classification score: {}'.format(f1_score))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Total accuracy classification score: 0.6460435475305364
Total F1 classification score: 0.6203806145034193
