In [1]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re    
import string
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import pandas as pd
import warnings
warnings.simplefilter("ignore")
from nltk.probability import FreqDist
import collections

In [2]:
def clean(s):
    s=re.sub('\d+', '',s)
    s=re.sub(r'http\S+', '',s)
    s=re.sub('\'s',' is',s)
    s=re.sub('\W',' ',s)
    s=s.lstrip().rstrip()
    s=s.lower()
    return s

In [3]:
def tok(s):
    return nltk.word_tokenize(s)

In [4]:
def stemandlem(s):
    stemmed=[lemmatizer.lemmatize(stemmer.stem(e)) for e in s]
    return stemmed

In [5]:
def remove_stopwords(s):
    clean=[e for e in s if e not in STOP_WORDS]
    return clean        

In [6]:
def pipeline(s):
    a=clean(s)
    b=tok(a)
    c=stemandlem(b)
    d=remove_stopwords(c)
    return d

In [7]:
data=pd.read_csv('Sentiment140.csv')
data.shape

(1600000, 6)

In [8]:
dat=data.iloc[0:20000,]

In [9]:
dat.shape

(20000, 6)

In [10]:
dat.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [11]:
dat['text_processed']=dat.text.apply(pipeline)

In [12]:
dat.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, awww, bummer, shoulda, got, david..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, t, updat, facebook, text, cri, result,..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, m, mad, whi, becaus, t]"


In [13]:
dat.shape

(20000, 7)

In [15]:
fdist=FreqDist()

In [16]:
processcomb=[f for e in dat.text_processed for f in e]
for e in processcomb:
    fdist[e]+= 1

In [17]:
bag=dict(fdist.most_common(5000))

In [18]:
len(bag)

5000

In [19]:
wfeat=list(bag.keys())

In [20]:
def find_feat(doc):
    words=set(doc)
    features={}
    for e in wfeat:
        features[e]=(e in words)
    return features    

In [21]:
featuresets=[(find_feat(e),bool(e)) for e in dat.text_processed]
len(featuresets)

20000

In [22]:
training_set = featuresets[:1000]
testing_set = featuresets[1000:]

In [23]:
classifier=nltk.NaiveBayesClassifier.train(training_set)

KeyboardInterrupt: 

In [None]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)