In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sentiment as sentimentinterface
import classify 
import timeit
import numpy as np

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

In [111]:
print("Reading data")
tarfname = "sentiment.tar.gz"
sentiment = sentimentinterface.read_files(tarfname)
vectorizer = TfidfVectorizer(stop_words=['and','a','the','this','that','an','there','here','those','am','it','me','with'],ngram_range=(1, 2),max_df=1000, min_df=5)
training_features = vectorizer.fit_transform(sentiment.train_data)  
dev_features = vectorizer.transform(sentiment.dev_data)
cls = classify.train_classifier(training_features, sentiment.trainy)
classify.evaluate(training_features, sentiment.trainy, cls, 'train')
classify.evaluate(dev_features, sentiment.devy, cls, 'dev')

Reading data
-- train data
sentiment/train.tsv
4582
-- dev data
sentiment/dev.tsv
458
-- transforming data and labels
  Accuracy on train  is: 0.898515931907464
  Accuracy on dev  is: 0.7794759825327511


In [28]:
cls1 = classify.train_classifier(sentiment.trainX, sentiment.trainy)
classify.evaluate(sentiment.trainX, sentiment.trainy, cls1, 'train')
classify.evaluate(sentiment.devX, sentiment.devy, cls1, 'dev')

  Accuracy on train  is: 0.9821038847664775
  Accuracy on dev  is: 0.777292576419214


In [112]:
def read_unlabeled(tarfname, sentiment,vectorizer):
    """Reads the unlabeled data.

    The returned object contains three fields that represent the unlabeled data.

    data: documents, represented as sequence of words
    fnames: list of filenames, one for each document
    X: bag of word vector for each document, using the sentiment.vectorizer
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    class Data: pass
    unlabeled = Data()
    unlabeled.data = []
    
    unlabeledname = "unlabeled.tsv"
    for member in tar.getmembers():
        if 'unlabeled.tsv' in member.name:
            unlabeledname = member.name
            
    print(unlabeledname)
    tf = tar.extractfile(unlabeledname)
    for line in tf:
        line = line.decode("utf-8")
        text = line.strip()
        unlabeled.data.append(text)        
    unlabeled.X = vectorizer.transform(unlabeled.data)
    print(unlabeled.X.shape)
    tar.close()
    return unlabeled
unlabeled = read_unlabeled(tarfname, sentiment,vectorizer)
#sentimentinterface.write_pred_kaggle_file(unlabeled, cls_quarter, "trial", sentiment)

sentiment/unlabeled.tsv
(91524, 4478)


In [101]:
len(dev_data)

458

In [113]:
cls_quarter=cls

In [114]:
unlabeled_pred=cls_quarter.predict(unlabeled.X)

In [115]:
prob=cls_quarter.predict_proba(unlabeled.X)
p=np.where(prob[:,0] > 0.95)
q=np.where(prob[:,0] < 0.5)
r=np.concatenate((p,q),axis=None)

In [116]:
new_unlab=np.array(unlabeled.data)
new_un_pred=np.array(unlabeled_pred)
train_new=np.array(list(sentiment.train_data)+list(new_unlab[r]))
trainy_new=np.array(list(sentiment.trainy)+list(new_un_pred[r]))
training_features_q = vectorizer.fit_transform(train_new)  
dev_features = vectorizer.transform(sentiment.dev_data)
cls_quarter=classify.train_classifier(training_features_q, trainy_new)
classify.evaluate(dev_features, sentiment.devy, cls_quarter, 'dev')

  Accuracy on dev  is: 0.5611353711790393


In [11]:
coefficients=cls_quarter.coef_[0]
k = 8
top_k =np.argsort(coefficients)[-k:]
top_k_words = []

print('-'*50)
print('Top k=%d' %k)
print('-'*50)

for i in top_k:
    print(vectorizer.get_feature_names()[i])
    top_k_words.append(vectorizer.get_feature_names()[i])
#print(sentiment.count_ve
print('-'*50)
print('Bottom k=%d' %k)
print('-'*50)
#top_k = np.argpartition(coefficients, -k)[-k:]
bottom_k =np.argsort(coefficients)[:k]
bottom_k_words = []
#print(top_k)
for i in bottom_k:
    print(vectorizer.get_feature_names()[i])
    bottom_k_words.append(vectorizer.get_feature_names()[i])

--------------------------------------------------
Top k=8
--------------------------------------------------
friendly
awesome
excellent
delicious
amazing
love
best
great
--------------------------------------------------
Bottom k=8
--------------------------------------------------
not
worst
horrible
rude
terrible
bad
went
star


In [8]:
import multiprocessing
from gensim.models import Word2Vec
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=3,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [15]:
train_data=[]
import string
punc = string.punctuation
for i in sentiment.train_data:
    thestring = i
    s = list(thestring)
    train_data.append(''.join([o for o in s if not o in punc]).lower().split())
dev_data=[]
for i in sentiment.dev_data:
    thestring = i
    s = list(thestring)
    dev_data.append(''.join([o for o in s if not o in punc]).lower().split())
test_data=[]
for i in unlabeled.data:
    thestring = i
    s = list(thestring)
    test_data.append(''.join([o for o in s if not o in punc]).lower().split())
total_data= train_data+dev_data+test_data 
 


In [16]:
w2v_model.build_vocab(total_data, progress_per=10000)
w2v_model.train(train_data+dev_data+test_data, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)

(100020871, 272021500)

In [81]:
w2v_model.wv.most_similar(positive=["good"])

[('great', 0.7881866097450256),
 ('decent', 0.6825694441795349),
 ('but', 0.6501556634902954),
 ('delicious', 0.6254854798316956),
 ('excellent', 0.6209093332290649),
 ('really', 0.6112504005432129),
 ('food', 0.6086443662643433),
 ('tasty', 0.6044114828109741),
 ('nice', 0.5849175453186035),
 ('amazing', 0.582426905632019)]

In [17]:
import pickle
pickle.dump(w2v_model, open('word2vec', 'wb'))

In [None]:
loaded_model = pickle.load(open('word2vec', 'rb'))

In [19]:
train_wv=[]
for i in train_data:
    temp=0;count=0
    for j in i:
        if j in w2v_model.wv.vocab and len(i)!=0:
            temp+=w2v_model[j]
            count+=1
    if len(i)!=0:
        train_wv.append((temp/count))


  


In [24]:
cls_wv = classify.train_classifier(train_wv,list(sentiment.trainy[0:3107])+list(sentiment.trainy[3108:]))
classify.evaluate(train_wv,list(sentiment.trainy[0:3107])+list(sentiment.trainy[3108:]), cls_wv, 'train')
classify.evaluate(dev_wv, sentiment.devy, cls_wv, 'dev')

  Accuracy on train  is: 0.8227461253001528
  Accuracy on dev  is: 0.8013100436681223


In [23]:
dev_wv=[]
for i in dev_data:
    temp=0;count=0
    for j in i:
        if j in w2v_model.wv.vocab:
            temp+=w2v_model[j]
            count+=1
    dev_wv.append((temp/count))
    

  


In [59]:
test_wv=[]
for i in test_data:
    temp=0;count=0
    for j in i:
        if j in w2v_model.wv.vocab and len(i)!=0:
            temp+=w2v_model[j]
            count+=1
    if count!=0:
        test_wv.append((temp/count))


  


In [63]:
def write_pred_kaggle_file(unlabeled, cls, outfname, sentiment):
    """Writes the predictions in Kaggle format.

    Given the unlabeled object, classifier, outputfilename, and the sentiment object,
    this function write sthe predictions of the classifier on the unlabeled data and
    writes it to the outputfilename. The sentiment object is required to ensure
    consistent label names.
    """
    yp = cls.predict(unlabeled)
    labels = sentiment.le.inverse_transform(yp)
    f = open(outfname, 'w')
    f.write("ID,LABEL\n")
    for i in range(len(unlabeled)):
        f.write(str(i+1))
        f.write(",")
        f.write(labels[i])
        f.write("\n")
    f.close()
write_pred_kaggle_file(test_wv, cls_wv, "sup.csv", sentiment)

In [55]:
c=[]
for i in range(len(test_wv)):
    if type(test_wv[i])!=np.ndarray or len(test_wv[i])!=300 or type(test_wv[i])==int:
        c.append(i)

In [118]:
x=list(cls_wv.predict(test_wv))
for i in range(len(c)):
    x.insert(c[i], 0)

In [66]:
import pandas as pd
v=pd.read_csv('sentiment-pred.csv')


In [144]:
y=[]
for i in x:
    if i==1:
        y.append('POSITIVE')
    else:
        y.append('NEGATIVE')

In [145]:
v['LABEL']=y

In [146]:
v.to_csv('word2vec',index=False)