In [0]:
import pandas as pd
import numpy as np
import re

In [0]:
# Read the labelled training file and the unlabelled test file.
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs where this exact Desktop folder exists.
# header=None: the first line is treated as data, so column names are assigned by hand.
data = pd.read_table('C:\\Users\\Administrator\\Desktop\\sentiment.analysis\\train.txt',header=None)
data.columns = ['words','label']

# Same layout for the test file, except the columns are named id / words.
test = pd.read_table('C:\\Users\\Administrator\\Desktop\\sentiment.analysis\\test.txt',header=None)
test.columns = ['id','words']

In [0]:
# Clean raw text: keep letters only, lowercase, split on whitespace.
# (No stopword filtering happens here.)
def textParse(words):
    """Return the lowercase alphabetic tokens of ``words``.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space before
    splitting, so digits and punctuation act as token separators.
    """
    letters_only = re.sub("[^a-zA-Z]"," ",words)
    return letters_only.lower().split()

In [0]:
# Target column for the supervised models.
label = data['label']

# Each review becomes one space-joined string of cleaned tokens.
train_data = [' '.join(textParse(raw_text)) for raw_text in data['words']]
test_data = [' '.join(textParse(raw_text)) for raw_text in test['words']]


In [0]:
# TF-IDF features over unigrams and bigrams
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
    min_df=2,
)

# NOTE(review): the vectorizer is fitted on train and test text together, so
# the vocabulary / IDF weights see the test documents — confirm this is intended.
data_all = train_data + test_data
data_set = tfidf.fit_transform(data_all)

# The first len_train rows belong to the training reviews, the rest to the test reviews.
len_train = len(train_data)
train_x, test_x = data_set[:len_train], data_set[len_train:]

In [0]:
from sklearn.metrics import f1_score,accuracy_score,precision_score
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled TF-IDF rows for evaluation (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    train_x,
    label,
    test_size=0.3,
    random_state=123,
)

In [0]:
# Train and evaluate LinearSVC, LogisticRegression, MultinomialNB, BernoulliNB and RandomForest
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from sklearn.model_selection (the module this notebook
# already uses for train_test_split); sklearn.grid_search no longer exists in
# current scikit-learn releases.
from sklearn.model_selection import GridSearchCV

def multiTest():
    """Fit each classifier, print hold-out metrics and write test predictions.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``y_test``,
    ``test_x`` and ``test`` as they are bound when the function is called.
    For every model name in ``ensemble`` it:

    1. builds the estimator (LinearSVC is wrapped in a 10-fold grid search over C),
    2. fits it on ``x_train`` / ``y_train``,
    3. prints accuracy, precision, recall and F1 on ``x_test`` / ``y_test``,
    4. predicts ``test_x`` and writes ``<position>_output.csv`` with the
       columns ``id`` and ``Predicted``.

    Returns None.
    """
    ensemble=['SVC','LR','MultinomialNB', 'BernoulliNB','RandomForestClassifier']
    for position, a in enumerate(ensemble):
        print(a + ':')
        if a == 'SVC':
            param = {'C': [1e15,1e13,1e11,1e9,1e7,1e5,1e3,1e1,1e-1,1e-3,1e-5]}
            clf = GridSearchCV(LinearSVC(), param, cv=10)
        elif a == 'LR':
            clf = LogisticRegression()
        elif a == 'MultinomialNB':
            # Assignment, not comparison: `clf == MultinomialNB()` left the
            # previous LogisticRegression in place and re-trained it.
            clf = MultinomialNB()
        elif a == 'BernoulliNB':
            clf = BernoulliNB()
        elif a == 'RandomForestClassifier':
            clf = RandomForestClassifier(n_estimators=160,max_depth=40,min_samples_split=3,max_features=18)

        clf.fit(x_train,y_train)
        y_pre = clf.predict(x_test)

        print('accuracy_score：',accuracy_score(y_test,y_pre))
        print('precision_score：',precision_score(y_test,y_pre))
        print('recall_score：',recall_score(y_test,y_pre))
        print('F1：',f1_score(y_test,y_pre))

        # One submission file per model, numbered by its position in `ensemble`.
        test_predicted = np.array(clf.predict(test_x))
        output = pd.DataFrame(data=test_predicted, columns=['Predicted'])
        output['id'] = test['id']
        output = output[['id', 'Predicted']]
        output.to_csv('C:\\Users\\Administrator\\Desktop\\sentiment.analysis\\predict_data\\tfidf\\%i_output.csv'%(position), index=False)

multiTest()


In [0]:


import gensim
import nltk
from nltk.corpus import stopwords

# Load the pickled English Punkt sentence tokenizer from the local NLTK data
# directory; it is used below to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_wordlist(review, remove_stopwords=False ):
    """Split ``review`` into lowercase alphabetic tokens.

    Non-letter characters are replaced by spaces before splitting. When
    ``remove_stopwords`` is true, tokens found in NLTK's English stopword
    list are dropped.
    """
    tokens = re.sub("[^a-zA-Z]"," ", review).lower().split()
    if not remove_stopwords:
        return tokens

    stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in stops]

def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split ``review`` into sentences and tokenize each one.

    ``tokenizer.tokenize`` is applied to the stripped review; every resulting
    sentence longer than one character is converted to a word list with
    ``review_to_wordlist``. Returns a list of word lists.
    """
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 1
    ]



In [0]:
# Flatten all training reviews into one list of tokenized sentences for Word2Vec.
sentences = []
for review in data["words"]:
    sentences.extend(review_to_sentences(review, tokenizer))

In [0]:
# Word2Vec hyper-parameters (consumed by the Word2Vec(...) call in the next cell)

import time
from gensim.models import Word2Vec
num_features = 300    # Word vector dimensionality (also the length of each averaged review vector)
min_word_count = 40   # Minimum word count, passed as min_count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Passed as `sample`; presumably the frequent-word downsampling threshold — confirm against gensim docs

In [0]:
# Train word embeddings on the tokenized training sentences.
# NOTE(review): `size=` looks like the gensim 3.x keyword name — confirm the installed version.
model = Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

In [0]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str, or str
        Tokens of the document. A bare string is split on whitespace first;
        iterating it directly would yield single characters, not words.
    model : object with a ``wv`` attribute
        Trained embedding model. ``model.wv`` must support ``token in wv``
        and ``wv[token]``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features``: the mean of the matched word
        vectors, or all zeros when no token is in the vocabulary (instead of
        the NaN vector a 0/0 division would produce).
    """
    if isinstance(words, str):
        words = words.split()

    # Membership test and lookup both go through the keyed vectors; this
    # avoids rebuilding a vocabulary set on every call and the deprecated
    # `model[word]` access.
    vectors = model.wv

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in vectors:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, vectors[word])

    if nwords == 0:
        # Nothing matched: return the zero vector rather than dividing by zero.
        return featureVec
    return np.divide(featureVec, nwords)


def getAvgFeatureVecs(reviews, model, num_features):
    """Build a (len(reviews), num_features) float32 matrix of averaged word vectors.

    Row ``k`` is ``makeFeatureVec(reviews[k], model, num_features)``. A
    progress line is printed every 1000 reviews, starting with review 0.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for row, review in enumerate(reviews):
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

In [0]:
# train_data / test_data hold one space-joined string per review. Split each
# back into tokens so the averaging iterates over words; passing the raw
# strings makes it iterate over single characters instead.
trainDataVecs = getAvgFeatureVecs([doc.split() for doc in train_data], model, num_features)
testDataVecs = getAvgFeatureVecs([doc.split() for doc in test_data], model, num_features)

Review 0 of 8530


  del sys.path[0]


Review 1000 of 8530
Review 2000 of 8530
Review 3000 of 8530
Review 4000 of 8530
Review 5000 of 8530
Review 6000 of 8530
Review 7000 of 8530
Review 8000 of 8530


  app.launch_new_instance()


Review 0 of 600


In [0]:
# Replace NaN entries (e.g. rows whose average was undefined) with the mean of
# the finite training values. The same training-derived value is applied to the
# test matrix so that no NaN reaches clf.predict(testDataVecs).
nan_fill_value = np.mean(trainDataVecs[~np.isnan(trainDataVecs)])
trainDataVecs[np.isnan(trainDataVecs)] = nan_fill_value
testDataVecs[np.isnan(testDataVecs)] = nan_fill_value

In [0]:
from sklearn.metrics import f1_score,accuracy_score,precision_score
from sklearn.model_selection import train_test_split
# Rebind the train / hold-out split to the Word2Vec features (same 70/30 split and seed).
x_train, x_test, y_train, y_test = train_test_split(
    trainDataVecs,
    label,
    test_size=0.3,
    random_state=123,
)

In [0]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

def multiTest1():
    """Fit each classifier on the Word2Vec features, print metrics and write predictions.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``y_test``,
    ``testDataVecs`` and ``test`` as they are bound when the function is
    called. For every model name in ``ensemble`` it fits the estimator, prints
    accuracy, precision, recall and F1 on the hold-out split, then predicts
    ``testDataVecs`` and writes ``<position>_output.csv`` with the columns
    ``id`` and ``Predicted``.

    Returns None.
    """
    ensemble=['SVC','LR','MultinomialNB', 'BernoulliNB','RandomForestClassifier']
    for position, a in enumerate(ensemble):
        print(a + ':')
        if a == 'SVC':
            clf = LinearSVC()
        elif a == 'LR':
            clf = LogisticRegression()
        elif a == 'MultinomialNB':
            # Assignment, not comparison: `clf == MultinomialNB()` left the
            # previous LogisticRegression in place and re-trained it.
            # Averaged embeddings contain negative values, which MultinomialNB
            # rejects when fitting, so features are rescaled to [0, 1] first.
            clf = make_pipeline(MinMaxScaler(), MultinomialNB())
        elif a == 'BernoulliNB':
            clf = BernoulliNB()
        elif a == 'RandomForestClassifier':
            clf = RandomForestClassifier(n_estimators=160,max_depth=40,min_samples_split=3,max_features=18)

        clf.fit(x_train,y_train)
        y_pre = clf.predict(x_test)

        print('accuracy_score：',accuracy_score(y_test,y_pre))
        print('precision_score：',precision_score(y_test,y_pre))
        print('recall_score：',recall_score(y_test,y_pre))
        print('F1：',f1_score(y_test,y_pre))

        # One submission file per model, numbered by its position in `ensemble`.
        test_predicted = np.array(clf.predict(testDataVecs))
        output = pd.DataFrame(data=test_predicted, columns=['Predicted'])
        output['id'] = test['id']
        output = output[['id', 'Predicted']]
        output.to_csv('C:\\Users\\Administrator\\Desktop\\sentiment.analysis\\predict_data\\word2vec\\%i_output.csv'%(position), index=False)


multiTest1()
