In [4]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
from bs4 import BeautifulSoup

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [6]:
# Load the train and test splits. Both files are tab-separated with a header
# row; quoting=3 makes pandas treat quote characters as ordinary text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
train_tsv = pd.read_csv("source/sentiment-analysis-on-movie-reviews/train.tsv", **tsv_options)
test_tsv = pd.read_csv("source/sentiment-analysis-on-movie-reviews/test.tsv", **tsv_options)

In [9]:
def Phrase2PureText(phrase):
    """
    Turn one raw review phrase into a list of lowercase word tokens.

    HTML markup is stripped with BeautifulSoup, every character that is not
    an ASCII letter is replaced by a space, and the result is lowercased and
    split on whitespace.
    """
    # Drop any HTML tags and keep only the visible text.
    visible_text = BeautifulSoup(phrase, "html.parser").get_text()
    # Blank out everything except ASCII letters.
    letters_only = re.sub("[^a-zA-Z]", " ", visible_text)
    # Lowercase, then split on runs of whitespace.
    return letters_only.lower().split()
    
# Preprocessing: clean every phrase and re-join its tokens with single spaces.
label = train_tsv["Sentiment"]
train_data = [' '.join(Phrase2PureText(raw_phrase)) for raw_phrase in tqdm(train_tsv["Phrase"])]
test_data = [' '.join(Phrase2PureText(raw_phrase)) for raw_phrase in tqdm(test_tsv["Phrase"])]


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
100%|██████████| 156060/156060 [00:22<00:00, 6903.07it/s]
100%|██████████| 66292/66292 [00:09<00:00, 7190.83it/s]


In [84]:
import pickle
# Cache the cleaned phrases on disk: the training file stores a
# (phrases, labels) tuple, the test file stores the phrase list only.
with open("dataset.pkl", "wb") as train_file:
    train_payload = (train_data, list(label))
    pickle.dump(train_payload, train_file)
with open("testset.pkl", "wb") as test_file:
    pickle.dump(test_data, test_file)

In [86]:
# Feature processing
# dataset: the corpus (train/validation sentences) the vocabulary is built from
def make_dict(dataset, mode="bag_of_word", n=3, dict_size=1000):
    """
    Build a vocabulary from a corpus of space-joined sentences.

    Parameters
    ----------
    dataset : iterable of str
        Sentences whose tokens are separated by single spaces.
    mode : str
        "bag_of_word" counts individual tokens; any other value counts
        word n-grams.
    n : int
        n-gram length, used only when mode is not "bag_of_word".
    dict_size : int
        Number of most frequent entries kept in the two lookup tables.

    Returns
    -------
    token_vocabs : collections.Counter
        Frequency of every token / n-gram seen (not truncated).
    index2word : dict
        Rank (0 = most frequent) -> token / n-gram.
    word2index : dict
        Token / n-gram -> rank.

    Raises
    ------
    ValueError
        If n-gram mode is requested with n < 1.
    """
    print("take data into counter...")
    token_vocabs = Counter()
    if mode == "bag_of_word":
        for sentence in tqdm(dataset):
            # split(' ') rather than split(): an empty sentence yields [''],
            # so the empty string is counted as a token. Left unchanged so
            # vocabularies built by existing callers do not shift.
            token_vocabs.update(sentence.split(' '))
    else:
        if n < 1:
            raise ValueError("n must be >= 1 for n-gram mode")
        for sentence in tqdm(dataset):
            words = sentence.split(' ')
            # Slide a window of n words over the sentence. The upper bound
            # len(words) - n + 1 makes sure the final n-gram is included;
            # sentences shorter than n contribute nothing.
            token_vocabs.update(
                ' '.join(words[start:start + n])
                for start in range(len(words) - n + 1)
            )

    index2word = {}
    word2index = {}
    print("find all words...")
    # most_common() is ordered by descending frequency, so idx is the rank.
    for idx, (token, _count) in enumerate(tqdm(token_vocabs.most_common(dict_size))):
        index2word[idx] = token
        word2index[token] = idx
    return token_vocabs, index2word, word2index

# Build a single vocabulary over the train and test phrases together.
all_data = [*train_data, *test_data]
print("data len: ", len(all_data))
token_vocabs1, index2word1, word2index1 = make_dict(dataset=all_data, dict_size=10000)


  3%|▎         | 7371/222352 [00:00<00:02, 73707.79it/s]

data len:  222352
take data into counter...


100%|██████████| 222352/222352 [00:01<00:00, 128305.27it/s]
100%|██████████| 10000/10000 [00:00<00:00, 708485.33it/s]

find all words...





In [87]:
# Convert train/test sentences into index-based count vectors
# using the bag-of-words representation
def bag2word(dataset, word2index, dict_size=10000):
    """
    Encode each sentence as a bag-of-words count vector.

    Parameters
    ----------
    dataset : iterable of str
        Sentences whose tokens are separated by single spaces.
    word2index : dict
        Token -> position in the count vector. It is only read, never
        modified.
    dict_size : int
        Length of every count vector; must be larger than every index
        stored in word2index.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array of length dict_size per sentence, where entry i is
        the number of occurrences of the token mapped to index i. Tokens
        missing from word2index are ignored.

    Raises
    ------
    IndexError
        If word2index maps a token to an index outside the vector.
    """
    dataset2idx = []
    for sentence in tqdm(dataset):
        cur_seq = np.zeros([dict_size,])
        for word in sentence.split(" "):
            # .get() rather than .setdefault(): looking up an unknown word
            # must not insert it into the caller's dictionary.
            cur_idx = word2index.get(word)
            if cur_idx is not None:
                cur_seq[cur_idx] += 1
        dataset2idx.append(cur_seq)
    return dataset2idx

# Vectorise both splits with the shared vocabulary.
train_data2 = bag2word(dataset=train_data, word2index=word2index1)
test_data2 = bag2word(dataset=test_data, word2index=word2index1)

100%|██████████| 156060/156060 [00:18<00:00, 8384.47it/s] 
100%|██████████| 66292/66292 [00:07<00:00, 8490.43it/s] 


In [88]:
# Hold out 20% of the labelled phrases for validation; the seed is fixed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data2,
    label,
    test_size=0.2,
    random_state=8,
)

In [89]:
batchsize = 100  # not used by the fit below
# The solver is named explicitly: the run recorded here used liblinear (the
# default at the time), but scikit-learn 0.22+ defaults to lbfgs, which
# rejects penalty='l1'. Naming it keeps this cell working on both.
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [69]:
# Mean accuracy of the fitted classifier on the held-out split.
model.score(X=X_test, y=y_test)

0.5808342945021145