# 数据预处理

In [27]:
# -*- coding: utf-8 -*-
import pyprind
import pandas as pd
import os
from nltk.corpus import stopwords
import re
import numpy as np

stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    tokenized = text.split()
    return tokenized

train_data_path = 'data/labeledTrainData.tsv'
df = pd.read_csv(train_data_path, sep="\t")
df["review"] = df["review"].apply(lambda x : tokenizer(x))
df.to_csv('data/movie_data.csv')

# 生成词向量

In [28]:
import pyprind
import gensim.models
import re

inpath = 'data/movie_data.csv'
outpath = 'results/wordVectTrainResult'
pbar = pyprind.ProgBar(100000)
class csvStream(object):
    def __init__(self,path):
        self.path=path
    def __iter__(self):
        with open(self.path, 'r',) as csv:
            next(csv)  # skip header
            for line in csv:
                text = line[4:-3]
                text = re.sub('[\'\"\[\]\d\b]','',text)   
                while (text[0] == ',') or (text[0] == ' '):
                    text = text[1:]
                pbar.update()
                yield text.split(', ')


lineIterator = csvStream(inpath)
model = gensim.models.Word2Vec()
model.build_vocab(lineIterator)
print('vocabulary building finished, start training...')
model.train(lineIterator,total_examples=model.corpus_count,epochs=1)
model.save(outpath)

0% [#######                       ] 100% | ETA: 00:00:14

vocabulary building finished, start training...


0% [###############               ] 100% | ETA: 00:00:12

# 词向量测试

In [60]:
import gensim.models
import numpy as np


def cos_sim(vector_a, vector_b):
    """
    计算两个向量之间的余弦相似度
    :param vector_a: 向量 a 
    :param vector_b: 向量 b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim


inpath = 'results/wordVectTrainResult'
model = gensim.models.Word2Vec.load(inpath)
test1 = model["good"]
test2 = model["nice"]
test3 = model["go"]


print(len(test1), len(test2),len(test3))
print(cos_sim(test1,test2))

100 100 100
0.9167262516432011




# 分类测试

In [67]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import linear_model

df1 = pd.read_csv("data/sampleSubmission.csv", sep=",")
df2 = pd.read_csv("data/testData.tsv", sep="\t")
df = pd.merge(df1, df2, on="id", how="inner")

inpath = 'results/wordVectTrainResult'
model = gensim.models.Word2Vec.load(inpath)

def get_text_vector(text):
    res = np.zeros([100])
    for item in text:
        if item not in model:
            continue
        res += model[item]
    return res / len(text)

df["review"] = df["review"].apply(lambda x : tokenizer(x))
df["review"] = df["review"].apply(lambda x : get_text_vector(x))
df["len"] = df["review"].apply(lambda x : len(x))

print(df[:10])

  from ipykernel import kernelapp as app


         id  sentiment                                             review  len
0  12311_10          0  [-0.19330275617890003, 0.10081300452282421, 0....  100
1    8348_2          0  [-0.07429290727502805, 0.12601268771738255, 0....  100
2    5828_4          0  [0.11162658395491294, 0.00733450739030409, 0.3...  100
3    7186_2          0  [-0.0369541522263067, 0.16518554098105856, 0.3...  100
4   12128_7          0  [-0.09854314756706696, 0.14072058940038198, 0....  100
5    2913_8          0  [0.034660419103091776, 0.05697532973075008, 0....  100
6    4396_1          0  [0.17724977271983355, 0.12278373242634547, 0.4...  100
7     395_2          0  [0.1945963272358248, 0.16014676212107132, 0.41...  100
8   10616_1          0  [0.11036306295208022, -0.05364430498820348, 0....  100
9    9074_9          0  [-0.0893094160754835, -0.01371210671004927, 0....  100


In [116]:
from sklearn.neural_network import MLPClassifier

X = []
for item in df[["review"]].values.ravel():
    X.append(item)

y = df["sentiment"].values.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,), random_state=1)
clf.fit(X_train, y_train)

clf.predict_proba(X_test[:1])

clf.score(X_test, y_test)

1.0

In [118]:
a=df[["review"]].values.ravel()
a.shape

(25000,)

In [119]:
a.reshape(25000,100)

ValueError: cannot reshape array of size 25000 into shape (25000,100)