# 数据预处理

In [27]:
# -*- coding: utf-8 -*-
import pyprind
import pandas as pd
import os
from nltk.corpus import stopwords
import re
import numpy as np

stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    tokenized = text.split()
    return tokenized

train_data_path = 'data/labeledTrainData.tsv'
df = pd.read_csv(train_data_path, sep="\t")
df["review"] = df["review"].apply(lambda x : tokenizer(x))
df.to_csv('data/movie_data.csv')

# 生成词向量

In [28]:
import pyprind
import gensim.models
import re

inpath = 'data/movie_data.csv'
outpath = 'results/wordVectTrainResult'
pbar = pyprind.ProgBar(100000)
class csvStream(object):
    def __init__(self,path):
        self.path=path
    def __iter__(self):
        with open(self.path, 'r',) as csv:
            next(csv)  # skip header
            for line in csv:
                text = line[4:-3]
                text = re.sub('[\'\"\[\]\d\b]','',text)   
                while (text[0] == ',') or (text[0] == ' '):
                    text = text[1:]
                pbar.update()
                yield text.split(', ')


lineIterator = csvStream(inpath)
model = gensim.models.Word2Vec()
model.build_vocab(lineIterator)
print('vocabulary building finished, start training...')
model.train(lineIterator,total_examples=model.corpus_count,epochs=1)
model.save(outpath)

0% [#######                       ] 100% | ETA: 00:00:14

vocabulary building finished, start training...


0% [###############               ] 100% | ETA: 00:00:12

# 词向量测试

In [60]:
import gensim.models
import numpy as np


def cos_sim(vector_a, vector_b):
    """
    计算两个向量之间的余弦相似度
    :param vector_a: 向量 a 
    :param vector_b: 向量 b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim


inpath = 'results/wordVectTrainResult'
model = gensim.models.Word2Vec.load(inpath)
test1 = model["good"]
test2 = model["nice"]
test3 = model["go"]


print(len(test1), len(test2),len(test3))
print(cos_sim(test1,test2))

100 100 100
0.9167262516432011




# 分类测试

In [67]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import linear_model

df1 = pd.read_csv("data/sampleSubmission.csv", sep=",")
df2 = pd.read_csv("data/testData.tsv", sep="\t")
df = pd.merge(df1, df2, on="id", how="inner")

inpath = 'results/wordVectTrainResult'
model = gensim.models.Word2Vec.load(inpath)

def get_text_vector(text):
    res = np.zeros([100])
    for item in text:
        if item not in model:
            continue
        res += model[item]
    return res / len(text)

df["review"] = df["review"].apply(lambda x : tokenizer(x))
df["review"] = df["review"].apply(lambda x : get_text_vector(x))
df["len"] = df["review"].apply(lambda x : len(x))

print(df[:10])

  from ipykernel import kernelapp as app


         id  sentiment                                             review  len
0  12311_10          0  [-0.19330275617890003, 0.10081300452282421, 0....  100
1    8348_2          0  [-0.07429290727502805, 0.12601268771738255, 0....  100
2    5828_4          0  [0.11162658395491294, 0.00733450739030409, 0.3...  100
3    7186_2          0  [-0.0369541522263067, 0.16518554098105856, 0.3...  100
4   12128_7          0  [-0.09854314756706696, 0.14072058940038198, 0....  100
5    2913_8          0  [0.034660419103091776, 0.05697532973075008, 0....  100
6    4396_1          0  [0.17724977271983355, 0.12278373242634547, 0.4...  100
7     395_2          0  [0.1945963272358248, 0.16014676212107132, 0.41...  100
8   10616_1          0  [0.11036306295208022, -0.05364430498820348, 0....  100
9    9074_9          0  [-0.0893094160754835, -0.01371210671004927, 0....  100


In [89]:
from sklearn.neural_network import MLPClassifier

X, y = df[["review"]].values.ravel(), df["sentiment"].values.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
print(X_train.shape)

print(y_train.shape)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,), random_state=1)
clf.fit(X_train, y_train)

clf.predict_proba(X_test[:1])

clf.predict(X_test[:5, :])

clf.score(X_test, y_test)

(18750,)
(18750,)


ValueError: setting an array element with a sequence.

In [88]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,), random_state=1)

X_train = np.array([np.array([ 1.29321224e-01,  5.18960132e-02,  3.79400451e-01, -3.72608754e-01,
       -2.18533632e-01,  2.59320804e-01,  5.39880085e-01,  5.16798938e-01,
       -1.97540062e-01,  9.55656952e-01,  5.49076735e-01,  3.23896985e-01,
       -3.69491834e-01,  1.37926866e-01, -9.34858046e-01,  3.49694259e-01,
        5.02927222e-01,  7.72485569e-02,  2.41516394e-01,  4.35102434e-01,
       -2.33707874e-01, -3.42638469e-01,  5.13357171e-01,  4.08318066e-01,
       -5.63935861e-01,  3.31579943e-01, -1.00997857e-01,  8.32257381e-01,
       -1.03186743e-01, -2.17901886e-01, -3.72759429e-01,  4.76303828e-01,
        1.01170254e+00,  5.34718914e-01, -9.60375156e-02, -4.29679660e-01,
        5.23644536e-01, -4.65647882e-01, -1.49980314e-01, -2.19136089e-02,
        6.33299342e-03,  5.37065999e-01, -4.86698317e-01,  1.02540010e-01,
        1.97397357e-01,  1.53287670e-01, -4.48417012e-01,  6.40119717e-01,
        3.55489010e-01,  6.53285550e-02,  5.25642002e-01, -1.30566096e-03,
        2.51422268e-01,  3.30942424e-01,  1.19990460e-01, -1.04565109e-01,
       -1.92086261e-01, -1.31871328e+00,  2.32135119e-01,  2.22830327e-01,
        4.79117552e-01, -5.23658381e-03,  4.88311444e-01, -1.47033792e-02,
       -2.58355609e-01,  1.07908211e+00,  2.32782062e-01, -3.90683065e-02,
       -3.77066846e-01,  7.92226757e-02, -1.03517982e+00,  4.02089629e-01,
        6.82145484e-02, -2.45045518e-01,  2.72413841e-01, -6.30229306e-01,
       -3.88759992e-01, -8.83834824e-01, -6.02251000e-01, -4.96690610e-01,
       -3.37865016e-01, -3.01193769e-02,  6.50604504e-01, -7.07761998e-02,
        4.53892187e-01,  5.53244393e-01,  3.83158314e-01, -8.42868587e-02,
        6.80161905e-02,  1.01100661e-01, -3.39743594e-01,  2.16301889e-01,
       -5.55453076e-01, -1.46315171e-01,  6.24522818e-01,  5.87731878e-01,
       -1.10606850e-01, -1.29103243e-01, -1.77118074e-01, -3.12824513e-01]),
 np.array([ 0.14950982,  0.03738608,  0.42585563, -0.40345302, -0.23404879,
        0.325457  ,  0.52644093,  0.31441613, -0.08783832,  0.85549325,
        0.59543287,  0.37857117, -0.3135474 ,  0.03890331, -0.80587419,
        0.32922168,  0.64687974,  0.1379085 ,  0.15211546,  0.418793  ,
       -0.14782306, -0.26198192,  0.34435682,  0.51897987, -0.65181933,
        0.35897195, -0.26711759,  0.8401758 , -0.17628074, -0.14719422,
       -0.43270119,  0.56302991,  1.0217013 ,  0.52281307, -0.01557398,
       -0.44794893,  0.64046578, -0.51360712, -0.04466807, -0.08671971,
       -0.06077578,  0.49838649, -0.46617385,  0.02413524,  0.25871847,
        0.1381659 , -0.44741985,  0.64232978,  0.48129942,  0.07564186,
        0.37772529, -0.07793722,  0.28262221,  0.40180358,  0.06860969,
       -0.17198053, -0.1013253 , -1.34171801,  0.24602981,  0.37648783,
        0.51747068, -0.03582477,  0.50254968,  0.12601257, -0.13872963,
        1.0529655 ,  0.21765285, -0.11891374, -0.32816064,  0.03736426,
       -1.01552567,  0.30671988, -0.02039242, -0.1927615 ,  0.27264915,
       -0.43618522, -0.4158235 , -0.8517373 , -0.54522136, -0.58439193,
       -0.31247232, -0.15039204,  0.57947311, -0.12534831,  0.5562728 ,
        0.59012879,  0.33703196, -0.1903191 ,  0.02046448,  0.09091111,
       -0.39141423,  0.1063484 , -0.42469542, -0.12269154,  0.71052247,
        0.65662416, -0.22396823, -0.09404335, -0.12750437, -0.21620624])])
y_train = [1,0]

print(X_train.shape)


clf.fit(X_train, y_train)

(2, 100)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)