[View in Colaboratory](https://colab.research.google.com/github/JozeeLin/competition-project/blob/master/word2vec-nlp-tutorial/kaggle_word2vec_tutorial_2.ipynb)

In [0]:
!pip install -U -q PyDrive
!pip install -U -q gensim
!pip install -U -q Cython # 安装Cython提高Word2Vec的处理性能
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
data_file = drive.CreateFile({'id':'1MMQXKM8roP-wvWxn8cHOkqAXJ7ypkPpZ'}) # replace the id with id of file you want to access
data_file.GetContentFile('testData.tsv')
data_file = drive.CreateFile({'id':'1lxWF-fg3E49S7kP06caNytc0SmDTC5gX'})
data_file.GetContentFile('labeledTrainData.tsv')
data_file = drive.CreateFile({'id':'18_uCGu01SXkjTygQANpH6YCHQdNiwsDV'})
data_file.GetContentFile('unlabeledTrainData.tsv')

In [106]:
!ls

300features_40minwords_10context.model	testData.tsv
datalab					unlabeledTrainData.tsv
labeledTrainData.tsv			Word2Vec_BagOfClusters.csv
neg.xls					Word2Vec_model.csv
nltk_data				word_centroid_map_10avg.pickle
pos.xls					zhwiki-latest-pages-articles.xml.bz2


In [86]:
import re
import pickle
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
import nltk
import nltk.data
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def load_dataset(name, nrows=None):
    datasets = {
        'unlabeled_train':'unlabeledTrainData.tsv',
        'labeled_train':'labeledTrainData.tsv',
        'testData':'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    df = pd.read_csv(datasets[name], sep='\t', escapechar='\\',nrows=nrows)
    return df

## 读入无标签数据
用于训练生成word2vec词向量

In [8]:
df = load_dataset('unlabeled_train')
df.head()

Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


In [0]:
eng_stopwords = set(stopwords.words('english'))
def clean_text(text, remove_stopwords=True):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]',' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

In [0]:
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

In [0]:
def split_sentences(review):
  raw_sentences = tokenizer.tokenize(review.decode('utf-8').strip())
  sentences = [clean_text(s) for s in raw_sentences if s]
  return sentences

In [56]:
sentences = sum(df.review.apply(split_sentences),[])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


50000 reviews -> 537851 sentences


## 用gensim训练词嵌入模型

In [0]:
#设定词向量训练的参数
num_features = 300  #词向量长度
min_word_count = 40 #最小词频
num_workers = 4 # 线程数
context = 10 # 上下文窗口大小
downsampling = 1e-3  #负采样 对 高频词进行负采样
model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

In [59]:
print('Training model...')
model = Word2Vec(sentences, workers=num_workers,size=num_features, min_count=min_word_count,\
                          window=context,sample=downsampling)
model.init_sims(replace=True)
model.save(model_name)

Training model...


## 看看训练的词向量结果如何

In [60]:
print(model.doesnt_match('man woman child kitchen'.split()))
print(model.doesnt_match('france england germany berlin'.split()))

kitchen
berlin


  """Entry point for launching an IPython kernel.
  


In [61]:
model.most_similar('man')

  """Entry point for launching an IPython kernel.


[(u'lady', 0.6093131899833679),
 (u'woman', 0.5811398029327393),
 (u'men', 0.5336166620254517),
 (u'mans', 0.45269760489463806),
 (u'lad', 0.4468681216239929),
 (u'chap', 0.44640877842903137),
 (u'peasant', 0.4321844279766083),
 (u'monk', 0.42857974767684937),
 (u'grandson', 0.4267146587371826),
 (u'person', 0.42538565397262573)]

In [62]:
model.most_similar('awful')

  """Entry point for launching an IPython kernel.


[(u'terrible', 0.8140065670013428),
 (u'horrible', 0.7812572717666626),
 (u'dreadful', 0.7603800296783447),
 (u'atrocious', 0.7543813586235046),
 (u'horrid', 0.7384567260742188),
 (u'abysmal', 0.7305564284324646),
 (u'horrendous', 0.7270557880401611),
 (u'appalling', 0.7005763053894043),
 (u'lousy', 0.6944822072982788),
 (u'crappy', 0.6584889888763428)]

## 读入之前训练好的word2vec模型

In [0]:
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(model_name)

## 根据word2vec的结果取对影评文本进行编码
编码方式有一点粗暴,简单来说就是把这句话中的词的词向量做平均

In [64]:
df = load_dataset('labeled_train')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [0]:
def to_review_vector(review):
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model]) #只获取之前训练中得到的词对应的词向量
    return pd.Series(array.mean(axis=0)) #句子表征（对这句话中的词的词向量做平均）

In [67]:
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.002301,0.000195,-0.011667,0.001758,0.008971,-0.015142,-0.004608,-0.008976,-0.004569,-0.011369,...,0.012878,-0.008942,0.00183,0.006384,-0.003532,0.001806,-0.023057,0.008721,-0.023023,-0.004719
1,0.002214,-0.002991,-0.026946,-0.017171,-0.007454,-0.002659,0.011523,0.009194,0.021793,-0.012293,...,0.015273,-0.018626,-0.002221,0.014355,-1.3e-05,0.015347,-0.012987,-0.017007,-0.020576,0.016552
2,-0.021281,-0.014221,-0.024887,-0.01669,-0.007231,0.001583,0.020306,0.010289,0.008628,-0.02523,...,0.029082,0.003547,-0.004067,0.022749,0.008137,0.018943,-0.021524,0.011212,-0.018821,0.004907
3,-0.00906,0.004359,-0.014058,-0.00761,0.01676,-0.006075,0.006964,0.013396,0.00663,-0.010928,...,0.017541,-0.00087,0.001565,0.008802,-0.013177,0.013124,-0.025679,-0.015423,-0.030367,0.009022
4,-0.005941,-0.003862,-0.013713,0.004704,0.011728,-0.00598,0.003089,0.020213,-0.004871,-0.008754,...,0.022762,0.009058,0.000612,0.012036,-0.002984,0.006277,-0.023391,0.014227,-0.024221,-0.002127


## 用随机森林构建分类器

In [0]:
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

## 看看分类器在训练集上的准确率

In [69]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

In [0]:
#清理占用内容的变量
del df
del train_data_features

## 预测测试集结果并上传Kaggle

In [73]:
df = load_dataset('testData')
df.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [74]:
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.011843,-0.000651,-0.015461,0.003898,-0.001246,9.4e-05,-0.002539,0.029375,0.013902,-0.013791,...,0.030883,-0.009939,0.001893,0.034039,-0.010348,0.008567,-0.057584,-0.018047,-0.044727,0.012082
1,0.012279,0.001367,-0.003965,-0.00403,0.021583,-0.029047,-0.0144,-0.003117,0.006958,0.002174,...,0.014373,-0.016566,-0.001588,0.008155,-0.01891,-0.00676,-0.04282,0.009879,-0.033384,-0.007741
2,0.004804,0.0044,-0.010148,0.010583,0.031862,-0.017914,-0.012768,-0.011764,0.018405,0.000777,...,0.022423,-0.015491,0.008229,0.019291,-0.012224,-0.007576,-0.03702,-0.00471,-0.046705,-0.003077
3,-0.015947,-0.007554,0.001088,0.004873,0.006196,-0.001922,-4.6e-05,0.001228,0.02517,6.2e-05,...,0.024191,-0.004353,-0.001459,0.032023,-0.008207,0.001238,-0.043039,0.002011,-0.051958,-0.004043
4,-0.024728,-0.012909,-0.00875,0.000309,0.013506,0.004938,-0.006832,0.004253,-0.003488,-0.004558,...,0.015922,-0.016665,0.010026,0.012267,0.001648,0.007966,-0.006179,0.000792,-0.012695,0.022062


In [75]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id,'sentiment':result})
output.to_csv('Word2Vec_model.csv',index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [0]:
del df
del test_data_features
del forest

## 把测试集的预测结果保存到google driver上

In [0]:
drive_service = build('drive', 'v3')
file_metadata = {
  'name': 'Word2Vec_model.csv',
  'mimeType': 'text/plain'
}
media = MediaFileUpload('Word2Vec_model.csv', 
                        mimetype='text/plain',
                        resumable=True)
created = drive_service.files().create(body=file_metadata,
                                       media_body=media,
                                       fields='id').execute()

# 对词向量进行聚类研究和编码
使用KMeans进行聚类
<br>
训练好的模型是由词汇表中单词的特征向量所组成的。这些特征向量存储在叫做wv.vectors的numpy数组中.

In [81]:
word_vectors = model.wv.vectors
num_clusters = word_vectors.shape[0] // 10
print word_vectors.shape

(12907, 300)


In [0]:
kmeans_clustering = KMeans(n_clusters = num_clusters, n_jobs=-1)
idx = kmeans_clustering.fit_predict(word_vectors)

In [85]:
word_centroid_map = dict(zip(model.wv.index2word, idx))
keys = word_centroid_map.keys()
dict_slice = {}
for k in keys[0:5]:
    print k,word_centroid_map[k]

aided 44
writings 98
galactica 420
foul 1255
four 1197


In [0]:
#通过聚类把每个单词与聚类的索引号映射关系保存起来
filename = 'word_centroid_map_10avg.pickle'
with open(filename,'w') as f:
    pickle.dump(word_centroid_map, f)

## 输出一些cluster看看

In [89]:
for cluster in range(0,10):
    print('\nCluster %d' % cluster)
    print([w for w,c in word_centroid_map.items() if c == cluster])


Cluster 0
[u'method', u'projects', u'skill', u'capable', u'skills', u'enthusiasm', u'talent', u'craft', u'energy', u'abilities', u'ability']

Cluster 1
[u'unemployed', u'insists', u'dumped', u'pose', u'inviting', u'owns', u'humiliated', u'toast', u'tow', u'screws', u'sells', u'buys', u'calling', u'complains']

Cluster 2
[u'hidalgo', u'welcomed', u'scotty', u'footsteps', u'mitch', u'unsuccessful', u'riddle', u'ricco', u'crock', u'breakup', u'lucinda', u'valjean', u'det', u'sleuth', u'individually', u'impatient']

Cluster 3
[u'rough', u'applied', u'limits', u'todays', u'prevalent', u'outdated', u'primitive', u'pc', u'heavily']

Cluster 4
[u'stimulating', u'enthralling', u'unconventional']

Cluster 5
[u'disjointed', u'exposition', u'jumbled', u'trivial', u'monotonous', u'unfocused', u'meandering', u'uneventful', u'turgid', u'drags', u'aimless', u'mundane', u'stretches', u'stretched', u'talky', u'stretching', u'uninvolving', u'unexciting', u'dreary', u'forgetting', u'muddled', u'bogged', 

## 将平均数据转成cluster bag vectors

In [0]:
wordset = set(word_centroid_map.keys())

def make_cluster_bag(review):
    words = clean_text(review, remove_stopwords=True)
    # 使用聚类索引来对句子进行编码
    return (pd.Series([word_centroid_map[w] for w in words if w in wordset])).value_counts().reindex(range(num_clusters+1),fill_value=0)

In [91]:
df = load_dataset('labeled_train')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [92]:
train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1281,1282,1283,1284,1285,1286,1287,1288,1289,1290
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,1,0,0,0,0


## 再用随机模型算法建模

In [0]:
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

In [94]:
#获取模型在训练集上的准确率
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

In [0]:
#去掉无用的占内存的量
del df
del train_data_features

## 载入测试数据做预测

In [96]:
df = load_dataset('testData')
df.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [97]:
test_data_features = df.review.apply(make_cluster_bag)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1281,1282,1283,1284,1285,1286,1287,1288,1289,1290
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv('Word2Vec_BagOfClusters.csv',index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [0]:
del df
del test_data_features
del forest

In [0]:
drive_service = build('drive', 'v3')
file_metadata = {
  'name': 'Word2Vec_BagOfClusters.csv',
  'mimeType': 'text/plain'
}
media = MediaFileUpload('Word2Vec_BagOfClusters.csv', 
                        mimetype='text/plain',
                        resumable=True)
created = drive_service.files().create(body=file_metadata,
                                       media_body=media,
                                       fields='id').execute()