[View in Colaboratory](https://colab.research.google.com/github/JozeeLin/competition-project/blob/master/word2vec-nlp-tutorial/kaggle_word2vec_tutorial_1.ipynb)

In [3]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import nltk
from nltk.corpus import stopwords

## pandas读入训练数据

In [19]:
# Load the labeled training reviews: a tab-separated file in which a
# backslash acts as the escape character.
df = pd.read_csv('labeledTrainData.tsv', sep='\t', escapechar='\\')
# A single pre-formatted argument keeps this line valid, and its output
# unchanged, under both Python 2 and Python 3.
print('Number of reviews: {}'.format(len(df)))
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


### 对影评数据做预处理，大概有以下环节：

1. 去掉html标签
1. 移除标点
1. 切分成词/token
1. 去掉停用词
1. 重组为新的句子

In [5]:
# Fetch NLTK's stopwords corpus so that stopwords.words('english') can be
# loaded when the cleaning function is defined.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# English stopwords as a set, for fast membership tests in clean_text.
eng_stopwords = set(stopwords.words('english'))


def clean_text(text):
    """Turn one raw review into a space-separated string of kept words.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup's 'html.parser'.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop every word found in ``eng_stopwords``.
      5. Join the remaining words with single spaces.

    Args:
        text: The raw review text (may contain HTML).

    Returns:
        The cleaned review as a single string.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    kept = []
    for token in letters_only.lower().split():
        if token not in eng_stopwords:
            kept.append(token)
    return ' '.join(kept)

## 使用前面定义的clean_text函数清洗数据

In [21]:
# Run every raw review through clean_text and store the result in a new
# 'clean_review' column next to the original text.
cleaned_reviews = df['review'].apply(clean_text)
df['clean_review'] = cleaned_reviews
df.head()

Unnamed: 0,id,sentiment,review,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...",classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...


## 抽取bag of words特征(用sklearn的CountVectorizer)

In [22]:
# Bag-of-words features capped at the 5000 most frequent terms.
# Author's experiment notes: this configuration scored 0.84, while the
# variant below (explicit analyzer='word') was recorded as lowering the
# score to 0.55.
#vectorizer = CountVectorizer(max_features=5000,analyzer='word')
vectorizer = CountVectorizer(max_features=5000)
# Learn the vocabulary and count terms, then densify the sparse result.
bow_sparse = vectorizer.fit_transform(df['clean_review'])
train_data_features = bow_sparse.toarray()
train_data_features.shape

(25000, 5000)

## 训练分类器

In [23]:
# Fit a 100-tree random forest on the bag-of-words counts. random_state is
# pinned so that repeated runs build the same forest and therefore produce
# the same predictions and submission file.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

In [25]:
# Check the model's accuracy on the data it was trained on.
# `df` must still be the training frame here: the test frame that is loaded
# into `df` further down has no `sentiment` column.
train_data_pred = forest.predict(train_data_features)
# Accuracy is the fraction of predictions that match the label. np.mean over
# the boolean matches yields a float on both Python 2 and Python 3, whereas
# dividing an integer count by len(df) truncates under Python 2.
train_accuracy = np.mean(train_data_pred == df.sentiment.values)
print('Training accuracy: {:.4f}'.format(train_accuracy))

AttributeError: ignored

In [12]:
# Confusion matrix of the true sentiment labels (first argument) against the
# forest's predictions on the same features it was fitted on (second argument).
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

## 删除不再使用的占内存变量

In [0]:
# Drop the training frame and its dense feature matrix; neither name is
# needed by the prediction cells, which rebind `df` to the test data.
del df, train_data_features

## 读取测试数据进行预测

In [24]:
# Load the unlabeled test reviews (same tab-separated, backslash-escaped
# format as the training file) and rebind `df` to them.
df = pd.read_csv('testData.tsv', sep='\t', escapechar='\\')
# A single pre-formatted argument keeps this line valid, and its output
# unchanged, under both Python 2 and Python 3.
print('Number of reviews: {}'.format(len(df)))
# Clean the test reviews with the same function used for the training data.
df['clean_review'] = df.review.apply(clean_text)
df.head()

Number of reviews: 25000


Unnamed: 0,id,review,clean_review
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main themes mortality nostalgia...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kids saw tonight child loved one point k...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression several different ...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction small time mob life filmed ...


In [26]:
# Encode the cleaned test reviews with the already-fitted vectorizer
# (transform only, no refitting), then densify the sparse counts.
test_counts = vectorizer.transform(df['clean_review'])
test_data_features = test_counts.toarray()
test_data_features.shape

(25000, 5000)

In [27]:
# Predict a sentiment label for every test review and pair it with the
# review id in a two-column frame.
result = forest.predict(test_data_features)
submission_columns = {'id': df['id'], 'sentiment': result}
output = pd.DataFrame(submission_columns, columns=['id', 'sentiment'])
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [28]:
# Write the id/sentiment predictions to CSV, omitting the DataFrame index.
output.to_csv('bag_of_words_model.csv',index=False)

In [29]:
# Upload the predictions CSV through a Drive v3 service object.
# NOTE(review): `build` and `MediaFileUpload` are not imported anywhere in the
# visible notebook. Presumably they are googleapiclient.discovery.build and
# googleapiclient.http.MediaFileUpload, and an authentication step is expected
# to have run first -- confirm and add the missing import/auth cell.
drive_service = build('drive', 'v3')
# Metadata for the file to create: its name and MIME type.
file_metadata = {
  'name': 'bag_of_words_model.csv',
  'mimeType': 'text/plain'
}
# Wrap the local CSV as the upload payload, with resumable=True.
media = MediaFileUpload('bag_of_words_model.csv', 
                        mimetype='text/plain',
                        resumable=True)
# Create the file; fields='id' asks for only the id in the response.
created = drive_service.files().create(body=file_metadata,
                                       media_body=media,
                                       fields='id').execute()