In [1]:
import pandas as pd
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised by libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the train and test splits (tab-separated files).
data_train = pd.read_csv('./sentiment-analysis-on-movie-reviews/train.tsv/train.tsv', sep='\t')
data_test = pd.read_csv('./sentiment-analysis-on-movie-reviews/test.tsv/test.tsv', sep='\t')
# Load the stop-word list (words treated as uninformative for sentiment analysis),
# one entry per line. The context manager closes the file handle as soon as the
# contents are read instead of leaving it open until garbage collection.
with open('./stop_words.txt', encoding='utf-8') as stop_words_file:
    stop_words = stop_words_file.read().splitlines()

In [76]:
# Build the corpus: pool every phrase from the train and test sets into one Series.
train_phrases, test_phrases = data_train['Phrase'], data_test['Phrase']
phrases = pd.concat([train_phrases, test_phrases])
# Target labels come from the training set.
label = data_train['Sentiment']

使用词袋模型把单词转为向量，才可被计算机识别；  
词袋模型基于计数，关注词频，未考虑词序，因此忽略了上下文语义信息。

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer converts text documents into a term-count matrix.
vectorizer = CountVectorizer(
    stop_words=stop_words,   # terms from the stop-word list are dropped
    analyzer='word',         # tokenize at word level rather than character level
    ngram_range=(1, 4),      # use n-grams of length 1 through 4
    max_features=160000,     # cap the vocabulary (matrix columns) at 160000 terms
)

In [78]:
# Fit the bag-of-words model (learn its vocabulary) on the corpus.
vectorizer.fit(phrases)

In [79]:
from sklearn.model_selection import train_test_split

# Carve a validation set (25%) out of the labelled training phrases;
# the fixed random_state keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    train_phrases,
    label,
    test_size=0.25,
    random_state=42,
)
# Display the sizes of the full set and of both partitions.
train_phrases.shape, x_train.shape, x_val.shape

((156060,), (117045,), (39015,))

In [80]:
# Turn the training and validation phrases into count vectors with the fitted model.
# NOTE(review): x_train / x_val are rebound from raw text to matrices here, so this
# cell presumably cannot be re-run on its own — re-run the split cell first.
x_train, x_val = map(vectorizer.transform, (x_train, x_val))

In [81]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression sentiment classifier with default settings
# (fit returns the estimator itself, so construction and training are chained).
logistic = LogisticRegression().fit(x_train, y_train)
# Report the score on the held-out validation split.
print(f'Validation accuracy: {logistic.score(x_val, y_val)}')

Validation accuracy: 0.6424195822119697


In [63]:
# Vectorize the test-set phrases with the fitted bag-of-words model.
test_X = vectorizer.transform(data_test.loc[:, 'Phrase'])
# Predict a sentiment class for every test phrase.
prediction = logistic.predict(test_X)

In [66]:
# Attach the predictions to the test frame as a 'Sentiment' column.
data_test['Sentiment'] = prediction
# Keep only the two columns required by the submission format.
submit_data = data_test[['PhraseId', 'Sentiment']]
# Write the submission file without the index column.
submit_data.to_csv('submission.csv', index=False)

In [65]:
# Preview the first rows of the submission table.
submit_data.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3
