### Environment setup:

In [0]:
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 20.6MB/s eta 0:00:01[K     |█████████▌                      | 20kB 1.7MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 2.2MB/s eta 0:00:01[K     |███████████████████             | 40kB 2.5MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 2.3MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.0MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3014341 sha256=8741da15f478a58240f96561399b639fdd2f77ffccf6a13a6bd2cf62dd0d8c20
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle
import csv

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import fasttext
import fasttext.util

In [0]:
## Mount Google Drive at /content/drive so the project files under 'My Drive' are reachable.
## Colab-only; may prompt for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Load and prepare data:

In [0]:
## Paths: pre-trained fastText files sit in the working directory,
## project data sits on the mounted Drive.
fp_zh_vector = 'cc.zh.300.vec'
fp_zh_model = 'cc.zh.300.bin'
base_dir = os.path.join('/content/drive', 'My Drive/ML_Colab')
data_dir = os.path.join(base_dir, 'Data')
fp_data = os.path.join(base_dir, 'Data', '1.2_data_cleaned.pkl')

In [0]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz
!gunzip cc.zh.300.vec.gz
!gunzip cc.zh.300.bin.gz

--2020-06-15 15:13:05--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1358817100 (1.3G) [binary/octet-stream]
Saving to: ‘cc.zh.300.vec.gz’


2020-06-15 15:14:50 (12.4 MB/s) - ‘cc.zh.300.vec.gz’ saved [1358817100/1358817100]

--2020-06-15 15:14:51--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4478681770 (4.2G) [application/octet-stream]
Saving to: ‘cc.zh.300.bin.gz’


2020-06-15 15:20:02 (13.8 MB/s) - ‘cc.zh.300.bin.gz’ saved [4

In [0]:
## Experiment settings:
##   rs        - random seed passed to sampling and train/test splitting
##   tr_sz     - training fraction (not referenced by the cells shown in this notebook)
##   data_frac - fraction of the cleaned data to sample (1.0 = every row)
rs, tr_sz, data_frac = 7, 0.8, 1.0

In [0]:
## Load the cleaned review data from its pickle and report its dimensions.
## NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(fp_data, 'rb') as pkl_file:
    data = pickle.load(pkl_file)
print(data.shape)

(2125056, 13)


In [0]:
## Keep only the review text and its star rating. `.copy()` makes `data` an independent
## frame, so adding a column below does not raise pandas' SettingWithCopyWarning.
data = data[['txt_clean', 'star']].copy()
## Number of whitespace-separated tokens in each review
data['words_count'] = data['txt_clean'].apply(lambda x: len(x.split()))

In [0]:
## Remove empty comments (fewer than 1 word).
## The mask is computed once and applied by boolean selection instead of an in-place
## drop by index label, so the cell does not mutate the frame an earlier cell created.
is_empty = data['words_count'] < 1
print('# Samples with words count less than 1: {}'.format(is_empty.sum()))
data = data[~is_empty]
data.shape

# Samples with words count less than 1: 10089


(2114967, 3)

### Multi-Class classification using `fastText`:

In [0]:
## Randomly sample a fraction `data_frac` of the data (seeded with `rs`).
## With data_frac = 1.0 every row is kept, so the shape is unchanged.
_data = data.sample(frac=data_frac, random_state=rs)
_data.shape

(2114967, 3)

In [0]:
## fastText recognises labels by their `__label__` prefix, so build one from each star value
_data['star_lbl'] = _data['star'].map(lambda star: '__label__' + str(star))
## Replace embedded newlines and tabs with spaces in the review text
_data['txt_clean'] = (
    _data['txt_clean']
    .replace('\n', ' ', regex=True)
    .replace('\t', ' ', regex=True)
)

In [0]:
## Train, test and validation split: 30% is held out as the test set, then 20% of the
## remainder becomes the validation set (roughly 56% train / 14% valid / 30% test).
## Both splits live in one cell and the intermediate frame has its own name, so
## re-running the cell cannot shrink `train` a second time.
train_valid, test = train_test_split(_data, test_size=0.3, random_state=rs)
train, valid = train_test_split(train_valid, test_size=0.2, random_state=rs)
del train_valid  # only needed to derive train/valid
assert len(train) + len(valid) + len(test) == len(_data)

In [0]:
## Size of the validation split (rows, columns)
valid.shape

(296096, 4)

In [0]:
## Keep only the text and label columns for the fastText input files
## (`_valid` is prepared for later use).
_train, _test, _valid = (
    split[['txt_clean', 'star_lbl']] for split in (train, test, valid)
)
## Report the size of each split
for caption, split in (('# of train:', train), ('# of test:', test), ('# of valid:', valid)):
    print(caption, len(split))

# of train: 1184380
# of test: 634491
# of valid: 296096


In [0]:
## Output locations (under `base_dir`) of the fastText input files for the 5-class task
train_txt_fp, test_txt_fp, valid_txt_fp = (
    os.path.join(base_dir, txt_name)
    for txt_name in (
        'train_multi_fasttext.txt',
        'test_multi_fasttext.txt',
        'valid_multi_fasttext.txt',
    )
)

In [0]:
## Write each split as a plain '.txt' file for fastText: one review per line, the text
## followed by its label, with no header, index or quoting.
fasttext_csv_opts = dict(index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
for split_frame, split_path in ((_train, train_txt_fp), (_test, test_txt_fp), (_valid, valid_txt_fp)):
    split_frame.to_csv(split_path, **fasttext_csv_opts)

In [0]:
## Preview the (text, label) pairs written to the training file
_train.head()

Unnamed: 0,txt_clean,star_lbl
697199,吴亦凡 丑 无 演技 姚晨 真 恶心 林 更新 王丽坤加 一星,__label__3
1437384,很 庞大 阵容 很 严密 逻辑 故事性 很强 一路 无尿点,__label__5
27832,第二天 没有 精神,__label__4
1721733,世界末日 哪有 英雄 最后 不 开枪 结局 开 枪 一种 结局 人人 都 想 苟且 做 英雄 多难,__label__4
195199,没 看过 1 2 仅评 特效,__label__5


In [0]:
## Load the pre-trained fastText model  --- only for test (not applicable in this notebook)
# model = fasttext.load_model(fp_zh_model)

In [0]:
# model.get_nearest_neighbors('砰', k=5)

In [0]:
## Train a classifier model with pre-trained vectors.
## loss='ova' selects fastText's one-vs-all loss; dim=300 is set to match the 300-dimensional
## cc.zh.300 vectors passed as `pretrainedVectors`.
_model = fasttext.train_supervised(input=train_txt_fp, lr=0.1, loss='ova', dim=300, pretrainedVectors=fp_zh_vector)

In [0]:
## Sanity check: the 5 nearest neighbours of '砰' in the trained model's embedding space
_model.get_nearest_neighbors('砰', k=5)

[(0.841676652431488, '嘭'),
 (0.7484003305435181, '咣'),
 (0.7432228922843933, '啪'),
 (0.7091469168663025, '砰——'),
 (0.7020602822303772, '哐')]

In [0]:
## Labels known to the trained model (one per star rating)
_model.labels

['__label__4', '__label__5', '__label__3', '__label__1', '__label__2']

In [0]:
## Evaluate the model's performance (multi-class classification).
## `test` returns (number of examples, precision, recall) computed on the test file; every
## line carries exactly one label here, which is presumably why both values coincide.
n, prec, recall = _model.test(test_txt_fp)

print(f'# of test: {n}')
print(f'Precision: {round(prec, 3)}')
print(f'Recall: {round(recall, 3)}')

# of test: 634491
Precision: 0.519
Recall: 0.519


In [0]:
#### Evaluation results: #####

## epoch:5   => (40053, 0.5032082490699823, 0.5032082490699823)
## epoch: 25 => (40053, 0.472973310363768, 0.472973310363768)
## epoch: 50, lr=0.5 => (40053, 0.45854243127855593, 0.45854243127855593)
## lr=1.0, epoch=25, wordNgrams=2 => (48063, 0.4698000540956661, 0.4698000540956661)
## lr=0.1, loss='ova', dim=300, pretrainedVectors=fp_zh_vector => (150197, 0.5171474796434017, 0.5171474796434017)

In [0]:
## Preview the held-out test split (all columns)
test.head()

Unnamed: 0,txt_clean,star,words_count,star_lbl
1653658,超 喜欢 郭采洁,5,3,__label__5
780456,最后 半小时 发力 已经 哭,4,5,__label__4
114668,画面 不错 导演 编剧 不会 讲故事 逻辑 混乱 白白浪费 整个 团队 努力,3,12,__label__3
104970,剧情 有点 难受 画面 抒情 很多,3,6,__label__3
594487,周星驰 有没有 关系 至少 很 好看 很多 装逼 人 片子 骂 一文不值 认为 喜欢 周星驰...,5,22,__label__5


In [0]:
## Collect the test texts in a plain Python list for `predict`
_test_list = test['txt_clean'].tolist()
len(_test_list)

634491

In [0]:
## Make predictions on test set --- fastText model returns a tuple containing predicted labels and their probabilities
## (one entry per input text, in input order)
y_pred, y_prob = _model.predict(_test_list)

In [0]:
## Flatten the per-text predictions into a 1-D array and line it up with the true labels
y_pred = np.asarray(y_pred).reshape(-1)
y_true = np.array(test['star_lbl'].to_list())
y_pred.shape, y_true.shape

((634491,), (634491,))

In [0]:
## Per-class precision / recall / F1 for the 5-class star prediction
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

  __label__1       0.58      0.57      0.57     56997
  __label__2       0.42      0.13      0.20     53181
  __label__3       0.46      0.47      0.47    141543
  __label__4       0.47      0.50      0.49    191753
  __label__5       0.59      0.67      0.63    191017

    accuracy                           0.52    634491
   macro avg       0.50      0.47      0.47    634491
weighted avg       0.51      0.52      0.51    634491



### 3-class sentiment classification (negative / neutral / positive) using `fastText`:


In [0]:
## Randomly sample a fraction `data_frac` of the data (seeded with `rs`).
## With data_frac = 1.0 every row is kept, so the shape is unchanged.
bin_data = data.sample(frac=data_frac, random_state=rs)
bin_data.shape

(2114967, 3)

In [0]:
## Segment star values into bins (right-inclusive intervals, pd.cut's default):
##   (-1, 2] -> 'neg',  (2, 3] -> 'neut',  (3, 5] -> 'pos'
bins = [-1, 2, 3, 5]
bin_data['sentiment'] = pd.cut(bin_data['star'], bins=bins, labels=['neg','neut','pos'])

In [0]:
## Spot-check the star -> sentiment mapping on 20 random rows; seeded so the preview is
## the same on every run.
bin_data.sample(n=20, random_state=rs)

Unnamed: 0,txt_clean,star,words_count,sentiment
757322,失望,3,1,neut
1829936,擎天柱 爱,5,2,pos
1667,真是 炒鸡 难看,2,3,neg
1276377,蒙语 插曲 之海然 海然 印象 深刻,3,6,neut
1052424,大 晚上 一个 人 哭 稀里哗啦 以为 一部 矫情 狗血 青春片 出乎意料 走心,4,13,pos
1100038,绿巨人 卖 下萌 事实证明 浩克 最 无敌,5,7,pos
799849,讲 其实 不是 爱情 激情,4,5,pos
397902,特别 好看,5,2,pos
1185281,先不说 片子 烂 不烂 一个 很 韩寒 电影 不 说 小 时代 定位 不 四娘 都 说 赚钱...,3,44,neut
727530,周星驰 风格 搞笑片 看 粤语 配音 版有 好处 会 忘 小 鲜肉 演员 本身 粤语 配音 ...,3,29,neut


In [0]:
## fastText recognises labels by their `__label__` prefix, so build one from each sentiment value
bin_data['sentiment_lbl'] = bin_data['sentiment'].apply(lambda sentiment: '__label__' + str(sentiment))
## Replace embedded newlines and tabs with spaces in the review text
bin_data['txt_clean'] = (
    bin_data['txt_clean']
    .replace('\n', ' ', regex=True)
    .replace('\t', ' ', regex=True)
)

In [0]:
## 70% of the rows go to training; the remaining hold-out `_h` is halved into
## test and validation sets.
train, _h = train_test_split(bin_data, random_state=rs, train_size=0.7)
test, valid = train_test_split(_h, random_state=rs, train_size=0.5)

## Keep only the text and label columns for the fastText input files
_train, _test, _valid = (
    split[['txt_clean', 'sentiment_lbl']] for split in (train, test, valid)
)
## Report the size of each split
for caption, split in (('# of train:', train), ('# of test:', test), ('# of valid:', valid)):
    print(caption, len(split))

# of train: 1480476
# of test: 317245
# of valid: 317246


In [0]:
## Output locations (under `base_dir`) of the fastText input files for the sentiment task
train_txt_bin_fp, test_txt_bin_fp, valid_txt_bin_fp = (
    os.path.join(base_dir, txt_name)
    for txt_name in (
        'train_bin_fasttext.txt',
        'test_bin_fasttext.txt',
        'valid_bin_fasttext.txt',
    )
)

In [0]:
## Write each split as a plain '.txt' file for fastText: one review per line, the text
## followed by its label, with no header, index or quoting.
fasttext_csv_opts = dict(index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
for split_frame, split_path in ((_train, train_txt_bin_fp), (_test, test_txt_bin_fp), (_valid, valid_txt_bin_fp)):
    split_frame.to_csv(split_path, **fasttext_csv_opts)

In [0]:
## Train a classifier model with pre-trained vectors.
## dim=300 is set to match the 300-dimensional cc.zh.300 vectors passed as `pretrainedVectors`;
## all other hyper-parameters are left at fastText's defaults.
bin_model = fasttext.train_supervised(input=train_txt_bin_fp, dim=300, pretrainedVectors=fp_zh_vector)

In [0]:
## Labels known to the trained model (one per sentiment class)
bin_model.labels

['__label__pos', '__label__neut', '__label__neg']

In [0]:
## Evaluate the model's performance (3-class sentiment classification: neg / neut / pos).
## `test` returns (number of examples, precision, recall) computed on the test file.
n, prec, recall = bin_model.test(test_txt_bin_fp)

print(f'# of test: {n}')
print(f'Precision: {round(prec, 2)}')
print(f'Recall: {round(recall, 2)}')

# of test: 317245
Precision: 0.72
Recall: 0.72


In [0]:
## Preview the held-out test split of the sentiment task (all columns)
test.head()

Unnamed: 0,txt_clean,star,words_count,sentiment,sentiment_lbl
1697158,豆瓣 口碑 很 个人 很 喜欢,4,6,pos,__label__pos
718823,中立 分 许多 人 觉得 欠 周星驰 一张 电影票 看 完 之后 却说 再也 不相 欠 特效...,3,43,neut,__label__neut
149189,萌上 盾 铁 QAQ,5,4,pos,__label__pos
1390182,特效 本来 女主是 景甜 忍 皇上 出现 完蛋,2,8,neg,__label__neg
147373,比复联 2 好,4,3,pos,__label__pos


In [0]:
## Collect the test texts in a plain Python list for `predict`
_test_list = test['txt_clean'].tolist()
len(_test_list)

317245

In [0]:
## Make predictions on test set --- fastText model returns a tuple containing predicted labels and their probabilities
## (one entry per input text, in input order)
y_pred, y_prob = bin_model.predict(_test_list)

In [0]:
## Flatten the per-text predictions into a 1-D array and line it up with the true labels
y_pred = np.asarray(y_pred).reshape(-1)
y_true = np.array(test['sentiment_lbl'].to_list())
y_pred.shape, y_true.shape

((317245,), (317245,))

In [0]:
## Per-class precision / recall / F1 for the neg / neut / pos sentiment prediction
print(classification_report(y_true, y_pred))

               precision    recall  f1-score   support

 __label__neg       0.69      0.62      0.65     55174
__label__neut       0.52      0.33      0.40     70698
 __label__pos       0.77      0.90      0.83    191373

     accuracy                           0.72    317245
    macro avg       0.66      0.62      0.63    317245
 weighted avg       0.70      0.72      0.71    317245

