This repository has been archived by the owner on Nov 16, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 62
/
WordEmbedding.py
38 lines (33 loc) · 1.56 KB
/
WordEmbedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
###############################################################################
# WordEmbedding: featurize free text by mapping tokens through a
# pre-trained embedding model.
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# Load the bundled Wikipedia detox training sample as a FileDataStream.
train_path = get_dataset('wiki_detox_train').as_filepath()
stream = FileDataStream.read_csv(train_path, sep='\t')
print(stream.head())
# Sentiment SentimentText
# 0 1 ==RUDE== Dude, you are rude upload that carl p...
# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2 1 Stop trolling, zapatancas, calling me a liar m...
# 3 1 ==You're cool== You seem like a really cool g...
# 4 1 ::::: Why are you threatening me? I'm not bein...

# Stage 1: tokenize SentimentText into n-grams, emitting the raw tokens
# in a side column that the embedding stage consumes.
tokenizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens_column_name='ngram_TransformedText',
    columns={'ngram': ['SentimentText']})
# Stage 2: replace each token with its pre-trained embedding vector.
embedder = WordEmbedding(columns='ngram_TransformedText')
transform_pipeline = Pipeline([tokenizer, embedder])

# Fit the pipeline to the stream and transform it in a single pass.
transformed = transform_pipeline.fit_transform(stream)

# Inspect the resulting feature columns.
print(transformed.head())
# Sentiment ... ngram.douchiest ngram.award.
# 0 1 ... 0.0 0.0
# 1 1 ... 0.0 0.0
# 2 1 ... 0.0 0.0
# 3 1 ... 0.0 0.0
# 4 1 ... 0.0 0.0