In [1]:
import numpy as np
import pandas as pd

from string import punctuation

from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import scale
from tqdm import tqdm

from sklearn.cluster import KMeans

In [2]:
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))
tqdm.pandas(desc="progress-bar")

In [3]:
data_root = './'

### Data prepare & process

In [4]:
with open('./text.txt', 'r') as f:
    content = f.readlines()
content = [sent[:-1] for sent in content if sent != '\n']

df = pd.DataFrame({'Contents': content})

In [5]:
df.head()

Unnamed: 0,Contents
0,The work of Dr. He Jiankui presented at GeneEd...
1,The Chinese scientist He Jiankui claims he hel...
2,This is the future. Hard to accept but this wi...
3,They can’t represent Chinese scientist. The wh...
4,He stopped twins being born HIV positive. I do...


In [6]:

def tokenize(sentence):
    tokens = [x.lower() for x in tokenizer.tokenize(sentence)]
#     tokens = [x for x in tokens if x not in stop_words]
    tokens = [x for x in tokens if x not in punctuation]  # punctuation may be useful, to simplify this problem, I don't use it here
    if tokens:
        return tokens
    else:
        return np.NaN

In [7]:
df['tokens'] = df['Contents'].map(tokenize)

In [8]:
def labelize_tweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = f'{label_type}_{i}'
        labelized.append(TaggedDocument(v, [label]))
    return labelized

In [9]:
train_x = np.array(df.tokens)

In [10]:
w2v = Word2Vec(size=200, window=10, min_count=2, workers=10)
w2v.build_vocab([x for x in train_x])
x = [x for x in train_x]
w2v.train(x, total_examples=len(x), epochs=10)

(13651, 29150)

In [11]:
len(w2v.wv.vocab.keys())

341

In [12]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in train_x])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
len(tfidf)

vocab size : 40


In [13]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v[word].reshape((1, size)) * tfidf.get(word, 1)
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [14]:
train_vecs = np.concatenate([buildWordVector(z, 200) for z in train_x])
train_vecs = scale(train_vecs)

  


In [15]:
clt = KMeans(n_clusters=2, random_state=0)
labels = clt.fit_predict(train_vecs)

In [16]:
df['Result'] = labels
df = df.reset_index().rename(columns={'index':'ID'})

df[['ID', 'Contents', 'Result']].to_csv('./Q5_output.csv', index=False)

In [41]:
df

Unnamed: 0,ID,ID.1,Contents,tokens,Result
0,0,0,The work of Dr. He Jiankui presented at GeneEd...,"[The, work, of, Dr, He, Jiankui, presented, at...",0
1,1,1,The Chinese scientist He Jiankui claims he hel...,"[The, Chinese, scientist, He, Jiankui, claims,...",1
2,2,2,This is the future. Hard to accept but this wi...,"[This, is, the, future, Hard, to, accept, but,...",0
3,3,3,They can’t represent Chinese scientist. The wh...,"[They, can, ’, t, represent, Chinese, scientis...",1
4,4,4,He stopped twins being born HIV positive. I do...,"[He, stopped, twins, being, born, HIV, positiv...",1
5,5,5,If true this is wrong and don’t mess with the ...,"[If, true, this, is, wrong, and, don, ’, t, me...",0
6,6,6,"Sure! If the US does it!, nobody would have an...","[Sure, If, the, US, does, it, nobody, would, h...",0
7,7,7,It is true. Cost per genome and genome sequenc...,"[It, is, true, Cost, per, genome, and, genome,...",0
8,8,8,This sounds good.let's hope this is true .,"[This, sounds, good.let, s, hope, this, is, true]",0
9,9,9,Here's the rub. Normally ethical oversight com...,"[Here's, the, rub, Normally, ethical, oversigh...",1
