In [69]:
# import standard libraries
import pandas as pd
import numpy as np

# set pandas text output to 400
pd.options.display.max_colwidth = 400

# import spacy for NLP and re for regular expressions
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *
stemmer = PorterStemmer()

# import sklearn transformers, models and pipelines
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split

# word2vec
from gensim.models import Word2Vec

# Load the small language model from spacy
nlp = spacy.load('en_core_web_sm')

# set pandas text output to 400
pd.options.display.max_colwidth = 400

print("numpy version: {}".format(np.__version__))
print("pandas version: {}".format(pd.__version__))
print("spacy version: {}".format(spacy.__version__))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hryang06/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


numpy version: 1.18.4
pandas version: 1.0.3
spacy version: 2.2.4


In [70]:
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))

import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from keras.optimizers import Adam

In [71]:
#train = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/nlp/train.csv')
#test = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/nlp/test.csv')
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

train_without_label = train.drop('target', axis=1)
traintest = pd.concat([train_without_label,test])

print("[shape] train, test, train + test\n")
print(train.shape, test.shape, traintest.shape)

[shape] train, test, train + test

(7613, 5) (3263, 4) (10876, 4)


In [72]:
# Load the en_core_web_lg model
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser", "ner"])

# create train set by getting the document vector
docs_train = [nlp(doc).vector for doc in train.text]
X_train = np.vstack(docs_train)
print('Shape of train set: {}'.format(X_train.shape))

# create test set likewise
docs_test = [nlp(doc).vector for doc in test.text]
X_test = np.vstack(docs_test)
print('Shape of test set: {}'.format(X_test.shape))

# create target
y_train = train.target.copy()

Shape of train set: (7613, 300)
Shape of test set: (3263, 300)


### Data cleaning

In [73]:
# https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

import string
def remove_punc(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

In [74]:
# https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
slang_abbrev_dict = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'B4N': 'Bye For Now',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': 'For What It\'s Worth',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you',
    'ILU': 'I Love You',
    'IMHO': 'In My Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Ass Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'OMG': 'Oh My God',
    'PITA': 'Pain In The Ass',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My Ass Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The Fuck',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait',
    '7K': 'Sick:-D Laugher'
}

def unslang(text):
    if text.upper() in slang_abbrev_dict.keys():
        return slang_abbrev_dict[text.upper()]
    else:
        return text

In [75]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

def stemming(text):
    text = [stemmer.stem(word) for word in text]
    return text

In [76]:
for datas in [train, test]:
    datas['cleaned_text'] = datas['text'].apply(lambda x : remove_url(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_html(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_emoji(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : unslang(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_punc(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : tokenization(x.lower()))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_stopwords(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : stemming(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : ' '.join(x))

In [77]:
train.head(10)

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,resid ask shelter place notifi offic evacu shelter place order expect
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,13000 peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo rubi alaska smoke wildfir pour school
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1,rockyfir updat california hwi 20 close direct due lake counti fire cafir wildfir
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1,flood disast heavi rain caus flash flood street manit colorado spring area
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1,im top hill see fire wood
8,14,,,There's an emergency evacuation happening now in the building across the street,1,there emerg evacu happen build across street
9,15,,,I'm afraid that the tornado is coming to our area...,1,im afraid tornado come area


## modeling

<https://www.kaggle.com/rkritika1508/word2vec>

In [78]:
documents = []
for sentence in train['cleaned_text']:
    text = sentence.split()
    documents.append(text)

documents

[['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us'],
 ['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada'],
 ['resid',
  'ask',
  'shelter',
  'place',
  'notifi',
  'offic',
  'evacu',
  'shelter',
  'place',
  'order',
  'expect'],
 ['13000', 'peopl', 'receiv', 'wildfir', 'evacu', 'order', 'california'],
 ['got',
  'sent',
  'photo',
  'rubi',
  'alaska',
  'smoke',
  'wildfir',
  'pour',
  'school'],
 ['rockyfir',
  'updat',
  'california',
  'hwi',
  '20',
  'close',
  'direct',
  'due',
  'lake',
  'counti',
  'fire',
  'cafir',
  'wildfir'],
 ['flood',
  'disast',
  'heavi',
  'rain',
  'caus',
  'flash',
  'flood',
  'street',
  'manit',
  'colorado',
  'spring',
  'area'],
 ['im', 'top', 'hill', 'see', 'fire', 'wood'],
 ['there', 'emerg', 'evacu', 'happen', 'build', 'across', 'street'],
 ['im', 'afraid', 'tornado', 'come', 'area'],
 ['three', 'peopl', 'die', 'heat', 'wave', 'far'],
 ['haha',
  'south',
  'tampa',
  'get',
  'flood',
  'hah',
  'wait',
  'seco

Word2Vec()

- size = 워드 벡터의 특징 값. 즉, 임베딩 된 벡터의 차원.
- window = 컨텍스트 윈도우 크기
- min_count = 단어 최소 빈도 수 제한 (빈도가 적은 단어들은 학습하지 않는다.)
- workers = 학습을 위한 프로세스 수
- sg = 0은 CBOW, 1은 Skip-gram.

In [79]:
word2vec_model = Word2Vec(documents, size=100, window=10, min_count=2, workers=10, iter=10)

In [80]:
vectors = word2vec_model.wv
words = list(word2vec_model.wv.vocab)
num_words = len(word2vec_model.wv.vocab)

In [81]:
#vectors.most_similar('kill')
print(word2vec_model['kill'])

[-0.89875895  0.3619058   0.7632139  -0.12180634  0.1373595  -0.5115519
  0.6422087   0.6154926   0.6584172  -0.3389871  -0.23538919 -0.05384683
 -0.33430207  0.6104109   0.6757611   0.11342448  0.03194258  0.36466685
 -0.2977604  -0.17377649  0.8589057   0.08835721 -1.6103915   0.58986276
  0.17740929  0.7396982  -0.31754965 -0.600357   -0.4210207   0.12666042
 -0.24878675 -0.20860846 -0.44377398  0.34511948 -0.41462818 -0.52121073
 -1.1308595  -0.69503933  0.33199975 -0.00338147  0.17603628  0.7914963
 -0.13082777 -0.06586883 -1.1340004   1.536989    0.24375178 -0.0726503
  0.3618918  -0.05934199  0.6216598  -0.08672447  0.6084365  -0.20720194
  0.07981265 -0.05641282 -0.36378157  0.81986207  0.7081325  -0.29049912
 -0.22088763 -0.25462237 -0.28931385  0.49578035  1.5773712   0.43925527
  0.33251205 -0.36125833  0.20539331 -0.6612035   0.2751709  -0.76621395
 -0.38055307 -0.26190227 -0.07925801  0.75998455 -0.34223956 -0.62076664
  0.07032577 -0.03558413 -1.0906416   0.07342599  0.69

  print(word2vec_model['kill'])


In [82]:
MAX_LEN = 100
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(words)
sequences = tokenizer_obj.texts_to_sequences(words)

tweet_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [83]:
word_index = tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 5506


In [84]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec = word2vec_model[word]
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

  emb_vec = word2vec_model[word]
100%|██████████| 5506/5506 [00:00<00:00, 243631.58it/s]


In [85]:
model = Sequential()

embedding = Embedding(num_words,100, embeddings_initializer=Constant(embedding_matrix), input_length=MAX_LEN, trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [86]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          550700    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 100, 100)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 593,005
Trainable params: 42,305
Non-trainable params: 550,700
_________________________________________________________________


In [87]:
print('Shape of train',train.shape)

X = tweet_pad[:train.shape[0]]
y = train['target'].values
#test = tweet_pad[train.shape[0]:]

print('Shape of train',X.shape)
print("Shape of Validation ",y.shape)

#X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.15)
#print('Shape of train',X_train.shape)
#print("Shape of Validation ",X_test.shape)

Shape of train (7613, 6)
Shape of train (5506, 100)
Shape of Validation  (7613,)


In [None]:
history=model.fit(X_train, y_train, batch_size=4, epochs=15, validation_data=(X_test,y_test),verbose=2)

I create a word embedding for each document using Word2Vec. Word2Vec creates a dense representation for each word, such that words appearing in similar contexts have similar vectors. To get an embedding for the entire tweet, the mean of all vectors for the words in the tweet are taken. The assumption now is that similar tweets have similar vectors.

<https://www.kaggle.com/danielbilitewski/word2vec-and-logistic-regression>

In [12]:
# Load the en_core_web_lg model
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser", "ner"])

# create train set by getting the document vector
docs_train = [nlp(doc).vector for doc in train.text]
X_train = np.vstack(docs_train)
print('Shape of train set: {}'.format(X_train.shape))

# create test set likewise
docs_test = [nlp(doc).vector for doc in test.text]
X_test = np.vstack(docs_test)
print('Shape of test set: {}'.format(X_test.shape))

# create target
y_train = train.target.copy()

Shape of train set: (7613, 300)
Shape of test set: (3263, 300)


In [15]:
# create machine learning pipeline
word2vec_pipe = Pipeline([('estimator', LogisticRegression())])

# cross validate
print('F1 score: {:.3f}'.format(np.mean(cross_val_score(word2vec_pipe, X_train, y_train, scoring = 'f1'))))

# fit pipeline
word2vec_pipe.fit(X_train, y_train)

# predict on test set
pred = word2vec_pipe.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('word2vec_baseline.csv', index = False)

F1 score: 0.729


In [None]:
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from gensim.models import word2vec
import time
import numpy as np

category = ['정치', '경제', '사회', 'IT', '과학', '자동차', '부동산',
            '생활', '세계', '의학', '인테리어', '예술', '연예']

filename='namu_test_article.txt'
filename4 = './hub/namu.model'


def make_DF(filename):
    file = codecs.open(filename, 'r', encoding='utf-8')
    text = file.read()

    twitter = Okt()
    word_dic = {}
    lines = text.split('\r\n')
    for line in lines:
        malist = twitter.pos(line)
        for taeso, pumsa in malist:
            if pumsa == 'Noun':
                if not (taeso in word_dic):
                    word_dic[taeso] = 0
                word_dic[taeso] += 1
    print(word_dic)
    keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)

    top20_dic = {}
    if len(keys)>20:
        for word, count in keys[:20]:
            top20_dic[word]=count
    else:
        for word, count in keys:
            top20_dic[word]=count
    return top20_dic


print('Model test')
t1 = time.time()
model = word2vec.Word2Vec.load(filename4)
t2 = time.time()
print('model load elapsed=', t2-t1)
top20_dic=make_DF(filename)
for ks in top20_dic.keys():
    print(ks, top20_dic[ks], end=" ,")

# 카테고리별 단어의 유사도
cascores=[]
for ca in category:
    sims = []
    dfs = []
    for ks in top20_dic.keys():
        try:
            v1 = model.similarity(ca, ks)
            sims.append( v1 )
        except KeyError:
            sims.append( 0.0 )
        v2 = top20_dic[ks]
        dfs.append( v2 )
        print(ca, ks, 'similarity=',v1, 'df=',v2)

    sims = np.asarray(sims)
    dfs = np.asarray(dfs)

    # 단어출연 빈도를 가중치로 한 스코어
    val = np.dot(sims, dfs)
    print('wsum=', val)
    sco=val/ np.sum(dfs)
    print('scor=', sco)
    cascores.append(sco)

cascores=np.asarray(cascores)
maxidx = np.argmax(cascores)

# print(category)
# print(cascores, maxidx)

categorydic = {
    cate:scor for cate, scor in zip(category, cascores)
}
pc=sorted(categorydic, key=lambda k:categorydic[k], reverse=True)
print(pc)
print( sorted(cascores, reverse=True) )
print( 'predict=',pc[0],"/",pc[1] )