In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from string import punctuation as en_punc
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import warnings
warnings.filterwarnings('ignore')

In [11]:
with open('aux/stop-words.txt') as f:
    STOP_WORDS = [l.strip() for l in f]

In [15]:
irr = pd.read_csv('data/clean_wb/irr.csv', sep='\t', usecols=['wb_title','wb_body'])
ele = pd.read_csv('data/clean_wb/ele.csv', sep='\t', usecols=['wb_title','wb_body'])
miz = pd.read_csv('data/clean_wb/miz.csv', sep='\t', usecols=['wb_title','wb_body'])
voi = pd.read_csv('data/clean_wb/voi.csv', sep='\t', usecols=['wb_title','wb_body'])
dvb = pd.read_csv('data/clean_wb/dvb.csv', sep='\t', usecols=['wb_title','wb_body'])

irr['site'] = 'irr'
ele['site'] = 'ele'
miz['site'] = 'miz'
voi['site'] = 'voi'
dvb['site'] = 'dvb'

df = pd.concat([irr,ele,miz,voi,dvb])
df['wb_title'] = [
    str(text).translate(str.maketrans('', '', en_punc)) for text in df.wb_title]
df['wb_body'] = [
    str(text).translate(str.maketrans('', '', en_punc)) for text in df.wb_body]
print(df.shape)
display(df.head())

(250, 3)


Unnamed: 0,wb_title,wb_body,site
0,တရုတ် သမ္မတ ခရီးစဉ် က သမုဒ္ဒရာ ၂ စင်း သေနင်္ဂဗ...,တရုတ် သမ္မတ ရှီ ကျင့် ဖျင် သည် သမ္မတ ဦး ဝင်း မ...,irr
1,တရုတ် သမ္မတ ရဲ့ ခရီး ရှည် ချီတက် ပွဲ သစ်,၁၉၃၄ မှ ၁၉၃၆ အတွင်း ခရီး ရှည် ချီတက် ပွဲ စတင် ...,irr
2,ရက္ခိုင် ပြည် သူ့ အာဏာပိုင်အဖွဲ့ ကို ထူထောင် တ...,အစိုးရ နှင့် မြောက် ပိုင်း မဟာမိတ် တပ်ဖွဲ့ တွေ...,irr
3,အစိုးရ နှင့် မြောက် ပိုင်း မဟာမိတ် ဆွေးနွေးပွဲ...,အစိုးရ ငြိမ်းချမ်းရေး ကိုယ်စားလှယ် များ နှင့် ...,irr
4,ရဲဘော်သုံးကျိပ် ဝင် မိသားစု များ ၏ မိတ်ဆုံ စား...,၃၂ ကြိမ် မြောက် ရဲဘော်သုံးကျိပ် မိသားစု သာရေးန...,irr


## Simple Statistical Classifier

In [16]:
title_train, title_test, body_train, body_test, site_train, site_test = train_test_split(
    df.wb_title.values, df.wb_body.values, df.site.values, test_size=0.2)

In [20]:
title_vectorizer = TfidfVectorizer(tokenizer=lambda x:x.split(), stop_words=STOP_WORDS)
title_train_vec = title_vectorizer.fit_transform(title_train)
title_test_vec = title_vectorizer.transform(title_test)

body_vectorizer = TfidfVectorizer(tokenizer=lambda x:x.split(), stop_words=STOP_WORDS)
body_train_vec = body_vectorizer.fit_transform(body_train)
body_test_vec = body_vectorizer.transform(body_test)

In [21]:
from sklearn.linear_model import LogisticRegression

In [27]:
lr = LogisticRegression()
lr.fit(title_train_vec, site_train)
print('Accuracy using article titles:', lr.score(title_test_vec, site_test))

lr = LogisticRegression()
lr.fit(body_train_vec, site_train)
print('Accuracy using article bodies:', lr.score(body_test_vec, site_test))

Accuracy using article titles: 0.52
Accuracy using article bodies: 0.6


## Simple NN Classifier

In [35]:
onehotenc = OneHotEncoder()
onehot_site_train = onehotenc.fit_transform(site_train.reshape(-1, 1))
onehot_site_test = onehotenc.transform(site_test.reshape(-1,1))

### NN using Article Titles

In [65]:
input_dim = title_train_vec.shape[1]  # Number of features

model = Sequential()
# Simple perceptron
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(5, activation='sigmoid'))

model.compile(loss=tf.losses.categorical_crossentropy, 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 10)                11250     
_________________________________________________________________
dense_15 (Dense)             (None, 5)                 55        
Total params: 11,305
Trainable params: 11,305
Non-trainable params: 0
_________________________________________________________________


In [66]:
history = model.fit(title_train_vec, onehot_site_train,
                    epochs=10,
                    verbose=True,
                    batch_size=10)

Train on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [67]:
loss, accuracy = model.evaluate(title_train_vec, onehot_site_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(title_test_vec, onehot_site_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9700
Testing Accuracy:  0.5000


### NN using Article Bodies

In [68]:
input_dim = body_train_vec.shape[1]  # Number of features

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(5, activation='sigmoid'))

model.compile(loss=tf.losses.categorical_crossentropy, 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 10)                83690     
_________________________________________________________________
dense_17 (Dense)             (None, 5)                 55        
Total params: 83,745
Trainable params: 83,745
Non-trainable params: 0
_________________________________________________________________


In [69]:
history = model.fit(body_train_vec, onehot_site_train,
                    epochs=10,
                    verbose=True,
                    batch_size=10)

Train on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [70]:
loss, accuracy = model.evaluate(body_train_vec, onehot_site_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(body_test_vec, onehot_site_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9950
Testing Accuracy:  0.3000
