In [1]:
from urllib.request import urlopen
import pickle
import pandas as pd
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

In [2]:
# Stemmer instance built by Sastrawi's StemmerFactory; used later to stem the article text.
stemmer = StemmerFactory().create_stemmer()

# Malay stop-word list (one word per line) downloaded from a GitHub gist.
STOPWORDS_URL = 'https://gist.githubusercontent.com/xxMrPHDxx/d7244906ff8159f6e030790162542524/raw/5ce745520a75af453eda44286ea75bf310a4702f/stopword-malay.txt'

# Use the response as a context manager so the connection is always closed.
with urlopen(STOPWORDS_URL) as response:
    stopword_text = response.read().decode('utf-8')

# splitlines() + strip() copes with '\r\n' line endings, and blank lines are
# dropped so the set never contains an empty "stop word" (which would turn into
# a pattern that matches at every word boundary in the removal step).
stop_words = {line.strip() for line in stopword_text.splitlines() if line.strip()}

In [3]:
# Create initial dataset
from os import mkdir, listdir
from os.path import isdir, exists


def _read_article(path):
    """Return the text of `path`.

    The file is decoded as UTF-8 first; if that raises UnicodeDecodeError it is
    re-read with the platform default encoding. The handle is always closed.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except UnicodeDecodeError:
        with open(path, 'r') as handle:
            return handle.read()


# Every sub-directory of 'articles' is one category. listdir() returns entries
# in arbitrary order, so sort them to make the category codes reproducible, and
# skip stray non-directory entries that would otherwise break listdir(folder).
corpus_dirs = sorted(entry for entry in listdir('articles')
                     if isdir('articles/{}'.format(entry)))

# Category name (dashes replaced by spaces) -> integer code.
category_map = {corpus.replace('-', ' '): i for i, corpus in enumerate(corpus_dirs)}

# One record per '.txt' article; the frame is built once at the end.
# (Collecting records also avoids a module-level list called `id`, which would
# shadow the builtin for the rest of the notebook.)
records = []
for corpus_dir in corpus_dirs:
    folder = 'articles/{}'.format(corpus_dir)
    category = corpus_dir.replace('-', ' ')
    # Sorted so that row order (and therefore the later seeded split) is stable.
    for article_name in sorted(listdir(folder)):
        if article_name.split('.')[-1] != 'txt':
            continue
        article_path = '{}/{}'.format(folder, article_name)
        content = _read_article(article_path)
        records.append({
            'File_Name': article_name,
            'Content': content,
            'Category': category,
            'Complete_Filename': article_path,
            'id': category_map[category],
            'News_length': len(content),
        })

# Explicit column list keeps the column order fixed even when no article is found.
df = pd.DataFrame(records, columns=['File_Name', 'Content', 'Category',
                                    'Complete_Filename', 'id', 'News_length'])

# Persist the raw dataset so later stages can be re-run without re-reading the corpus.
if not exists('Pickles'): mkdir('Pickles')
with open('Pickles/initial_dataset.pickle', 'wb') as file:
    pickle.dump(df, file)

df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,2017-07_302049_bekas-isteri-adi-putra-didakwa-...,SEREMBAN: Bekas isteri kedua pelakon Adi Putra...,berita kes,articles/berita-kes/2017-07_302049_bekas-ister...,0,1069
1,2017-07_303332_sebutan-kes-19-penuntut-upnm-14...,KUALA LUMPUR: Mahkamah Majistret menetapkan 14...,berita kes,articles/berita-kes/2017-07_303332_sebutan-kes...,0,2171
2,2017-07_307248_mat-motor-terkepung-391-saman-d...,KUALA LUMPUR: Sebanyak 391 saman dikeluarkan a...,berita kes,articles/berita-kes/2017-07_307248_mat-motor-t...,0,2080
3,2017-08_310014_guna-vsp-bantu-cegah-jenayah-po...,KUANTAN: Polis Diraja Malaysia (PDRM) mahu leb...,berita kes,articles/berita-kes/2017-08_310014_guna-vsp-ba...,0,1844
4,2017-08_318783_lelaki-nigeria-tak-dapat-rogol-...,SUBANG JAYA: Kehadiran kereta peronda polis (M...,berita kes,articles/berita-kes/2017-08_318783_lelaki-nige...,0,1373


In [4]:
# Build `Content_Parsed` from the raw `Content` column in a single chain:
# carriage returns and newlines become spaces, every run of four spaces is
# replaced by one space, and double quotes are removed.
df['Content_Parsed'] = (
    df['Content']
    .str.replace("\r", " ")
    .str.replace("\n", " ")
    .str.replace("    ", " ")
    .str.replace('"', '')
)

In [5]:
# Normalise case: every character of the parsed text is converted to lower case.
df['Content_Parsed'] = (
    df['Content_Parsed']
    .str.lower()
)

In [6]:
# Removing punctuation signs
punctuation_signs = list("?:!.,;")

# Strip all signs in one pass using a character class built from the escaped
# signs. `regex=True` is passed explicitly because the default of `regex`
# differs between pandas versions and '?' / '.' are regex metacharacters.
punctuation_pattern = '[{}]'.format(''.join(re.escape(sign) for sign in punctuation_signs))
df['Content_Parsed'] = df['Content_Parsed'].str.replace(punctuation_pattern, '', regex=True)

In [7]:
# Stemming: run the stemmer over the full text of each document.
# The bound method is passed directly; no wrapper function is needed.
df['Content_Parsed'] = df['Content_Parsed'].apply(stemmer.stem)

KeyboardInterrupt: 

In [10]:
# Removing stop words
# Clean the list first: strip surrounding whitespace (e.g. a trailing '\r') and
# drop blanks, since an empty alternative would match at every word boundary.
# Longest-first ordering keeps the alternation unambiguous for shared prefixes.
cleaned_stop_words = sorted({word.strip() for word in stop_words if word.strip()},
                            key=lambda word: (-len(word), word))

if cleaned_stop_words:
    # A single escaped alternation removes every stop word in ONE pass over the
    # column instead of one full-column pass per stop word. `regex=True` is
    # explicit: with `regex=False` the '\b...\b' pattern would be treated as a
    # literal string and nothing would be removed.
    stop_word_pattern = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in cleaned_stop_words))
    df['Content_Parsed'] = df['Content_Parsed'].str.replace(stop_word_pattern, '', regex=True)

# Replacing runs of two or more whitespace characters with a single space
df['Content_Parsed'] = df['Content_Parsed'].str.replace(r'\s\s+', ' ', regex=True)

KeyboardInterrupt: 

In [11]:
# Preview the first five rows of the processed frame.
df.head(n=5)

Unnamed: 0,File_Name,Category,Complete_Filename,Content,Category_Code,Content_Parsed
0,2017-07_302049_bekas-isteri-adi-putra-didakwa-...,berita kes,articles/berita-kes/2017-07_302049_bekas-ister...,SEREMBAN: Bekas isteri kedua pelakon Adi Putra...,0,seremban bekas isteri kedua pelakon adi putra ...
1,2017-07_303332_sebutan-kes-19-penuntut-upnm-14...,berita kes,articles/berita-kes/2017-07_303332_sebutan-kes...,KUALA LUMPUR: Mahkamah Majistret menetapkan 14...,0,kuala lumpur mahkamah majistret menetapkan 14 ...
2,2017-07_307248_mat-motor-terkepung-391-saman-d...,berita kes,articles/berita-kes/2017-07_307248_mat-motor-t...,KUALA LUMPUR: Sebanyak 391 saman dikeluarkan a...,0,kuala lumpur sebanyak 391 saman dikeluarkan at...
3,2017-08_310014_guna-vsp-bantu-cegah-jenayah-po...,berita kes,articles/berita-kes/2017-08_310014_guna-vsp-ba...,KUANTAN: Polis Diraja Malaysia (PDRM) mahu leb...,0,kuantan polis diraja malaysia (pdrm) mahu lebi...
4,2017-08_318783_lelaki-nigeria-tak-dapat-rogol-...,berita kes,articles/berita-kes/2017-08_318783_lelaki-nige...,SUBANG JAYA: Kehadiran kereta peronda polis (M...,0,subang jaya kehadiran kereta peronda polis (mp...


In [9]:
# Keep only the columns used from here on (this drops `News_length`) and
# rename the numeric label column `id` to `Category_Code`.
list_columns = [
    "File_Name",
    "Category",
    "Complete_Filename",
    "Content",
    "id",
    "Content_Parsed",
]
df = df[list_columns].rename(columns={'id': 'Category_Code'})

In [14]:
# Split the parsed text (features) and the category codes (labels) into
# training and test partitions: 15% is held out for testing, and the fixed
# random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Content_Parsed'],
    df['Category_Code'],
    test_size=0.15,
    random_state=8,
)

# Display the training texts together with their labels.
(X_train, y_train)

(7455    tahniah diucapkan kepada kerajaan kerana berja...
 7071    kuala lumpur “tidak semestinya kerap muncul di...
 6388    makkah lebih dua juta umat islam memulakan iba...
 5508    kuala lumpur bursa malaysia ditutup paras para...
 121     kuala lumpur mahkamah tinggi menetapkan pada r...
                               ...                        
 6995    kuala lumpur penyanyi datuk nassier wahab bers...
 2181    kuala lumpur sultan perak sultan nazrin muizzu...
 2033    kuala lumpur muara sungai yang cetek akibat lu...
 9556    sepang “saya tidak kisah digelar ‘mr 3 meters’...
 4547    kuala lumpur memiliki rumah impian pastinya me...
 Name: Content_Parsed, Length: 8137, dtype: object, 7455    30
 7071    20
 6388    15
 5508     9
 121      0
         ..
 6995    20
 2181     1
 2033     1
 9556    40
 4547     6
 Name: Category_Code, Length: 8137, dtype: int64)

In [16]:
# Parameter selection for the TF-IDF vectorizer built in the next cell.
ngram_range = (1, 2)   # use unigrams and bigrams
min_df = 10            # integer: minimum document count for a term to be kept
max_df = 1.0           # float: maximum document proportion (1.0 = no upper cut-off)
max_features = 300     # size cap of the vocabulary

In [18]:
# TF-IDF vectorizer configured from the parameters chosen above. The text was
# already lower-cased and had its stop words removed in earlier cells, hence
# lowercase=False and stop_words=None here.
tfidf = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,
    stop_words=None,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)

# Training set: the vocabulary is learned here (fit) and the texts are
# converted to a dense feature array.
labels_train = y_train
features_train = tfidf.fit_transform(X_train).toarray()
print(features_train.shape)

# Test set: only transformed with the vocabulary fitted on the training data.
print(len(X_test.index))
labels_test = y_test
features_test = tfidf.transform(X_test).toarray()
print(features_test.shape)

(8137, 300)
1436
(1436, 300)


In [None]:
# Per-category report of the terms most associated with each category
# according to a chi-squared test (one-vs-rest on the training features).
# `chi2` and `np` come from the import cell at the top of the notebook.

# The vocabulary does not depend on the category, so fetch it once.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for category_name, category_id in sorted(category_map.items()):
    # chi2 returns (statistics, p-values); only the statistics are used.
    chi2_scores = chi2(features_train, labels_train == category_id)[0]
    # Ascending sort: the most strongly associated terms end up last.
    ranked_names = all_feature_names[np.argsort(chi2_scores)]
    unigrams = [v for v in ranked_names if len(v.split(' ')) == 1]
    bigrams = [v for v in ranked_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(category_name))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

In [None]:
# Persist every artefact produced by this notebook. Each entry pairs the
# destination file with the object written to it; the order matches the order
# in which the objects were created above.
pickle_targets = [
    ('Pickles/X_train.pickle', X_train),
    ('Pickles/X_test.pickle', X_test),
    ('Pickles/y_train.pickle', y_train),
    ('Pickles/y_test.pickle', y_test),
    ('Pickles/df.pickle', df),
    ('Pickles/features_train.pickle', features_train),
    ('Pickles/labels_train.pickle', labels_train),
    ('Pickles/features_test.pickle', features_test),
    ('Pickles/labels_test.pickle', labels_test),
    ('Pickles/tfidf.pickle', tfidf),  # the fitted TF-IDF object
]

for pickle_path, pickle_object in pickle_targets:
    with open(pickle_path, 'wb') as output:
        pickle.dump(pickle_object, output)