In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

import sklearn
import sklearn_crfsuite
import scipy.stats
import math, string, re
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn_crfsuite import CRF
from nltk.tokenize import word_tokenize
from nltk.tag.util import untag

import ast
from ast import literal_eval

import jieba 
from hanziconv import HanziConv

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer
from operator import add

In [2]:
import nltk
from nltk.corpus import words, brown, treebank
nltk.download('treebank')

nltk_treebank = (treebank.tagged_sents())

def penn2ud(text):
    if text in ['#', '$','SYM']:
        return 'E_SYM'
    elif text in [r"''", ',', '-LRB-', '-RRB-', '.', ':', 'HYPH', '``']:
        return 'E_PUNCT'
    elif text in ['AFX', 'JJ', 'JJR', 'JJS']:
        return 'E_ADJ'
    elif text in ['RB', 'RBR', 'RBS']:
        return 'E_ADV'
    elif text == 'CC':
        return 'E_CCONJ'
    elif text in ['DT', 'PDT', 'PRP$', 'WDT', 'WP$']:
        return 'E_DET'
    elif text == 'CD':
        return 'E_NUM'
    elif text in ['EX','PRP', 'WP']:
        return 'E_PRON'
    elif text in ['FW', 'LS', 'NIL']:
        return 'E_X'
    elif text in ['IN', 'RP']:
        return 'E_ADP'
    elif text in ['MD','VB', 'VBD', 'VBG', 'VBN' ,'VBP' ,'VBZ']:
        return 'E_VERB'
    elif text in ['NN', 'NNS']:
        return 'E_NOUN'
    elif text in ['NNP', 'NNPS']:
        return 'E_PROPN'
    elif text in ['POS', 'TO']:
        return 'E_PART'
    elif text == 'UH':
        return 'E_INTJ'
    else:
        return 'E_X'

def replace_E_(text):
    return re.sub(r"E_", "", text)


[nltk_data] Downloading package treebank to /home/pc8/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
nltk_treebank

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

In [4]:
full_treebank = []
for sentence in nltk_treebank:
    sentence_list = [] 
    for word in sentence:
        word_list = list(word)
        tag = penn2ud(word_list[1])
        tag = replace_E_(tag)
        word_list[1] = tag
        word_tuple = tuple(word_list)
        sentence_list.append(word_tuple)
    full_treebank.append(sentence_list)


In [5]:
with open('../Corpus/English/treebank_Mapped.txt', 'w') as f:
    for line in full_treebank:
        f.write(str(line))
        f.write('\n')

In [6]:
print(full_treebank[:2])

[[('Pierre', 'PROPN'), ('Vinken', 'PROPN'), (',', 'PUNCT'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', 'PUNCT'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'PROPN'), ('29', 'NUM'), ('.', 'PUNCT')], [('Mr.', 'PROPN'), ('Vinken', 'PROPN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'PROPN'), ('N.V.', 'PROPN'), (',', 'PUNCT'), ('the', 'DET'), ('Dutch', 'PROPN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', 'PUNCT')]]


In [7]:
f = open("../Corpus/English/Brown_Mapped/part-00000", "r")

brown_word_list = []
for l in f:
    brown_word_list.append(l)
f.close()

In [8]:
df_Alldata = pd.read_csv('../Data/Sample_Data/Sample_All.csv')
df_training = pd.read_csv(r'manuallyTagging.csv')

In [9]:
duplicates = pd.merge(df_Alldata, df_training, how='inner',
                  left_on=['Sentence'], right_on=['f'])

# drop the indices from USERS
df_Alldata = df_Alldata.drop(duplicates.index)

In [10]:
df_Alldata = df_Alldata.drop(['Unnamed: 0', 'Key', 'token_words'], axis = 1)
len(df_Alldata)

178300

In [11]:
# df = pd.read_csv(r'manuallyTagging.csv')
# df = df.dropna()

df = pd.read_csv(r'testing posTAG.csv')
df = df.dropna()

def remove_newline(text):
    return re.sub(r'\n', ' ', str(text))

def find_astrophe(text):
    return bool(re.findall(r'\'', text))

def convert_string2_list(text):

    text = ast.literal_eval(str(text))
    return text

def unicode_problem(text):
    return re.sub(r'[\u0080]','',text).strip()

def unprinable(text):
     return re.sub(r'[\u200B]','',text).strip()
    
def replace_apostrophes(text):
    return re.sub('&#39;|’|´|‘', "'", (str(text)))

templist = [] 
for i in df["Pos Tag"]:
    i = remove_newline(i)
    i = unicode_problem(i)
    i = unprinable(i)
    i = replace_apostrophes(i)
    i = i.strip()
    i = convert_string2_list(i)
    templist.append(i)
    

In [22]:
templist[0:10]

[[('In', 'ADP'),
  ('a', 'DET'),
  ('webinar', 'NOUN'),
  ('last', 'ADJ'),
  ('week', 'NOUN'),
  (',', 'PUNCT'),
  ('Capital', 'PROPN'),
  ('Economics', 'PROPN'),
  ("\\'", 'PART'),
  ('global', 'ADJ'),
  ('economists', 'NOUN'),
  ('lumped', 'VERB'),
  ('together', 'ADV'),
  ('the', 'DET'),
  ('Philippines', 'PROPN'),
  (',', 'PUNCT'),
  ('Thailand', 'PROPN'),
  (',', 'PUNCT'),
  ('Mexico', 'PROPN'),
  (',', 'PUNCT'),
  ('and', 'CCONJ'),
  ('Southern', 'PROPN'),
  ('Europe', 'PROPN'),
  ('among', 'ADP'),
  ('the', 'DET'),
  ('economies', 'NOUN'),
  ('which', 'DET'),
  ('would', 'AUX'),
  ('most', 'ADV'),
  ('likely', 'ADJ'),
  ('experience', 'NOUN'),
  ('permanent', 'ADJ'),
  ('loss', 'NOUN'),
  ('of', 'ADP'),
  ('output', 'NOUN'),
  ('from', 'ADP'),
  ('their', 'PRON'),
  ('respective', 'ADJ'),
  ('domestic', 'ADJ'),
  ('tourism', 'NOUN'),
  ('sectors', 'NOUN'),
  ('due', 'ADP'),
  ('to', 'ADP'),
  ('the', 'DET'),
  ('pandemic', 'NOUN'),
  ('.', 'PUNCT')],
 [('I', 'PRON'),
  ('think',

In [12]:
nltk_data = templist + full_treebank
# nltk_data = templist
len(nltk_data)

6318

In [16]:
nltk_data[0:100]

[[('In', 'ADP'),
  ('a', 'DET'),
  ('webinar', 'NOUN'),
  ('last', 'ADJ'),
  ('week', 'NOUN'),
  (',', 'PUNCT'),
  ('Capital', 'PROPN'),
  ('Economics', 'PROPN'),
  ("\\'", 'PART'),
  ('global', 'ADJ'),
  ('economists', 'NOUN'),
  ('lumped', 'VERB'),
  ('together', 'ADV'),
  ('the', 'DET'),
  ('Philippines', 'PROPN'),
  (',', 'PUNCT'),
  ('Thailand', 'PROPN'),
  (',', 'PUNCT'),
  ('Mexico', 'PROPN'),
  (',', 'PUNCT'),
  ('and', 'CCONJ'),
  ('Southern', 'PROPN'),
  ('Europe', 'PROPN'),
  ('among', 'ADP'),
  ('the', 'DET'),
  ('economies', 'NOUN'),
  ('which', 'DET'),
  ('would', 'AUX'),
  ('most', 'ADV'),
  ('likely', 'ADJ'),
  ('experience', 'NOUN'),
  ('permanent', 'ADJ'),
  ('loss', 'NOUN'),
  ('of', 'ADP'),
  ('output', 'NOUN'),
  ('from', 'ADP'),
  ('their', 'PRON'),
  ('respective', 'ADJ'),
  ('domestic', 'ADJ'),
  ('tourism', 'NOUN'),
  ('sectors', 'NOUN'),
  ('due', 'ADP'),
  ('to', 'ADP'),
  ('the', 'DET'),
  ('pandemic', 'NOUN'),
  ('.', 'PUNCT')],
 [('I', 'PRON'),
  ('think',

In [21]:
nltk_data[0][1]

('a', 'DET')

In [20]:
seniors_training_data = nltk_data
std_df = pd.DataFrame({'tagged':seniors_training_data})
std_df.info()
# std_df.to_csv("seniors_data.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6318 entries, 0 to 6317
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagged  6318 non-null   object
dtypes: object(1)
memory usage: 49.5+ KB


In [13]:
def check_difference(hmm_result, crf_result):
    if len(hmm_result) == len(crf_result):

        arr_CRF = np.array(crf_result)
        arr_HMM = np.array(hmm_result)

        different_list = []
        counter = 0 
        different_tag = 0 
        equal_tag = 0
        arr_element = 0 

        if (np.array_equal(arr_HMM, arr_CRF)) == False and (len(arr_CRF) == len(arr_HMM)):
            arr_element = len(arr_CRF)

            for j in range(arr_element):

                if np.array_equal(arr_HMM[j], arr_CRF[j]) == False:
                    # using * operator to concat
                    temp =[*arr_HMM[j], *arr_CRF[j]]
                    # del func duplicate comment in the list
                    del temp[2]
                    counter += 1 
                    different_list.append(temp)
            different_tag = counter
            equal_tag = arr_element - different_tag
    else:
        different_list = []
        different_tag = 0 
        print("tag length is different")
            
    return different_list, different_tag

In [14]:
def features(sentence, index):
    # """ sentence: [w1, w2, ...], index: the index of the word """
    
    feature_set =  {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'prefix-4': sentence[index][:4],
        'prefix-5': sentence[index][:5],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'suffix-4': sentence[index][-4:],
        'suffix-5': sentence[index][-5:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'prev2_word': '' if index == 0 else sentence[index - 2],
        'next2_word': '' if index == len(sentence) - 2 or index == len(sentence) - 1  else sentence[index + 2],
        'prev3_word': '' if index == 0 else sentence[index - 3],
        'next3_word': '' if index == len(sentence) - 2 or index == len(sentence) - 1  or index == len(sentence) - 3  else sentence[index + 3],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
        'natural_number': (re.findall(r'^[0-9]+', sentence[index])),
        'initcaps' : (re.findall(r'^[A-Z]\w+', sentence[index])),
        'initcapsalpha': (re.findall(r'^[A-Z][a-z]\w+', sentence[index])),
        'word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', sentence[index].lower()),
        'word.ispunctuation': (sentence[index] in string.punctuation)
    }
    
    if index <= 0:
        feature_set['BOS'] = True
    
    if index > len(sentence)-1:
        feature_set['EOS'] = True
        
    return feature_set

def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y

def pos_tag(sentence, model):
    sentence = sentence_splitter(sentence)
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list(zip(sentence, model.predict([sentence_features])[0]))

def sentence_splitter(sentence):
    result = []
    sents = word_tokenize(sentence)
    for s in sents:
        if re.findall(r'[\u4e00-\u9fff]+', s):
            s = HanziConv.toSimplified(s)
            result = result + list(jieba.cut(s, cut_all=False))
        else:
            result.append(s)
    return result

# Split the dataset for training and testing
cutoff = int(.80 * len(nltk_data))
training_sentences = nltk_data[:cutoff]
test_sentences = nltk_data[cutoff:]

X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

# print(cutoff)
print(len(X_train))     
print(len(X_test))         
# print(X_test[0])

5054
1264


In [15]:
# arow 

In [16]:
# CRF_model_pa = sklearn_crfsuite.CRF(
#     algorithm = 'pa',
#     max_iterations = 100,
#     all_possible_transitions=True
#     # c1 = 0.25,
#     # c2 = 0.35
# )

# CRF_model_pa.fit(X_train, y_train)

# CRF_model_ap = sklearn_crfsuite.CRF(
#     algorithm = 'ap',
#     max_iterations = 100,
#     all_possible_transitions=True
#     # c1 = 0.25,
#     # c2 = 0.35
# )
# CRF_model_ap.fit(X_train, y_train)


# CRF_model_arow = sklearn_crfsuite.CRF(
#     algorithm = 'arow',
#     max_iterations = 100,
#     all_possible_transitions=True
#     # c1 = 0.25,
#     # c2 = 0.35
# )
# CRF_model_arow.fit(X_train, y_train)

# CRF_model_l2sgd = sklearn_crfsuite.CRF(
#     algorithm = 'l2sgd',
#     max_iterations = 100,
#     all_possible_transitions=True
#     # c1 = 0.25,
#     # c2 = 0.35
# )
# CRF_model_l2sgd.fit(X_train, y_train)


CRF_model_lbfgs = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    max_iterations = 100,
    all_possible_transitions=True,
    c1 = 0.25,
    c2 = 0.35
)
CRF_model_lbfgs.fit(X_train, y_train)




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.25, c2=0.35,
    keep_tempfiles=None, max_iterations=100)

In [17]:
labels = list(CRF_model_lbfgs.classes_)
# labels.remove('O')
print(labels)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(CRF_model_lbfgs, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

['ADP', 'DET', 'NOUN', 'ADJ', 'PUNCT', 'PROPN', 'PART', 'VERB', 'ADV', 'CCONJ', 'AUX', 'PRON', 'X', 'INTJ', 'SCONJ', 'NUM', 'SYM']
Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.5min
  _warn_prf(
  _warn_prf(
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  7.2min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True, c1=0.25,
                                 c2=0.35, keep_tempfiles=None,
                                 max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9ec66f7af0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9ec6543670>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['ADP', 'DET', 'NOUN', 'ADJ', 'PUNCT', 'PROPN', 'PART', 'VERB', 'ADV', 'CCONJ', 'AUX', 'PRON', 'X', 'INTJ', 'SCONJ', 'NUM', 'SYM']),
                   verbose=1)

In [18]:
from sklearn.model_selection import GridSearchCV
params_space = {
    "c1": [0,0.05,0.1, 0.25,0.5,1],
    "c2": [0,0.05,0.1, 0.25,0.5,1]
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
grid_search = GridSearchCV(estimator=CRF_model_lbfgs,
                           param_grid=params_space,
                           cv=3,
                           n_jobs=-1, verbose=1,scoring=f1_scorer)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min
  _warn_prf(
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  7.9min finished


GridSearchCV(cv=3,
             estimator=CRF(algorithm='lbfgs', all_possible_transitions=True,
                           c1=0.25, c2=0.35, keep_tempfiles=None,
                           max_iterations=100),
             n_jobs=-1,
             param_grid={'c1': [0, 0.05, 0.1, 0.25, 0.5, 1],
                         'c2': [0, 0.05, 0.1, 0.25, 0.5, 1]},
             scoring=make_scorer(flat_f1_score, average=weighted, labels=['ADP', 'DET', 'NOUN', 'ADJ', 'PUNCT', 'PROPN', 'PART', 'VERB', 'ADV', 'CCONJ', 'AUX', 'PRON', 'X', 'INTJ', 'SCONJ', 'NUM', 'SYM']),
             verbose=1)

In [19]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

In [21]:
y_pred = grid_search.predict(X_test)
print("CRF_model_lbfgs : ", round(metrics.flat_accuracy_score(y_test, y_pred)*100, 4))
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

CRF_model_lbfgs :  96.9008


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           X      0.997     1.000     0.998      2127
        PART      0.996     1.000     0.998       991
       CCONJ      0.999     0.996     0.997       683
       SCONJ      0.000     0.000     0.000         0
         ADJ      0.868     0.897     0.883      1988
         ADP      0.982     0.982     0.982      3153
         ADV      0.866     0.878     0.872       793
        VERB      0.967     0.962     0.965      4206
         DET      0.991     0.994     0.992      2837
        INTJ      0.000     0.000     0.000         0
        NOUN      0.955     0.949     0.952      6313
        PRON      0.994     1.000     0.997       479
       PROPN      0.972     0.961     0.966      2918
         NUM      0.999     0.994     0.997      1599
       PUNCT      1.000     1.000     1.000      3273
         AUX      0.000     0.000     0.000         0
         SYM      1.000     1.000     1.000       326

   micro avg      0.969   

In [None]:

from sklearn_crfsuite import metrics

y_pred_lbfgs = CRF_model_lbfgs.predict(X_test)
print("CRF_model_lbfgs : ", round(metrics.flat_accuracy_score(y_test, y_pred_lbfgs)*100, 4))
print("CRF_model_lbfgs    : \n", metrics.flat_classification_report(y_test, y_pred_lbfgs), "\n")

# y_pred_l2sgd = CRF_model_l2sgd.predict(X_test)
# print("CRF_model_l2sgd : ", round(metrics.flat_accuracy_score(y_test, y_pred_l2sgd)*100, 4))
# print("CRF_model_l2sgd    : \n", metrics.flat_classification_report(y_test, y_pred_l2sgd), "\n")

# y_pred_arow = CRF_model_arow.predict(X_test)
# print("CRF_model_arow  : ", round(metrics.flat_accuracy_score(y_test, y_pred_arow)*100, 4))
# print("CRF_model_arow    : \n", metrics.flat_classification_report(y_test, y_pred_arow), "\n")

# y_pred_pa = CRF_model_pa.predict(X_test)
# print("CRF_model_pa    : ", round(metrics.flat_accuracy_score(y_test, y_pred_pa)*100, 4))
# print("CRF_model_pa    : \n", metrics.flat_classification_report(y_test, y_pred_pa), "\n")

# y_pred_ap = CRF_model_ap.predict(X_test)
# print("CRF_model_ap    : ", round(metrics.flat_accuracy_score(y_test, y_pred_ap)*100, 4))
# print("CRF_model_ap    : \n", metrics.flat_classification_report(y_test, y_pred_ap), "\n")

# print(metrics.sequence_accuracy_score(y_test, y_pred_lbfgs)*100)
# https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/metrics.html
# CRF_model_lbfgs :  97.664
# CRF_model_l2sgd :  97.3046
# CRF_model_arow  :  99.7604
# CRF_model_pa    :  99.7904
# CRF_model_ap    :  99.8203

In [None]:
print("CRF_model_lbfgs : ", round(metrics.flat_accuracy_score(y_test, y_pred_lbfgs)*100, 4))
# print("CRF_model_l2sgd : ", round(metrics.flat_accuracy_score(y_test, y_pred_l2sgd)*100, 4))
# print("CRF_model_arow  : ", round(metrics.flat_accuracy_score(y_test, y_pred_arow)*100, 4))
# print("CRF_model_pa    : ", round(metrics.flat_accuracy_score(y_test, y_pred_pa)*100, 4))
# print("CRF_model_ap    : ", round(metrics.flat_accuracy_score(y_test, y_pred_ap)*100, 4))

In [None]:
sentence = "semalan im i makan jadi so happy dengan holiday"
# Arang sampai dgn keadaan cantikkk
# Malas nak amik gambar hehe

crf_result_lbfgs = pos_tag(sentence, CRF_model_lbfgs)

print("CRF lbfgs :", crf_result_lbfgs) 


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import re 
import preprocessor as p


def sentence_tokenize(text):
    
    combine = [] 
    token_list = []
    full_sent = ""
    sent = ""
    sent_token = sent_tokenize(text)
    
    for sentence in sent_token: 
        reg_sent_token = re.split('\.|\;{2,}', sentence)
        # print(reg_sent_token)
        
        for reg_sentence in reg_sent_token:
            if reg_sentence != '':
                reg_sentence = reg_sentence.strip()
                token_list.append(reg_sentence) 

#     sentence token 
    return token_list

def unicode_problem(text):
    return re.sub(r'[\u0080]','',text).strip()

## Remove special character (\r)
def remove_slashR(text):
    return re.sub('\r', ' ', text)

def remove_special_char(text):
    return re.sub('&quot;|"|“|”', '', text)

def remove_multiple_space(text):
    return re.sub('\s+', ' ', text).strip()

# Replace newline
def remove_newline(text):
    return re.sub(r'\n', ' ', text)

# Replace apostrophe's special characters to original apostrophe
def replace_apostrophes(text):
    return re.sub('&#39;|’|´|‘', "'", text)

def remove_multiple_comma(text):
    return re.sub(r'[,]{2,}',',', text)

def remove_multiple_dot(text):
    return re.sub(r'[.]{3,}','', text)

def preprocess_tweet_fn(text):
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION)
    return p.tokenize(text) 

## Convert traditional Chinese characters to Simplified Chinese characters
def convert_Tra_Simp_Chi(text):
    return HanziConv.toSimplified(text)

def text_precessing(text):
    textlist = []
    
    text = str(text)
    text = sentence_tokenize(text)
    for i in text:
        i = unicode_problem(i)
        i = remove_slashR(i)
        i = remove_special_char(i)
        i = remove_multiple_space(i)
        i = remove_newline(i)
        i = replace_apostrophes(i)
        i = remove_multiple_comma(i)
        i = remove_multiple_dot(i)
        i = preprocess_tweet_fn(i)
        i = convert_Tra_Simp_Chi(i)
        
        textlist.append(i)
    return textlist 
    

In [None]:
text = "tems received with good condition. Thanks.       "
text_precessing(text)

In [None]:
test_df = df_Alldata['Sentence'].sample(n=5000, random_state=40)

sentence_list = []

for i in test_df:
    sentence_token = text_precessing(i)
    for s in sentence_token:
        if s != '':
            sentence_list.append(s)

            
data = {'sentence':sentence_list
       }   
new_df = pd.DataFrame(data)
sentence_df = new_df['sentence']
sentence_df


In [None]:
# CRF_model_pa

from nltk.tokenize import word_tokenize

# test_df = df_Alldata['Sentence'].sample(n=5000, random_state=42)


# pred_tags_HMM_List = []
pred_tags_CRF_List_lbfgs = []
pred_tags_CRF_List_l2sgd = []
pred_tags_CRF_List_arow = []
pred_tags_CRF_List_pa = []
pred_tags_CRF_List_ap = []

different_tag_List = []
different_num_List = []


for i in sentence_df: 
    test_sent = i.lower()
    
    crf_result_pa = pos_tag(test_sent, CRF_model_lbfgs)

    pred_tags_CRF_List_lbfgs.append(crf_result_pa)


data = {'sentence':sentence_df,
        'CRF plbfgs':pred_tags_CRF_List_lbfgs
       }   
    

postag_df = pd.DataFrame(data)

postag_df = postag_df.drop_duplicates(subset=['sentence'])
postag_df.to_csv("CRF pos tag (lbfgs).csv", index = False)

In [None]:
import pandas as pd 

df = pd.read_csv("CRF pos tag (lbfgs).csv")
total = len(df)
divide = int(total/2)
print(total, divide)
df1 = df.iloc[:divide]
df2 = df.iloc[divide:]
df1.to_csv("CRF pos tag 1.csv", index = False)
# df2.to_csv("CRF pos tag 2.csv", index = False)

In [None]:
# from nltk.tokenize import word_tokenize

# test_df = df_Alldata['Sentence'].sample(n=5000, random_state=42)

# # pred_tags_HMM_List = []
# pred_tags_CRF_List_lbfgs = []
# pred_tags_CRF_List_l2sgd = []
# pred_tags_CRF_List_arow = []
# pred_tags_CRF_List_pa = []
# pred_tags_CRF_List_ap = []

# different_tag_List = []
# different_num_List = []

# for i in test_df: 
#     test_sent = i
    
#     # hmm_result = Viterbi(sentence_splitter(test_sent))
#     crf_result_lbfgs = pos_tag(test_sent, CRF_model_lbfgs)
#     crf_result_l2sgd = pos_tag(test_sent, CRF_model_l2sgd)
#     crf_result_arow = pos_tag(test_sent, CRF_model_arrow)
#     crf_result_pa = pos_tag(test_sent, CRF_model_pa)
#     crf_result_ap = pos_tag(test_sent, CRF_model_ap)
    
#     different_list, different_num = check_difference(crf_result_lbfgs, crf_result_l2sgd)
    
#     # pred_tags_HMM_List.append(hmm_result)
#     pred_tags_CRF_List_lbfgs.append(crf_result_lbfgs)
#     pred_tags_CRF_List_l2sgd.append(crf_result_l2sgd)
#     pred_tags_CRF_List_arow.append(crf_result_arow)
#     pred_tags_CRF_List_pa.append(crf_result_pa)
#     pred_tags_CRF_List_ap.append(crf_result_ap)
#     different_num_List.append(different_num)
#     different_tag_List.append(different_list)

    
    
# data = {'sentence':test_df,
#         'CRF lbfgs':pred_tags_CRF_List_lbfgs,
#         'CRF l2sgd':pred_tags_CRF_List_l2sgd,
#         'No. of difference lbfgs vs l2sgd' : different_num_List,
#         'Different Tags of lbfgs vs l2sgd': different_tag_List,
#         'CRF arow':pred_tags_CRF_List_arow,
#         'CRF pa':pred_tags_CRF_List_pa,
#         'CRF ap':pred_tags_CRF_List_ap}
#         # 'HMM':pred_tags_HMM_List,
#         # 'No. of difference':different_num_List,
#         # 'Different Tags': different_tag_List}

# new_df = pd.DataFrame(data)
# new_df

# new_df.to_csv("CRF parameters.csv", index = False)

In [None]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(CRF_model_lbfgs.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(CRF_model_lbfgs.transition_features_).most_common()[-20:])

In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(CRF_model_lbfgs.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(CRF_model_lbfgs.state_features_).most_common()[-30:])

In [None]:
import eli5

In [None]:
eli5.show_weights(CRF_model_lbfgs, top=30)