In [66]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import csv
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

ps = PorterStemmer()
stop_words = set(stopwords.words('english')) 
sp = spacy.load('en_core_web_sm')

### Load and preprocess data

In [67]:
def preprocess_text(sentence):
    sentence = sentence.lower()
    word_tokens = word_tokenize(sentence)
    sentence = [w for w in word_tokens if not w in stop_words]
    sentence = [ps.stem(w) for w in word_tokens]
    return " ".join(sentence)

In [103]:
data = open('auto_scraper/results/dataset_no_marked_words.csv', 'r', encoding='utf8').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split('|')
    labels.append(content[0])
    sentence = " ".join(content[1:])
#     texts.append(preprocess_text(sentence))
    texts.append(sentence)

df = pandas.DataFrame()
df['text'] = texts
df['label'] = labels
df = df.astype({'label': 'int64'})
df.head()

Unnamed: 0,text,label
0,Here’s my rolling list of all the places that ...,0
1,"restaurant details: website , IG I finally mad...",1
2,"restaurant details: website , IG I am so glad ...",0
3,"restaurant details : website , IG It’s now bee...",0
4,Other visits: Nov 2018 • April 2018 • Nov 2017...,1


### Create new features

In [104]:
i = 0
txt = df['text'][i]
label = df['label'][i]
sen = sp(txt)
print(label, txt)
ents = [x for x in sen.ents if x.label_ in ['PERSON', 'ORG']]
for entity in ents:
    print(entity.text + ' - ' + entity.label_ + ' - ' + str(spacy.explain(entity.label_)))

0 Here’s my rolling list of all the places that are present in my mind so far. I will update it periodically as I discover new things. It is by no means exhaustive, these are places I have been in the past, and a few of…


In [105]:
df['entities'] = df['text'].apply(lambda x: [e.label_ for e in sp(x).ents])
df['person'] = df['entities'].apply(lambda x: int('PERSON' in x))
df['org'] = df['entities'].apply(lambda x: int('ORG' in x))
df = df.drop(columns=['entities'], axis=1)

In [98]:
df.head(n=3)

Unnamed: 0,text,label,person,org
0,Here’s my rolling list of all the places that ...,0,0,0
1,"restaurant details: website , IG I finally mad...",1,1,0
2,"restaurant details: website , IG I am so glad ...",0,1,1


In [120]:
TN_person = len(df[(df['label'] == df['person']) & (df['label'] == 0)])
TP_person = len(df[(df['label'] == df['person']) & (df['label'] == 1)])
FN_person = len(df[(df['label'] != df['person']) & (df['label'] == 1)])
FP_person = len(df[(df['label'] != df['person']) & (df['label'] == 0)])

In [None]:
TN_org = len(df[(df['label'] == df['org']) & (df['label'] == 0)])
TP_org= len(df[(df['label'] == df['org']) & (df['label'] == 1)])
FN_org = len(df[(df['label'] != df['org']) & (df['label'] == 1)])
FP_org = len(df[(df['label'] != df['org']) & (df['label'] == 0)])

In [None]:
TN_any = len(df[((df['label'] == df['org']) | (df['label'] == df['person'])) & (df['label'] == 0)])
TP_any = len(df[((df['label'] == df['org']) | (df['label'] == df['person'])) & (df['label'] == 1)])
FN_any = len(df[((df['label'] != df['org']) & (df['label'] != df['person'])) & (df['label'] == 1)])
FP_any = len(df[((df['label'] != df['org']) | (df['label'] != df['person'])) & (df['label'] == 0)])

In [None]:
print(f"PERSON = label: ACC={(TP_person + TN_person)/len(df):.2f} PREC={TP_person/(TP_person+FP_person):.2f} REC={TP_person/(TP_person+FP_person):.2f}")
print(f"ORG = label: ACC={(TP_org + TN_org)/len(df):.2f} PREC={TP_org/(TP_org+FP_org):.2f} REC={TP_org/(TP_org+FP_org):.2f}")
print(f"PERSON or ORG = label: ACC={(TP_any + TN_any)/len(df):.2f} PREC={TP_any/(TP_any + FP_any):.2f} REC={TP_any/(TP_any + FP_any):.2f}")

### Create model on the data

In [None]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

In [None]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [118]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [123]:
TN_any = len(df[((df['label'] == df['org']) | (df['label'] == df['person'])) & (df['label'] == 0)])
TP_any = len(df[((df['label'] == df['org']) | (df['label'] == df['person'])) & (df['label'] == 1)])
FN_any = len(df[((df['label'] != df['org']) & (df['label'] != df['person'])) & (df['label'] == 1)])
FP_any = len(df[((df['label'] != df['org']) | (df['label'] != df['person'])) & (df['label'] == 0)])

In [124]:
print(f"PERSON = label: ACC={(TP_person + TN_person)/len(df):.2f} PREC={TP_person/(TP_person+FP_person):.2f} REC={TP_person/(TP_person+FP_person):.2f}")
print(f"ORG = label: ACC={(TP_org + TN_org)/len(df):.2f} PREC={TP_org/(TP_org+FP_org):.2f} REC={TP_org/(TP_org+FP_org):.2f}")
print(f"PERSON or ORG = label: ACC={(TP_any + TN_any)/len(df):.2f} PREC={TP_any/(TP_any + FP_any):.2f} REC={TP_any/(TP_any + FP_any):.2f}")

PERSON = label: ACC=0.67 PREC=0.64 REC=0.64
ORG = label: ACC=0.56 PREC=0.57 REC=0.57
PERSON or ORG = label: ACC=0.86 PREC=0.59 REC=0.59


### Create model on the data

In [None]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [90]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

array(['0', '1', ''], dtype=object)

In [None]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)