# Creating SVM model from CrisiLex data - inf_source classification

In [1]:
import spacy
import pandas as pd
import numpy as np
from langdetect import detect_langs
from langdetect import detect
import sys
from spacy.matcher import Matcher
# scikit-learn version 0.22.2.post1
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from timeit import default_timer as timer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import pickle
import os
import os.path
from spacy_langdetect import LanguageDetector
import re
from gensim.models import KeyedVectors
import numpy as np

In [2]:
#laod sm vectors
nlp = spacy.load("en_core_web_sm")

In [3]:
# read all CrisiLex data from subdirectories
# data from https://github.com/sajao/CrisisLex/tree/master/data/CrisisLexT26
df_all = pd.DataFrame(columns = ['tweet_id', 'tweet_text', 'inf_source', 'inf_type', 'informativeness'])
for dirpath, dirnames, filenames in os.walk("CrisisLexT26"):
    for filename in [f for f in filenames if f.endswith("labeled.csv")]:
        print (os.path.join(dirpath, filename))
        if df_all.empty:
            print('all')
            df_all = pd.read_csv(os.path.join(dirpath, filename), names = ['tweet_id', 'tweet_text', 'inf_source', 'inf_type', 'informativeness'])
        else:
            print('new')
            df_new = pd.read_csv(os.path.join(dirpath, filename), names = ['tweet_id', 'tweet_text', 'inf_source', 'inf_type', 'informativeness'])
            df_all = pd.concat([df_all, df_new])

CrisisLexT26/2012_Venezuela_refinery/2012_Venezuela_refinery-tweets_labeled.csv
all
CrisisLexT26/2013_Australia_bushfire/2013_Australia_bushfire-tweets_labeled.csv
new
CrisisLexT26/2013_NY_train_crash/2013_NY_train_crash-tweets_labeled.csv
new
CrisisLexT26/2013_Brazil_nightclub_fire/2013_Brazil_nightclub_fire-tweets_labeled.csv
new
CrisisLexT26/2013_Alberta_floods/2013_Alberta_floods-tweets_labeled.csv
new
CrisisLexT26/2012_Philipinnes_floods/2012_Philipinnes_floods-tweets_labeled.csv
new
CrisisLexT26/2013_Sardinia_floods/2013_Sardinia_floods-tweets_labeled.csv
new
CrisisLexT26/2012_Colorado_wildfires/2012_Colorado_wildfires-tweets_labeled.csv
new
CrisisLexT26/2013_Singapore_haze/2013_Singapore_haze-tweets_labeled.csv
new
CrisisLexT26/2013_Colorado_floods/2013_Colorado_floods-tweets_labeled.csv
new
CrisisLexT26/2013_LA_airport_shootings/2013_LA_airport_shootings-tweets_labeled.csv
new
CrisisLexT26/2013_Lac_Megantic_train_crash/2013_Lac_Megantic_train_crash-tweets_labeled.csv
new
Crisis

In [4]:
# detect language from text
def tweet_parser_lang(tweet):
    doc = nlp(tweet)

    #https://issue.life/questions/43388476
    # hasthags in single token (normally the #sigh is standalone token)
    matcher = Matcher(nlp.vocab)
    matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
    matches = matcher(doc)
    hashtags = []

    for match_id, start, end in matches:
        hashtags.append(doc[start:end])
    for span in hashtags:
        span.merge()

    new_list=[]
    for token in doc:
        if token.text == 'RT': continue
        #remove hashtags
        if token.text.startswith('#'): 
            continue
        if token.text.startswith('@'):
            continue
        if token.pos_ == 'NUM': 
            continue  
        if token.like_url: 
            continue
        if token.is_space: continue

        new_list.append(token.text.lower())
    return (' '.join(new_list))   

In [5]:
# parse all tweets and detect language

tweets = df_all['tweet_text'].values.tolist()
if 'language_detector' not in nlp.pipe_names: 
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        
i=0
list_lang_spacy=[]
counter = 0
logging = True
start = timer()
parsed_tweets=[]
all_entities=[]

for tweet in tweets:
    # parse tweet for language detection
    lang_parsed=tweet_parser_lang(tweet)
    doc=nlp(lang_parsed)
    list_lang_spacy.append(doc._.language["language"])
    

    if counter % 100 == 0 and logging:
        end = timer()
        print("Processed %d tweets (%f seconds per tweet)" % (counter, (end-start)/100))
        start = timer()
    counter += 1  
    #if counter > 300: break

df_all['language_spacy']=list_lang_spacy 

#just en tweets
en_tweets = df_all[(df_all.language_spacy == 'en')]

print(en_tweets.count())

Processed 0 tweets (0.003551 seconds per tweet)
Processed 100 tweets (0.029720 seconds per tweet)
Processed 200 tweets (0.028398 seconds per tweet)
Processed 300 tweets (0.027279 seconds per tweet)
Processed 400 tweets (0.028633 seconds per tweet)
Processed 500 tweets (0.027825 seconds per tweet)
Processed 600 tweets (0.026489 seconds per tweet)
Processed 700 tweets (0.027673 seconds per tweet)
Processed 800 tweets (0.027263 seconds per tweet)
Processed 900 tweets (0.029477 seconds per tweet)
Processed 1000 tweets (0.029726 seconds per tweet)
Processed 1100 tweets (0.029560 seconds per tweet)
Processed 1200 tweets (0.034884 seconds per tweet)
Processed 1300 tweets (0.031951 seconds per tweet)
Processed 1400 tweets (0.029006 seconds per tweet)
Processed 1500 tweets (0.031652 seconds per tweet)
Processed 1600 tweets (0.027667 seconds per tweet)
Processed 1700 tweets (0.030799 seconds per tweet)
Processed 1800 tweets (0.029175 seconds per tweet)
Processed 1900 tweets (0.037395 seconds per

Processed 16000 tweets (0.027953 seconds per tweet)
Processed 16100 tweets (0.019580 seconds per tweet)
Processed 16200 tweets (0.024283 seconds per tweet)
Processed 16300 tweets (0.021690 seconds per tweet)
Processed 16400 tweets (0.022808 seconds per tweet)
Processed 16500 tweets (0.022219 seconds per tweet)
Processed 16600 tweets (0.024693 seconds per tweet)
Processed 16700 tweets (0.022787 seconds per tweet)
Processed 16800 tweets (0.022723 seconds per tweet)
Processed 16900 tweets (0.026228 seconds per tweet)
Processed 17000 tweets (0.024572 seconds per tweet)
Processed 17100 tweets (0.029165 seconds per tweet)
Processed 17200 tweets (0.027745 seconds per tweet)
Processed 17300 tweets (0.027590 seconds per tweet)
Processed 17400 tweets (0.025231 seconds per tweet)
Processed 17500 tweets (0.027430 seconds per tweet)
Processed 17600 tweets (0.027790 seconds per tweet)
Processed 17700 tweets (0.028597 seconds per tweet)
Processed 17800 tweets (0.027211 seconds per tweet)
Processed 17

In [7]:
# save 
df_all.to_pickle("crisislex_df_all_lang.pkl")
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27959 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_id         27959 non-null  object
 1   tweet_text       27959 non-null  object
 2   inf_source       27959 non-null  object
 3   inf_type         27959 non-null  object
 4   informativeness  27959 non-null  object
 5   language_spacy   27959 non-null  object
dtypes: object(6)
memory usage: 1.5+ MB


In [3]:
#  load pickle
df_all = pd.read_pickle("crisislex_df_all_lang.pkl")
en_tweets = df_all[(df_all.language_spacy == 'en')]
df_all.info()

#just en tweets
en_tweets = df_all[(df_all.language_spacy == 'en')]
print(en_tweets.count())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27959 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_id         27959 non-null  object
 1   tweet_text       27959 non-null  object
 2   inf_source       27959 non-null  object
 3   inf_type         27959 non-null  object
 4   informativeness  27959 non-null  object
 5   language_spacy   27959 non-null  object
dtypes: object(6)
memory usage: 1.5+ MB
tweet_id           19335
tweet_text         19335
inf_source         19335
inf_type           19335
informativeness    19335
language_spacy     19335
dtype: int64


In [4]:
# clean tweet text
def tweet_parser_lemma(tweet_to_parse):
        """ parse tweet text for classification """
        tweet = tweet_to_parse

        tweet = re.sub(r"#\w+", "", tweet)
        tweet = re.sub(r"^RT\s", "", tweet)
        tweet = re.sub(r"@\w+", "", tweet)
        doc = nlp(tweet)
        new_list = []
        for token in doc:
            if token.pos_ == 'NUM': continue
            if token.like_url: continue
            if token.is_space: continue
            if token.is_punct: continue
            if token.is_stop: continue
            #small tokens
            if len(token.text) < 3: continue
            new_list.append(token.lemma_.lower())
        return ' '.join(new_list)

In [5]:
# load EN word2vec model
model = KeyedVectors.load_word2vec_format('/home/dzon/kajo/new2/semexp-data/data/w2v-jiri-slim-extended.bin', binary=True)
model.init_sims(replace=True)

In [6]:
# get document vector - average of all tokens' w2v vectors
def get_doc_vector(text):
    #doc=nlp(text)
    doc=text.lower().split(" ")
    vector_final = np.zeros(model.vector_size, dtype=np.float32)
    tokens_count = 0
    for token in doc:
        #token = t.text.lower()
        if token in model:
            vector_final = np.add(model[token], vector_final)
            tokens_count += 1
        else:
            pass
            #print(token, "not found in model")
    if tokens_count:
        return vector_final / tokens_count
    else:
        return vector_final

In [7]:
# remove not labeled
df_en_inf_source = en_tweets[en_tweets['inf_source'] != 'Not labeled']
df_en_inf_source.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17404 entries, 21 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_id         17404 non-null  object
 1   tweet_text       17404 non-null  object
 2   inf_source       17404 non-null  object
 3   inf_type         17404 non-null  object
 4   informativeness  17404 non-null  object
 5   language_spacy   17404 non-null  object
dtypes: object(6)
memory usage: 951.8+ KB


In [8]:
# clean texts
start = timer()
df_en_inf_source['parsed_text_lemma'] = df_en_inf_source.tweet_text.apply(tweet_parser_lemma)
end = timer()
print(end-start)

# 102.41539139300585

107.08378340099989


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
# remove duplicates
df_en_inf_source.drop_duplicates(subset ="parsed_text_lemma", 
                     keep = False, inplace = True) 
df_en_inf_source.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13531 entries, 21 to 999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweet_id           13531 non-null  object
 1   tweet_text         13531 non-null  object
 2   inf_source         13531 non-null  object
 3   inf_type           13531 non-null  object
 4   informativeness    13531 non-null  object
 5   language_spacy     13531 non-null  object
 6   parsed_text_lemma  13531 non-null  object
dtypes: object(7)
memory usage: 845.7+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# get doc vectors for all tweets
start = timer()
df_en_inf_source['doc_vector_gensim_lemma'] = df_en_inf_source.parsed_text_lemma.apply(get_doc_vector)
end = timer()
print(end-start)

# 0.27816161201917566

0.28853491299923917


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# clean zero vectors
def is_np_zero(doc_vector):
    if not np.any(doc_vector):
        return True
    else:
        return False

df_en_inf_source['doc_vector_gensim_zero'] = df_en_inf_source.doc_vector_gensim_lemma.apply(is_np_zero)

df_en_inf_source2 = df_en_inf_source[df_en_inf_source['doc_vector_gensim_zero'] == False]
df_en_inf_source2.info()
df_en_inf_source = df_en_inf_source2

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13523 entries, 21 to 999
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   tweet_id                 13523 non-null  object
 1   tweet_text               13523 non-null  object
 2   inf_source               13523 non-null  object
 3   inf_type                 13523 non-null  object
 4   informativeness          13523 non-null  object
 5   language_spacy           13523 non-null  object
 6   parsed_text_lemma        13523 non-null  object
 7   doc_vector_gensim_lemma  13523 non-null  object
 8   doc_vector_gensim_zero   13523 non-null  bool  
dtypes: bool(1), object(8)
memory usage: 964.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
# find C parameter for SVM
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

X_train, X_val, y_train, y_val = train_test_split(df_en_inf_source['doc_vector_gensim_lemma'].tolist(), df_en_inf_source['inf_source'].tolist(), test_size=0.2, random_state=1)

parameters = {'C':[1.8, 2, 2.2, 2.4, 2.6]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, n_jobs=12)
clf.fit(X_train, y_train)

sorted(clf.cv_results_.keys())
clf.cv_results_

# best C is 2.2 with 0.8 train 0.62358964


{'mean_fit_time': array([154.97156501, 158.12615428, 158.07256265, 161.2266016 ,
        137.28413434]),
 'std_fit_time': array([ 1.58733124,  0.97356602,  2.16818184,  0.60226565, 47.60175943]),
 'mean_score_time': array([26.12726417, 26.189044  , 26.13410296, 25.00040588, 20.17209387]),
 'std_score_time': array([0.74810669, 0.76605027, 0.45104795, 0.57603632, 7.15916565]),
 'param_C': masked_array(data=[1.8, 2, 2.2, 2.4, 2.6],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1.8}, {'C': 2}, {'C': 2.2}, {'C': 2.4}, {'C': 2.6}],
 'split0_test_score': array([0.62338262, 0.62384473, 0.62430684, 0.62615527, 0.62707948]),
 'split1_test_score': array([0.62292052, 0.62707948, 0.62892791, 0.62892791, 0.6284658 ]),
 'split2_test_score': array([0.62338262, 0.62292052, 0.62476895, 0.62107209, 0.62014787]),
 'split3_test_score': array([0.61534905, 0.61488673, 0.61534905, 0.61488673, 0.61673601]),
 'split4_test_score': ar

In [29]:
#5-fold cross validation
start = timer()
clf = svm.SVC(probability = True, C=2.2)
#clf = OneVsRestClassifier(svm.SVC(probability = True, C=2.2), n_jobs=6)

score = cross_val_score(clf, df_en_inf_source['doc_vector_gensim_lemma'].tolist(), df_en_inf_source['inf_source'].tolist(), cv=5, n_jobs=12)
score_mean = score.mean()
print('cross validation accuracy: {} {}'.format(score_mean, score))
end = timer()
print('time of execution: {}'.format(end-start))

# svm.SVC()
#cross validation accuracy: 0.6030703413964448 [0.62655123 0.63182212 0.61184971 0.58571842 0.55941023]
#time of execution: 389.2026017999999

# onevsRest
#cross validation accuracy: 0.6052061666308802 [0.63290043 0.63904129 0.60578035 0.58514021 0.56316855]
#time of execution: 803.8628881

# OneVsRestClassifier(svm.SVC) with gensim vectors n_jobs=6
# cross validation accuracy: 0.5805079351688035 [0.60691824 0.61835334 0.59348199 0.56591364 0.51787246]
# time of execution: 1878.5939883560059

#sVC C 1.7, rbf lemma, stop_words left
#cross validation accuracy: 0.5058419459039138 [0.58810653 0.42028457 0.46041591 0.49653411 0.56386861]
#time of execution: 507.3958388370229

# lemma stop_words left OneVsRestClassifier(svm.SVC(probability = True, C=1.7), n_jobs=6)
#cross validation accuracy: 0.4950426212395177 [0.58591755 0.41627143 0.43232397 0.48157607 0.55912409]
#time of execution: 1305.4485383289866

# lemma gensim stop words removed clf = svm.SVC(probability = True, C=1.7)
#cross validation accuracy: 0.47866421158169636 [0.56619822 0.38054734 0.42455621 0.4818787  0.54014058]

# lemma, gensim, clf = OneVsRestClassifier(svm.SVC(probability = True, C=2.2), n_jobs=6)
# cross validation accuracy: 0.46395525489724265 [0.55859519 0.37079482 0.39556377 0.45710059 0.53772189]
#time of execution: 1371.3395205349952

# lemma, gensim clf = svm.SVC(probability = True, C=2.2)
#cross validation accuracy: 0.46964961882991174 [0.56192237 0.36598891 0.40924214 0.47300296 0.53809172]
#time of execution: 577.4775641279994

cross validation accuracy: 0.46964961882991174 [0.56192237 0.36598891 0.40924214 0.47300296 0.53809172]
time of execution: 577.4775641279994


In [30]:
# train the classificator

#X_train, X_val, y_train, y_val = train_test_split(df_en_inf_source['doc_vector_gensim'].tolist(), df_en_inf_source['inf_source'].tolist(), test_size=0.2, random_state=1)

X_train = df_en_inf_source['doc_vector_gensim_lemma'].tolist()
y_train = df_en_inf_source['inf_source'].tolist()

start = timer()
#clf = OneVsRestClassifier(svm.SVC(probability = True, C=2.2), n_jobs=12).fit(X_train, y_train)
clf = svm.SVC(probability = True, C=2.2).fit(X_train, y_train)
end = timer()
print(end-start)

# n_jobs = 12 OneVsRest
# 454.67108336198726

# svm.SVC(probability = True, C=1.7) full nodup - n_jobs not available
# 381.09414758501225

# svm.SVC(probability = True, C=2.2).fit(X_train, y_train)
# 398.8352744010044

398.8352744010044


In [12]:
# train the classificator

#X_train, X_val, y_train, y_val = train_test_split(df_en_inf_source['doc_vector_gensim'].tolist(), df_en_inf_source['inf_source'].tolist(), test_size=0.2, random_state=1)

X_train = df_en_inf_source['doc_vector_gensim_lemma'].tolist()
y_train = df_en_inf_source['inf_source'].tolist()

start = timer()
clf_ovr = OneVsRestClassifier(svm.SVC(probability = True, C=2.2), n_jobs=12).fit(X_train, y_train)

end = timer()
print(end-start)

# OneVsRestClassifier(svm.SVC(probability = True, C=2.2), n_jobs=12)
# 479.3168061140168


481.11828097000034


In [13]:
# predict
#df_en_inf_source['inf_source_predict_svc_gensim_lemma'] = clf.predict(df_en_inf_source['doc_vector_gensim_lemma'].tolist())
df_en_inf_source['inf_source_predict_svc_ovr_gensim_lemma'] = clf_ovr.predict(df_en_inf_source['doc_vector_gensim_lemma'].tolist())

In [14]:
# examine accuracy
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score

#print("SVC:")
#y_true = df_en_inf_source['inf_source']
#y_pred = df_en_inf_source['inf_source_predict_svc_gensim_lemma']
#print("accuracy: ", accuracy_score(y_true, y_pred))
#print("balanced_accuracy_score: ", balanced_accuracy_score(y_true, y_pred))
#print("f1 weighted: ",f1_score(y_true, y_pred, average='weighted'))
#print("precision_score: ",precision_score(y_true, y_pred, average='weighted'))
#print("recall_score: ",recall_score(y_true, y_pred, average='weighted'))


print("SVC OVR:")
y_true = df_en_inf_source['inf_source']
y_pred = df_en_inf_source['inf_source_predict_svc_ovr_gensim_lemma']
print("accuracy: ", accuracy_score(y_true, y_pred))
print("balanced_accuracy_score: ", balanced_accuracy_score(y_true, y_pred))
print("f1 weighted: ",f1_score(y_true, y_pred, average='weighted'))
print("precision_score: ",precision_score(y_true, y_pred, average='weighted'))
print("recall_score: ",recall_score(y_true, y_pred, average='weighted'))


# nodup gensim full
# accuracy:  0.7200029212006135
# balanced_accuracy_score:  0.4579787673009016
# f1 weighted:  0.7000478277689586
# precision_score:  0.7424681261869172
# recall_score:  0.7200029212006135

# cvm.SVC C:1.7
#accuracy:  0.7291316731176514
#balanced_accuracy_score:  0.43019976596449566
#f1 weighted:  0.7046239632759524
#precision_score:  0.7467371055740926
#recall_score:  0.7291316731176514

# svc C 1.7 lemma
#accuracy:  0.7414086172492794
#balanced_accuracy_score:  0.46293556557232174
#f1 weighted:  0.7207575861828118
#precision_score:  0.7617823631216263
#recall_score:  0.7414086172492794

# lemma with left stop words
#accuracy:  0.7259924109748979
#balanced_accuracy_score:  0.4257827643220438
#f1 weighted:  0.7008060844142024
#precision_score:  0.743965975009386
#recall_score:  0.7259924109748979

# C=2.2 SVC lemma, without stopwords, without duplicates, without zero vectors
#accuracy:  0.7686164312652518
#balanced_accuracy_score:  0.5224920473992108
#f1 weighted:  0.7540863378990941
#precision_score:  0.7851196477953029
#recall_score:  0.7686164312652518

# C=2.2 SVC OVR lemma, without stopwords, without duplicates, without zero vectors
#accuracy:  0.8089920875545367
#balanced_accuracy_score:  0.6620251424688365
#f1 weighted:  0.8039184872477424
#precision_score:  0.8230064098474439
#recall_score:  0.8089920875545367

SVC OVR:
accuracy:  0.8077349700510241
balanced_accuracy_score:  0.6494065418477704
f1 weighted:  0.8021917369598703
precision_score:  0.8214334854250069
recall_score:  0.8077349700510241


In [35]:
# save csv to investigate results
df_en_inf_source[['tweet_text','parsed_text_lemma','inf_source','inf_source_predict_svc_gensim_lemma', 'inf_source_predict_svc_ovr_gensim_lemma']].to_csv('CrisisLex_all_parsed_en_inf_source_predicted_gensim_SVC_C2.2.csv', sep="|")

In [15]:
# accuracy for all classses
correct_sum=0
uncorrect_sum=0
for unique in df_en_inf_source.inf_source.unique():
    #correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_onevsrest_gensim==unique)])
    #uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source!=unique) & (df_en_inf_source.inf_source_predict_onevsrest_gensim==unique)])
    #correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma==unique)])
    #uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source!=unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma==unique)])
    #correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma_stop==unique)])
    #uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source!=unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma_stop==unique)])
    correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_ovr_gensim_lemma==unique)])
    uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source!=unique) & (df_en_inf_source.inf_source_predict_svc_ovr_gensim_lemma==unique)])
    
    correct_sum+=correct
    uncorrect_sum+=uncorrect
    print("{} accuracy: {} (correct: {} uncorrect: {})".format(unique,correct/(correct+uncorrect),correct,uncorrect))
print("total accuracy: {} (correct: {}, uncorrect: {} total: {} {}".format(correct_sum/(correct_sum+uncorrect_sum), correct_sum, uncorrect_sum, (correct_sum+uncorrect_sum), len(df_en_inf_source.inf_source)))      

# svc C:1.7 full nodup nolemma
#Outsiders accuracy: 0.735352730171383 (correct: 3690 uncorrect: 1328)
#Media accuracy: 0.7155220547180347 (correct: 5126 uncorrect: 2038)
#Not applicable accuracy: 1.0 (correct: 15 uncorrect: 0)
#Government accuracy: 0.9 (correct: 99 uncorrect: 11)
#Eyewitness accuracy: 0.7936333699231614 (correct: 723 uncorrect: 188)
#Business accuracy: 1.0 (correct: 29 uncorrect: 0)
#NGOs accuracy: 0.6771300448430493 (correct: 302 uncorrect: 144)
#total accuracy: 0.7291316731176514 (correct: 9984, uncorrect: 3709 total: 13693 13693

# lemma
#Outsiders accuracy: 0.7763043478260869 (correct: 3571 uncorrect: 1029)
#Media accuracy: 0.706953642384106 (correct: 5124 uncorrect: 2124)
#Not applicable accuracy: 1.0 (correct: 20 uncorrect: 0)
#Government accuracy: 0.9236641221374046 (correct: 121 uncorrect: 10)
#Eyewitness accuracy: 0.8111332007952287 (correct: 816 uncorrect: 190)
#Business accuracy: 1.0 (correct: 33 uncorrect: 0)
#NGOs accuracy: 0.7038539553752535 (correct: 347 uncorrect: 146)
#total accuracy: 0.7414086172492794 (correct: 10032, uncorrect: 3499 total: 13531 13531

#lemma with left stop words
#Outsiders accuracy: 0.7390606182256122 (correct: 3682 uncorrect: 1300)
#Media accuracy: 0.7085643015521065 (correct: 5113 uncorrect: 2103)
#Not applicable accuracy: 1.0 (correct: 18 uncorrect: 0)
#Government accuracy: 0.8952380952380953 (correct: 94 uncorrect: 11)
#Eyewitness accuracy: 0.7822318526543879 (correct: 722 uncorrect: 201)
#Business accuracy: 1.0 (correct: 20 uncorrect: 0)
#NGOs accuracy: 0.6818181818181818 (correct: 300 uncorrect: 140)
#total accuracy: 0.7259924109748979 (correct: 9949, uncorrect: 3755 total: 13704 13704

# C=2.2 SVC OVR lemma, without stopwords, without duplicates, without zero vectors without init_sims
#Outsiders accuracy: 0.8353099127321548 (correct: 3733 uncorrect: 736)
#Media accuracy: 0.7598373747640482 (correct: 5233 uncorrect: 1654)
#Not applicable accuracy: 1.0 (correct: 66 uncorrect: 0)
#Government accuracy: 0.963076923076923 (correct: 313 uncorrect: 12)
#Eyewitness accuracy: 0.9031365313653137 (correct: 979 uncorrect: 105)
#Business accuracy: 0.9602649006622517 (correct: 145 uncorrect: 6)
#NGOs accuracy: 0.8706099815157117 (correct: 471 uncorrect: 70)
#total accuracy: 0.8089920875545367 (correct: 10940, uncorrect: 2583 total: 13523 13523

Outsiders accuracy: 0.830343300110742 (correct: 3749 uncorrect: 766)
Media accuracy: 0.761189677795597 (correct: 5221 uncorrect: 1638)
Not applicable accuracy: 0.9833333333333333 (correct: 59 uncorrect: 1)
Government accuracy: 0.9626168224299065 (correct: 309 uncorrect: 12)
Eyewitness accuracy: 0.8977900552486188 (correct: 975 uncorrect: 111)
Business accuracy: 0.9642857142857143 (correct: 135 uncorrect: 5)
NGOs accuracy: 0.8763837638376384 (correct: 475 uncorrect: 67)
total accuracy: 0.8077349700510241 (correct: 10923, uncorrect: 2600 total: 13523 13523


In [16]:
# accuracy for all classses - impostors
for unique in df_en_inf_source.inf_source.unique():
    #correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_onevsrest_gensim==unique)])
    #uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_onevsrest_gensim!=unique)])
    #correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma==unique)])
    #uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma!=unique)])
    #correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma_stop==unique)])
    #uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_gensim_lemma_stop!=unique)])
    correct = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_ovr_gensim_lemma==unique)])
    uncorrect = len(df_en_inf_source[(df_en_inf_source.inf_source==unique) & (df_en_inf_source.inf_source_predict_svc_ovr_gensim_lemma!=unique)])
    
    print("{} accuracy: {} (correct: {} uncorrect: {})".format(unique,correct/(correct+uncorrect),correct,uncorrect))
#print("total: {}".format(len(df_en_inf_source.inf_source)))    

# svc C:1.7 full nodup nolemma
#Outsiders accuracy: 0.7646083713220058 (correct: 3690 uncorrect: 1136)
#Media accuracy: 0.8944337811900192 (correct: 5126 uncorrect: 605)
#Not applicable accuracy: 0.10135135135135136 (correct: 15 uncorrect: 133)
#Government accuracy: 0.13636363636363635 (correct: 99 uncorrect: 627)
#Eyewitness accuracy: 0.5300586510263929 (correct: 723 uncorrect: 641)
#Business accuracy: 0.11196911196911197 (correct: 29 uncorrect: 230)
#NGOs accuracy: 0.4726134585289515 (correct: 302 uncorrect: 337)

# lemma without stop words
#Outsiders accuracy: 0.751473063973064 (correct: 3571 uncorrect: 1181)
#Media accuracy: 0.9030666196686641 (correct: 5124 uncorrect: 550)
#Not applicable accuracy: 0.1360544217687075 (correct: 20 uncorrect: 127)
#Government accuracy: 0.1678224687933426 (correct: 121 uncorrect: 600)
#Eyewitness accuracy: 0.6048925129725723 (correct: 816 uncorrect: 533)
#Business accuracy: 0.12992125984251968 (correct: 33 uncorrect: 221)
#NGOs accuracy: 0.5473186119873817 (correct: 347 uncorrect: 287)

# lemma with stop words
#Outsiders accuracy: 0.7635835752799668 (correct: 3682 uncorrect: 1140)
#Media accuracy: 0.8903012362876546 (correct: 5113 uncorrect: 630)
#Not applicable accuracy: 0.12162162162162163 (correct: 18 uncorrect: 130)
#Government accuracy: 0.1289437585733882 (correct: 94 uncorrect: 635)
#Eyewitness accuracy: 0.5293255131964809 (correct: 722 uncorrect: 642)
#Business accuracy: 0.07722007722007722 (correct: 20 uncorrect: 239)
#NGOs accuracy: 0.4694835680751174 (correct: 300 uncorrect: 339)

# C=2.2 SVC OVR lemma, without stopwords, without duplicates, without zero vectors without init_sims
#Outsiders accuracy: 0.7853986955606985 (correct: 3733 uncorrect: 1020)
#Media accuracy: 0.9235792446170138 (correct: 5233 uncorrect: 433)
#Not applicable accuracy: 0.4489795918367347 (correct: 66 uncorrect: 81)
#Government accuracy: 0.438375350140056 (correct: 313 uncorrect: 401)
#Eyewitness accuracy: 0.7230428360413589 (correct: 979 uncorrect: 375)
#Business accuracy: 0.5753968253968254 (correct: 145 uncorrect: 107)
#NGOs accuracy: 0.7394034536891679 (correct: 471 uncorrect: 166)

Outsiders accuracy: 0.7887649905322954 (correct: 3749 uncorrect: 1004)
Media accuracy: 0.9214613483939287 (correct: 5221 uncorrect: 445)
Not applicable accuracy: 0.4013605442176871 (correct: 59 uncorrect: 88)
Government accuracy: 0.4327731092436975 (correct: 309 uncorrect: 405)
Eyewitness accuracy: 0.7200886262924667 (correct: 975 uncorrect: 379)
Business accuracy: 0.5357142857142857 (correct: 135 uncorrect: 117)
NGOs accuracy: 0.7456828885400314 (correct: 475 uncorrect: 162)


In [17]:
# save the model to disk
filename = 'CrisisLex-inf_source.model'
pickle.dump(clf_ovr, open(filename, 'wb'))

In [16]:
# load the model to disk
filename = 'CrisisLex-inf_source.model'
clf_ovr = pickle.load(open(filename, 'rb'))

In [18]:
texts = [
    "Report: Strong quake off Guatemala is felt in Mexico City - http://t.co/fpjpvqES http://t.co/Y23hxGaF",
    "thank you for all your concerns earthquake was scary but thankfully everyone praying for italy",
    "we have water outside",
    "help me! my house is destroyed i don't have roof",
    "newspaper informs about earthquake in the ocean",
    "cannot access my car",
    ""
]
model_clf = clf_ovr
for text in texts:
    print(text)
    result = list(model_clf.predict_proba(np.array(get_doc_vector(tweet_parser_lemma(text))).reshape(1, -1)))[0]
    zipped = zip(clf_ovr.classes_, result)
    zipped = sorted(zipped, key=lambda x: x[1],reverse=True)
    for i,(cl,prob) in enumerate(zipped):
        print ("{} : {:.2f}".format(cl,prob))
    print()    
    


Report: Strong quake off Guatemala is felt in Mexico City - http://t.co/fpjpvqES http://t.co/Y23hxGaF
Media : 0.67
Outsiders : 0.20
Eyewitness : 0.08
Government : 0.02
Not applicable : 0.01
Business : 0.01
NGOs : 0.01

thank you for all your concerns earthquake was scary but thankfully everyone praying for italy
Outsiders : 0.76
Eyewitness : 0.14
Media : 0.06
NGOs : 0.01
Not applicable : 0.01
Government : 0.01
Business : 0.01

we have water outside
Eyewitness : 0.52
Media : 0.21
Outsiders : 0.13
NGOs : 0.10
Government : 0.03
Not applicable : 0.01
Business : 0.01

help me! my house is destroyed i don't have roof
Eyewitness : 0.63
Outsiders : 0.13
Media : 0.11
Government : 0.06
NGOs : 0.04
Business : 0.02
Not applicable : 0.01

newspaper informs about earthquake in the ocean
Media : 0.56
Outsiders : 0.26
Eyewitness : 0.09
Business : 0.05
NGOs : 0.02
Not applicable : 0.02
Government : 0.01

cannot access my car
Media : 0.41
Outsiders : 0.22
Government : 0.17
Business : 0.08
NGOs : 0.06
Ey

In [21]:
test_tweets = [
    ('Media', 'Report: Strong quake off Guatemala is felt in Mexico City - http://t.co/fpjpvqES http://t.co/Y23hxGaF'),
    ('Government', 'RT @CalgaryPolice: Check here to see what areas of Montgomery must be evacuated: http://t.co/0p1PSfn001 #yycflood'),
    ('Outsiders', 'RT @willienelson: West has been in my backyard all my life.  My heart is praying for the community that we call home. #westtx'),
    ('Eyewitness', '@IdilioMon: #coloradoflood ==&gt; Street view from my home this afternoon. Rain, rain, go away... http://t.co/H3FdI3A1NQ'),
    ('NGOs', 'GLOBE SUBS: Donate to Red Cross now. Text RED &lt;amt&gt; to 2899. Denoms. in 5, 25, 50, 100, 300, 500 and 1000. #YolandaPH #ReliefPH'),
    ('Not applicable', 'RT @joeshowradio: @joerogan The Bill of Rights was written for Dzhokar Tsarnaev: http://t.co/bmZUZRg4U4'),
    ('Business', 'Hyundai, Mazda and Ford offering support to victims of Colorado flooding - Autoblog http://t.co/wPAIDzT2UV')
]

model_clf = clf_ovr

for (cl, text) in test_tweets:
    #result = tp.classify_tweet(text)
    result = list(model_clf.predict_proba(np.array(get_doc_vector(tweet_parser_lemma(text))).reshape(1, -1)))[0]
    zipped = zip(clf_ovr.classes_, result)
    zipped = sorted(zipped, key=lambda x: x[1],reverse=True)
    result_class = zipped[0][0]
    result_class_probability = zipped[0][1]
    print(text)
    print(cl, result_class , result_class_probability, "\n")
    assert cl == result_class
    assert result_class_probability > 0.5

Report: Strong quake off Guatemala is felt in Mexico City - http://t.co/fpjpvqES http://t.co/Y23hxGaF
Media Media 0.6673141376920387 

RT @CalgaryPolice: Check here to see what areas of Montgomery must be evacuated: http://t.co/0p1PSfn001 #yycflood
Government Government 0.6067768923190645 

RT @willienelson: West has been in my backyard all my life.  My heart is praying for the community that we call home. #westtx
Outsiders Outsiders 0.8088413575174989 

@IdilioMon: #coloradoflood ==&gt; Street view from my home this afternoon. Rain, rain, go away... http://t.co/H3FdI3A1NQ
Eyewitness Eyewitness 0.6075594921350587 

GLOBE SUBS: Donate to Red Cross now. Text RED &lt;amt&gt; to 2899. Denoms. in 5, 25, 50, 100, 300, 500 and 1000. #YolandaPH #ReliefPH
NGOs NGOs 0.7646046302386884 

RT @joeshowradio: @joerogan The Bill of Rights was written for Dzhokar Tsarnaev: http://t.co/bmZUZRg4U4
Not applicable Not applicable 0.6434451507343469 

Hyundai, Mazda and Ford offering support to victims of Co