In [1]:
import json
import pandas as pd
import nltk 
import time

In [2]:
# Fetch the NLTK English stopword corpus used by the preprocessing step further down.
nltk.download('stopwords')
# Widen pandas' column display so tweet text is not cut off in rendered tables.
# NOTE(review): 0 is presumably treated as "no truncation" by this pandas version;
# newer releases document None for that purpose -- confirm before upgrading.
pd.set_option('display.max_colwidth', 0)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jfilardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading the data

In [3]:
# Path of the labelled tweet dump. Point this at "./subset.json" to iterate
# quickly on the smaller sample instead of the whole dataset.
TWEETS_PATH = "./tweets_formated.json"

# json.load parses straight from the file handle; the explicit encoding keeps
# the read independent of the platform's default locale.
with open(TWEETS_PATH, encoding="utf-8") as fd:
    df = pd.DataFrame.from_dict(json.load(fd))

# Every entry of the `tweet` column is a raw tweet object: pull out its text
# and flatten its hashtag entities into one space-separated string.
df["text"] = [entry["text"] for entry in df["tweet"]]
df["hashtags"] = [
    " ".join(hashtag["text"] for hashtag in entry["entities"]["hashtags"])
    for entry in df["tweet"]
]

# Keep only the columns the classifiers need.
data = df[["text", "hashtags", "classification"]]

In [4]:
# Number of labelled tweets loaded
len(data)

6694

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

## Let's take a look at the text data 

In [6]:
# Peek at ten rows (positions 10-19) to see what the text, hashtags and labels look like
data[10:20]

Unnamed: 0,text,hashtags,classification
10,threatmeter: Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier/RunType OOB r… https://t.co/mySKgq1k4t,,1
11,#vulnerability #security : Vuln: Microsoft Windows Kernel 'Win32k.sys' CVE-2016-7255 Local Privilege Escalation Vu… https://t.co/CjMlujGqvh,vulnerability security,1
12,[webapps] - SAP NetWeaver AS JAVA - 'BC-BMT-BPM-DSK' XML External Entity Injection https://t.co/IWj0Zo2Kwg,,1
13,Vuln: Adobe Flash Player APSB16-37 Multiple Remote Code Execution Vulnerabilities https://t.co/9aziDgJS1V,,1
14,Anonymous website wants John McAfee to be Donald Trump's #cybersecurity adviser https://t.co/UPoesDze7p,cybersecurity,-1
15,threatmeter: Vuln: Microsoft Windows Kernel ‘Win32k.sys’ CVE-2016-7255 Local Privilege Escalation Vulnerability https://t.co/piENoScyHn,,1
16,"@HumbleEinstein Are you still experiencing this issue? If yes, try with a new Chrome profile &amp; keep us posted: https://t.co/mrcdotOseh.",,-1
17,Vuln: Oracle Java SE CVE-2016-5554 Remote Security Vulnerability https://t.co/zNOUzYc7uT\n\nVulnerable:\n\nOracle JRE(Windows Production Relea…,,1
18,"@Geff_Rock @iamShoZo @SnazzyQ We knew it... :) Currently, we're built just for Windows OS.",,-1
19,#0daytoday #Microsoft Internet Explorer 8 MSHTML - &amp;#039;Ptls5::Ls­Find­Span­Visual­Boundar [#0day #Exploit] https://t.co/poovEzpfr6,0daytoday Microsoft 0day Exploit,1


#### Split between train and test data, and separate between 'positive' (relevant) and 'negative' (irrelevant) 

In [7]:
# Splitting the dataset into train and test set
train, test = train_test_split(data,test_size = 0.1)

# Cybersec-relevant tweets
train_pos = train[ train['classification'] == 1]
train_pos = train_pos['text']

test_pos = test[ test['classification'] == 1]
test_pos = test_pos['text']

# Cybersec-irrelevant tweets
train_neg = train[ train['classification'] == -1]
train_neg = train_neg['text']

test_neg = test[ test['classification'] == -1]
test_neg = test_neg['text']


#### Preprocess text: lowercase, then drop short tokens, URLs, mentions and hashtags (stopword removal is optional)

In [8]:
# Experiment switches. With both off, the tokens are exactly the lower-cased,
# filtered words of each tweet.
REMOVE_STOPWORDS = False  # drop English stopwords from the tokens
USE_HASHTAGS = False      # add each tweet's hashtag texts as extra tokens

stopwords_set = set(stopwords.words("english"))


def tokenize_tweet(text):
    """Split a tweet into lower-cased tokens, dropping noise.

    Tokens shorter than three characters, anything containing 'http' (links),
    mentions ('@...') and hashtags ('#...') are discarded. No separate check
    for the retweet marker 'RT' is needed: it is two characters long, so the
    length filter already removes it.
    """
    lowered = (token.lower() for token in text.split() if len(token) >= 3)
    return [
        token for token in lowered
        if 'http' not in token and not token.startswith(('@', '#'))
    ]


# One (tokens, label) pair per training tweet.
tweets = []
for text, hashtags, label in zip(train['text'], train['hashtags'], train['classification']):
    words = tokenize_tweet(text)
    if REMOVE_STOPWORDS:
        words = [word for word in words if word not in stopwords_set]
    if USE_HASHTAGS:
        # Each hashtag becomes its own lower-cased token. The earlier
        # commented-out variant appended the whole space-joined hashtag
        # string as a single token, which could only ever match an
        # identical hashtag combination.
        words.extend(hashtags.lower().split())
    tweets.append((words, label))

## Extracting word features

In [9]:
def get_words_in_tweets(tweets):
    """Flatten labelled tweets into a single list of words.

    Args:
        tweets: iterable of (words, label) pairs, where `words` is an
            iterable of tokens.

    Returns:
        A list with every token of every tweet, in order, duplicates kept.
    """
    # A comprehension avoids the local accumulator that used to be named
    # `all`, which shadowed the builtin of the same name.
    return [word for (words, _label) in tweets for word in words]

def get_word_features(wordlist):
    """Return the distinct words of `wordlist` as the key view of an nltk.FreqDist."""
    return nltk.FreqDist(wordlist).keys()

# Vocabulary: the distinct tokens seen in the training tweets. extract_features
# emits one 'contains(<word>)' feature per entry.
w_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document):
    """Map a tokenised tweet to its bag-of-words presence features.

    Args:
        document: iterable of tokens (e.g. the result of `text.split()`).

    Returns:
        A dict with one 'contains(<word>)' key per word in the module-level
        vocabulary `w_features`; the value is True when that word occurs in
        `document`, False otherwise.
    """
    present = set(document)
    return {'contains({})'.format(term): (term in present) for term in w_features}

## Using a Naive Bayes Classifier

#### Extract features from the tweets, then train the classifier 

In [10]:
%%time
# Training the Naive Bayes classifier
# apply_features pairs extract_features(words) with the label of every tweet.
# NOTE(review): NLTK documents the result as a lazily evaluated sequence rather
# than a plain list -- confirm for the installed version.
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

CPU times: user 1min 29s, sys: 19.6 ms, total: 1min 29s
Wall time: 1min 29s


In [11]:
# Sanity check on a hand-written example: lower-case and split it the same way
# the training tokens were produced, then map the label to a readable word.
tweet = 'Cibersecurity java cve toolkit bootstrap'
predicted_label = classifier.classify(extract_features(tweet.lower().split()))
if predicted_label == 1:
    res = 'relevant'
else:
    res = 'irrelevant'
print('The tweet is {}.'.format(res))

The tweet is relevant.


#### Classify the tweets on the test_set

In [12]:
%%time

neg_cnt = 0
pos_cnt = 0

for tweet in test_neg:
    res =  classifier.classify(extract_features(tweet.split()))
    if(res == -1):
        neg_cnt = neg_cnt + 1
        
for tweet in test_pos: 
    res =  classifier.classify(extract_features(tweet.split()))
    if(res == 1): 
        pos_cnt = pos_cnt + 1
        
print('[Negative]: Predicted: %s / Actual %s '  % (neg_cnt, len(test_neg)))        
print('[Positive]: Predicted: %s / Actual %s '  % (pos_cnt, len(test_pos)))

tpr = pos_cnt/len(test_pos)
tnr = neg_cnt/len(test_neg)

print('\nTrue positive rate:', tpr)
print('True negative rate:', tnr)

[Negative]: Predicted: 204 / Actual 316 
[Positive]: Predicted: 326 / Actual 354 

True positive rate: 0.9209039548022598
True negative rate: 0.6455696202531646
CPU times: user 21.1 s, sys: 3.23 ms, total: 21.1 s
Wall time: 21.2 s


## Testing results on subset.json
------------------------------------
### Not cleaning stopwords:

#### Disregarding hashtags:
True positive rate: 0.9382716049382716 <br>
True negative rate: 0.7101449275362319

#### Using hashtags:
True positive rate: 0.8529411764705882 <br>
True negative rate: 0.5975609756097561

> Wow that was disappointing


### Cleaning stopwords:

#### Disregarding hashtags:

True positive rate: 0.5844155844155844 <br>
True negative rate: 0.9452054794520548

#### Using hashtags:
True positive rate: 0.6966292134831461 <br>
True negative rate: 0.9016393442622951


## Testing results on the whole dataset
----------------------------

### With stopwords:

#### Disregarding hashtags:
True positive rate: 0.9344262295081968 <br>
True negative rate: 0.6480263157894737

#### Using hashtags:
True positive rate: 0.8072625698324022 <br>
True negative rate: 0.7147435897435898

> Wow that looks disappointing


### Without stopwords:

#### Disregarding hashtags:
True positive rate: 0.6109589041095891 <br> 
True negative rate: 0.8918032786885246

#### Using hashtags:
True positive rate: 0.684931506849315 <br>
True negative rate: 0.8885245901639345

In [13]:
# List the features whose presence most strongly separates the two classes
classifier.show_most_informative_features()

Most Informative Features
          contains(what) = True               -1 : 1      =     63.5 : 1.0
         contains(adobe) = True                1 : -1     =     52.6 : 1.0
           contains(hat) = True                1 : -1     =     49.6 : 1.0
           contains(you) = True               -1 : 1      =     40.8 : 1.0
          contains(suse) = True                1 : -1     =     39.8 : 1.0
       contains(chrome?) = True               -1 : 1      =     39.5 : 1.0
       contains(website) = True               -1 : 1      =     38.1 : 1.0
       contains(acrobat) = True                1 : -1     =     36.5 : 1.0
        contains(player) = True                1 : -1     =     35.7 : 1.0
          contains(rhel) = True                1 : -1     =     34.8 : 1.0


>For instance, "contains(thanks) = True  (-1 : 1)     =     (28.2 : 1.0)" means that 
a tweet that contains *thanks* is *28.2* times more likely to be *irrelevant* than *relevant*.

## Using a Support Vector Machine  

In [14]:
from sklearn import svm

In [15]:
%%time

X = np.empty((len(training_set), len(training_set[0][0])), dtype=np.int8)
Y = np.empty((len(training_set), 1), dtype=np.int8)

for i in range(X.shape[0]):
    values = list(training_set[i][0].values())
    for j in range(X.shape[1]):
        X[i,j] = 1 if values[j] else 0
    Y[i] = training_set[i][1]

CPU times: user 33.4 s, sys: 6.73 ms, total: 33.4 s
Wall time: 33.5 s


In [16]:
# (number of training tweets, number of features)
X.shape

(6024, 7503)

- X is a matrix of shape \[m,n\] where:
  + m is the size of the training set, i.e. how many tweets
  + n is the number of different words in the corpus
- Y is a column \[m,1\] with the label for each tweet (1 or -1)
- Any given row in X is a tweet. The columns in the row will be:
  + 1 if the nth word is found in the mth tweet
  + 0 otherwise

In [17]:
tweet = 'threatmeter: #0daytoday #Oracle OpenJDK Runtime Environment Build 1.8.0_112-b15 Denial Of Service Exploit [#0day #… https://t.co/FZwU4Iu1PK'

def generate_vector(document):
    """Generates a vector of size [1,n] which will have 1s or 0s according
    to the presence or absence of a given word in the document.

    Args:
        document: raw tweet text; it is lower-cased and split on whitespace
            before the features are extracted.

    Returns:
        An int8 array of shape (1, n), where n is the number of features
        returned by extract_features, in that function's feature order.
    """
    features = extract_features(document.lower().split())
    # The feature values are booleans, which the int8 dtype converts to 1/0.
    # Building the row directly from them gives the width for free, so the
    # training set no longer has to be consulted on every call just to size
    # the array.
    return np.array([list(features.values())], dtype=np.int8)

In [18]:
t0 = time.time()

# Create a linear support vector machine
lin_clf = svm.LinearSVC()
# And fit the model (training stage).
# Y is stored as an (m, 1) column, but scikit-learn expects 1-D labels;
# ravel() flattens it so fit() no longer emits a data-conversion warning.
lin_clf.fit(X, Y.ravel())

t1 = time.time()
print('Time to fit SVM:', t1-t0)

Time to fit SVM: 0.22058868408203125


  y = column_or_1d(y, warn=True)


In [19]:
def classify_with_svm(test_pos, test_neg, lin_clf):
    """Score a fitted classifier on labelled tweets and print the results.

    Args:
        test_pos: iterable (with a length) of tweet texts whose true label
            is positive.
        test_neg: iterable (with a length) of tweet texts whose true label
            is negative.
        lin_clf: fitted classifier exposing `predict`; a prediction above
            0.5 counts as positive and one below 0.5 as negative.

    Prints the number of correctly classified tweets per class, the true
    positive and true negative rates, and the elapsed wall-clock time.
    A rate is reported as nan when its class has no tweets, instead of
    failing with ZeroDivisionError.
    """
    def _count_hits(texts, is_hit):
        # One prediction per tweet; count those landing on the expected side.
        return sum(
            1 for text in texts
            if is_hit(lin_clf.predict(generate_vector(text))[0])
        )

    def _rate(hits, total):
        return hits / total if total else float('nan')

    t0 = time.time()

    neg_cnt = _count_hits(test_neg, lambda label: label < 0.5)
    pos_cnt = _count_hits(test_pos, lambda label: label > 0.5)

    t1 = time.time()

    print('[Negative]: Predicted: %s / Actual %s '  % (neg_cnt, len(test_neg)))
    print('[Positive]: Predicted: %s / Actual %s '  % (pos_cnt, len(test_pos)))

    tpr = _rate(pos_cnt, len(test_pos))
    tnr = _rate(neg_cnt, len(test_neg))

    print('\nTrue positive rate:', tpr)
    print('True negative rate:', tnr)

    print('\nTime to classify:', t1-t0)


classify_with_svm(test_pos, test_neg, lin_clf)

[Negative]: Predicted: 298 / Actual 316 
[Positive]: Predicted: 339 / Actual 354 

True positive rate: 0.9576271186440678
True negative rate: 0.9430379746835443

Time to classify: 3.072890043258667


In [20]:
# Spot check: a support-style reply that is not about a vulnerability
tweet = '@MC_Odd Hmm. Can you double-check your location settings in Chrome: https://t.co/7fJod182X5? Keep us posted.'
x = generate_vector(tweet)

# The SVM was trained on labels 1 (relevant) and -1 (irrelevant)
lin_clf.predict(x)

array([-1], dtype=int8)

In [21]:
# Spot check: a vulnerability announcement mentioning a CVE identifier
tweet = 'Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier/RunType OOB read details https://t.co/vo81vJY8Pf'
x = generate_vector(tweet)

# The SVM was trained on labels 1 (relevant) and -1 (irrelevant)
lin_clf.predict(x)

array([1], dtype=int8)

## On the actual evaluation set
#### Let's first load the TSV file into a DataFrame

In [22]:
import re

In [68]:
# Load the tab-separated evaluation set; each row carries a classification
# label, a timestamp, the tweet text, and tweet / tweeter ids.
es_df = pd.read_csv('ES_full.tsv', sep='\t')
es_df

Unnamed: 0,classification,timestamp,text,tweet_id,tweeter_id
0,0,Sat Jan 28 00:19:27 +0000 2017,threatmeter: [webapps] - WordPress Plugin Online Hotel Booking System Pro 1.0 - SQL Injection https://t.co/ukfA1AMeoG,825136169210015744,43
1,0,Sat Jan 28 15:45:06 +0000 2017,Persistent Cross-Site Scripting #vulnerability in User Access Manager #WordPress Plugin https://t.co/3XXi4WI5QR #FullDisclosure,825369115984408577,25
2,0,Sat Jan 28 00:13:10 +0000 2017,WPBeginner Glossary - What is a Pingback in #WordPress - https://t.co/cow9wipDG9,825134585214275585,75
3,0,Sat Jan 28 03:21:17 +0000 2017,#cybersecurity Your Heartbeat May Soon Become Your Password https://t.co/SUsjOnTwqF #infosec,825181929649565696,33
4,0,Sat Jan 28 06:07:53 +0000 2017,"WordPress Releases Security Update: Original release date: January 26, 2017 WordPress 4.7.1 and prior… https://t.co/oVk1EJ1RJG #infosec",825223854578614272,48
...,...,...,...,...,...
5270,1,Thu Mar 16 17:15:07 +0000 2017,RHEL 7 : policycoreutils (RHSA-2017:0536) https://t.co/SKEHeFlLEc #Nessus,842423997241278464,25
5271,1,Thu Mar 16 17:18:31 +0000 2017,threatmeter: Ubuntu 14.04 LTS : linux-lts-xenial vulnerabilities (USN-3234-2) https://t.co/ZQ67NqCEJ0,842424852606787585,43
5272,1,Thu Mar 16 09:11:40 +0000 2017,"Pwn2Own 2017: Experts Hack Edge, Safari, Ubuntu: Bug bounty hunters have managed to hack Microsoft Edge… https://t.co/ankIYfSqJr #infosec",842302335929991168,48
5273,0,Thu Mar 16 17:15:06 +0000 2017,#Fedora 25 : 1:qbittorrent (2017-b59943dcae) https://t.co/Mynw64rGYr #Nessus,842423996448620544,25


In [24]:
## Let's prepare our evaluation sets (these replace the earlier test_pos / test_neg)
# Cybersec-relevant tweets
test_pos = es_df[es_df['classification'] == 1]
test_pos = test_pos['text']

# Cybersec-irrelevant tweets
test_neg = es_df[ es_df['classification'] == 0] # this file marks irrelevant tweets with 0, not -1 as in the training data
test_neg = test_neg['text']

In [25]:
# Score the SVM on the evaluation set prepared above
classify_with_svm(test_pos, test_neg, lin_clf)

[Negative]: Predicted: 3097 / Actual 3422 
[Positive]: Predicted: 1607 / Actual 1853 

True positive rate: 0.8672423097679439
True negative rate: 0.9050263004091175

Time to classify: 26.648561000823975


### Let's get some metrics

In [69]:
from sklearn.metrics import precision_recall_fscore_support

In [72]:
# Start the clean_text column as a copy of the raw text; the next cell
# overwrites it with the cleaned version.
es_df["clean_text"] = es_df["text"]

In [73]:
def _clean_tweet_text(tweet):
    """Return `tweet` lower-cased with noise tokens removed.

    Tokens shorter than three characters, anything containing 'http',
    mentions ('@...') and hashtags ('#...') are dropped; the rest is joined
    back with single spaces. The retweet marker 'RT' needs no separate
    check because the length filter already removes it.
    """
    lowered = (token.lower() for token in tweet.split() if len(token) >= 3)
    return ' '.join(
        word for word in lowered
        if 'http' not in word and not word.startswith(('@', '#'))
    )


# Assign the whole column in one step. Writing through
# es_df["clean_text"].loc[index] is chained indexing, which pandas flags with
# SettingWithCopyWarning and which may silently write to a temporary copy.
es_df["clean_text"] = es_df["text"].apply(_clean_tweet_text)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [76]:
# Inspect the cleaned text column
es_df["clean_text"]

0       threatmeter: [webapps] wordpress plugin online hotel booking system pro 1.0 sql injection             
1       persistent cross-site scripting user access manager plugin                                            
2       wpbeginner glossary what pingback                                                                     
3       your heartbeat may soon become your password                                                          
4       wordpress releases security update: original release date: january 26, 2017 wordpress 4.7.1 and prior…
                                                         ...                                                  
5270    rhel policycoreutils (rhsa-2017:0536)                                                                 
5271    threatmeter: ubuntu 14.04 lts linux-lts-xenial vulnerabilities (usn-3234-2)                           
5272    pwn2own 2017: experts hack edge, safari, ubuntu: bug bounty hunters have managed hack microsoft edge… 
5

In [77]:
# Recode the labels to the 1 / -1 scheme: 1 stays 1, every other value becomes -1.
is_relevant = es_df["classification"] == 1
es_df["classification"] = es_df["classification"].where(is_relevant, -1)

In [79]:
%%time
predictions = [lin_clf.predict(generate_vector(tw))[0] for tw in es_df["text"]]
clean_predictions = [lin_clf.predict(generate_vector(tw))[0] for tw in es_df["clean_text"]]

CPU times: user 49.6 s, sys: 16.7 ms, total: 49.6 s
Wall time: 49.6 s


In [80]:
# Per-class precision, recall, F-score and support for the raw tweet text
# (in each array the -1 class comes first, then the 1 class)
precision_recall_fscore_support(es_df["classification"], predictions)

(array([0.9264134 , 0.83178054]),
 array([0.9050263 , 0.86724231]),
 array([0.91559497, 0.84914135]),
 array([3422, 1853]))

In [81]:
# Same metrics for the cleaned text. The recorded numbers are identical to the
# raw-text ones -- presumably because the vocabulary was built from tokens that
# already excluded links, mentions, hashtags and short words, so removing them
# from the input cannot change any feature value. TODO confirm.
precision_recall_fscore_support(es_df["classification"], clean_predictions)

(array([0.9264134 , 0.83178054]),
 array([0.9050263 , 0.86724231]),
 array([0.91559497, 0.84914135]),
 array([3422, 1853]))