In [1]:
import json
import pandas as pd
import nltk 
import time

In [2]:
nltk.download('stopwords')
pd.set_option('display.max_colwidth', 0)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/filardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading the data

In [3]:
with open("./tweets_formated.json") as fd:
    data = json.loads(fd.read())

# with open("./subset.json") as fd:
#     data = json.loads(fd.read())

df = pd.DataFrame.from_dict(data)
df["text"] = [entry["text"] for entry in df.tweet]

all_hashtags = []

for entry in df.tweet:
    hashtags_list = entry['entities']['hashtags']
    hashtags = [hashtag['text'] for hashtag in hashtags_list]
    all_hashtags.append(' '.join(hashtags))

df['hashtags'] = all_hashtags

data = df[['text','hashtags','classification']]

In [4]:
len(data)

6694

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

## Let's take a look at the text data 

In [6]:
data

Unnamed: 0,text,hashtags,classification
0,[webapps] - SAP NetWeaver AS #JAVA - 'BC-BMT-BPM-DSK' XML External Entity #Injection https://t.co/QdKIyocwur #ExploitDB,JAVA Injection ExploitDB,1
1,#vulnerability #security : [webapps] - SAP NetWeaver AS JAVA - 'BC-BMT-BPM-DSK' XML External Entity Injection https://t.co/bLIiTMTbuc,vulnerability security,1
2,Vuln: Google Chrome Multiple Security Vulnerabilities https://t.co/U4or0GZirf,,1
3,Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier/RunType OOB read details https://t.co/vo81vJY8Pf,,1
4,Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier/RunType OOB read details https://t.co/JIHOzCvq5W,,1
5,@MC_Odd Hmm. Can you double-check your location settings in Chrome: https://t.co/7fJod182X5? Keep us posted.,,-1
6,threatmeter: Vuln: Microsoft Windows Kernel 'Win32k.sys' CVE-2016-7255 Local Privilege Escalation Vulnerability https://t.co/2zdn6GaGdK,,1
7,Fedora 25 Now Available -- Makes It Easier To Switch From Windows 10 Or Mac https://t.co/2FNWTOaPTd,,-1
8,#vulnerability #security : Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier… https://t.co/cYV2oV3bwY,vulnerability security,1
9,Vuln: Microsoft Windows Kernel 'Win32k.sys' CVE-2016-7255 Local Privilege Escalation Vulnerability https://t.co/XTqKDPTNK9,,1


#### Split between train and test data, and separate between 'positive' (relevant) and 'negative' (irrelevant) 

In [7]:
# Splitting the dataset into train and test set
train, test = train_test_split(data,test_size = 0.1)

# Cybersec-relevant tweets
train_pos = train[ train['classification'] == 1]
train_pos = train_pos['text']

test_pos = test[ test['classification'] == 1]
test_pos = test_pos['text']

# Cybersec-irrelevant tweets
train_neg = train[ train['classification'] == -1]
train_neg = train_neg['text']

test_neg = test[ test['classification'] == -1]
test_neg = test_neg['text']


#### Preprocess text: remove stopwords, hashtags, mentions, &c

In [8]:
tweets = []
stopwords_set = set(stopwords.words("english"))

for index, row in train.iterrows():
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
#     words_cleaned = words_without_stopwords # Uncomment to remove stopwords from data
#     words_cleaned.append(row.hashtags) # Uncomment to add hashtags to data
    tweets.append((words_cleaned,row.classification))

## Extracting word features

In [9]:
def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    features = wordlist.keys()
    return features

w_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document): # document = tweet1
    document_words = set(document)
    features = {}
    
    for word in w_features:
        features['contains({})'.format(word)] = (word in document_words) # is the word in this tweet?
    return features

## Using a Naive Bayes Classifier
> Running this will take a while (ca. 6 or 7 minutes)

#### Extract features from the tweets, then train the classifier 

In [10]:
t0 = time.time()
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
t1 = time.time()

print('Time to train:', t1-t0)

Time to train: 112.75204873085022


In [11]:
tweet = 'Cibersecurity java cve toolkit bootstrap'
res =  'relevant' if classifier.classify(extract_features(tweet.lower().split())) == 1 else 'irrelevant'
print('The tweet is {}.'.format(res))

The tweet is relevant.


#### Classify the tweets on the test_set

In [12]:
t0 = time.time()

neg_cnt = 0
pos_cnt = 0

for tweet in test_neg:
    res =  classifier.classify(extract_features(tweet.split()))
    if(res == -1):
        neg_cnt = neg_cnt + 1
        
for tweet in test_pos: 
    res =  classifier.classify(extract_features(tweet.split()))
    if(res == 1): 
        pos_cnt = pos_cnt + 1

t1 = time.time()
        
print('[Negative]: Predicted: %s / Actual %s '  % (neg_cnt, len(test_neg)))        
print('[Positive]: Predicted: %s / Actual %s '  % (pos_cnt, len(test_pos)))

tpr = pos_cnt/len(test_pos)
tnr = neg_cnt/len(test_neg)

print('\nTrue positive rate:', tpr)
print('True negative rate:', tnr)

print('\nTime to classify:', t1-t0)

[Negative]: Predicted: 200 / Actual 300 
[Positive]: Predicted: 348 / Actual 370 

True positive rate: 0.9405405405405406
True negative rate: 0.6666666666666666

Time to classify: 22.065096855163574


## Testing results on subset.json
------------------------------------
### Not cleaning stopwords:

#### Disregarding hashtags:
True positive rate: 0.9382716049382716 <br>
True negative rate: 0.7101449275362319

#### Using hashtags:
True positive rate: 0.8529411764705882 <br>
True negative rate: 0.5975609756097561

> Wow that was disappointing


### Cleaning stopwords:

#### Disregarding hashtags:

True positive rate: 0.5844155844155844 <br>
True negative rate: 0.9452054794520548

#### Using hashtags:
True positive rate: 0.6966292134831461 <br>
True negative rate: 0.9016393442622951


## Testing results on the whole dataset
----------------------------

### With stopwords:

#### Disregarding hashtags:
True positive rate: 0.9344262295081968 <br>
True negative rate: 0.6480263157894737

#### Using hashtags:
True positive rate: 0.8072625698324022 <br>
True negative rate: 0.7147435897435898

> Wow that looks disappointing


### Without stopwords:

#### Disregarding hashtags:
True positive rate: 0.6109589041095891 <br> 
True negative rate: 0.8918032786885246

#### Using hashtags:
True positive rate: 0.684931506849315 <br>
True negative rate: 0.8885245901639345

In [13]:
classifier.show_most_informative_features()

Most Informative Features
         contains(adobe) = True                1 : -1     =     77.5 : 1.0
           contains(hat) = True                1 : -1     =     49.1 : 1.0
        contains(there.) = True               -1 : 1      =     41.5 : 1.0
          contains(what) = True               -1 : 1      =     41.0 : 1.0
          contains(suse) = True                1 : -1     =     39.7 : 1.0
       contains(chrome?) = True               -1 : 1      =     39.1 : 1.0
          contains(rhel) = True                1 : -1     =     38.5 : 1.0
        contains(player) = True                1 : -1     =     37.3 : 1.0
       contains(acrobat) = True                1 : -1     =     36.9 : 1.0
       contains(website) = True               -1 : 1      =     36.3 : 1.0


>For instance, "contains(thanks) = True  (-1 : 1)     =     (28.2 : 1.0)" means that 
a tweet that contains *thanks* is *28.2* times more likely to be *irrelevant* that *relevant*.

## Using a Support Vector Machine  

In [14]:
from sklearn import svm

In [15]:
t0 = time.time()

X = np.empty((len(training_set), len(training_set[0][0])), dtype=np.int8)
Y = np.empty((len(training_set), 1), dtype=np.int8)

for i in range(X.shape[0]):
    values = list(training_set[i][0].values())
    for j in range(X.shape[1]):
        X[i,j] = 1 if values[j] else 0
    Y[i] = training_set[i][1]

t1 = time.time()

print('Time to generate X and Y:', t1-t0)

Time to generate X and Y: 34.09563112258911


In [16]:
X.shape

(6024, 7560)

- X is a matrix of shape \[n,m\] where:
  + n is the size of the training set, i.e. how many tweets
  + m is the number of different words in the corpus
- Y is a column \[n,1\] with the label for each tweet (1 or -1)
- Any given row on X is a tweet. The columns in the row will be:
  + 1 if the word m is found in the tweet n
  + 0 otherwise

In [17]:
tweet = 'threatmeter: #0daytoday #Oracle OpenJDK Runtime Environment Build 1.8.0_112-b15 Denial Of Service Exploit [#0day #… https://t.co/FZwU4Iu1PK'

def generate_vector(document):
    """Generates a vector of size [1,n] which will have 1s or 0s according
    to the presence or absence of a given word in the document.
    """
    x = np.empty((1, len(training_set[0][0])), dtype=np.int8)
    test_sample = list(extract_features(document.lower().split()).values())
    for i in range(x.shape[1]):
        x[0,i] = 1 if test_sample[i] else 0
    return x

In [18]:
t0 = time.time()

# Create a linear support vector machine
lin_clf = svm.LinearSVC()
# And fit the model (training stage)
lin_clf.fit(X, Y)

t1 = time.time()
print('Time to fit SVM:', t1-t0)

Time to fit SVM: 0.29629063606262207


  y = column_or_1d(y, warn=True)


In [19]:
def classify_with_svm(test_pos, test_neg, lin_clf):
    t0 = time.time()

    neg_cnt = 0
    pos_cnt = 0
    for obj in test_neg: 
        x = generate_vector(obj)
        res = lin_clf.predict(x)
        if(res[0] < 0.5): 
            neg_cnt = neg_cnt + 1
    for obj in test_pos: 
        x = generate_vector(obj)
        res = lin_clf.predict(x)
        if(res[0] > 0.5): 
            pos_cnt = pos_cnt + 1

    t1 = time.time()

    print('[Negative]: Predicted: %s / Actual %s '  % (neg_cnt, len(test_neg)))        
    print('[Positive]: Predicted: %s / Actual %s '  % (pos_cnt, len(test_pos)))   

    tpr = pos_cnt/len(test_pos)
    tnr = neg_cnt/len(test_neg)

    print('\nTrue positive rate:', tpr)
    print('True negative rate:', tnr)

    print('\nTime to classify:', t1-t0)
    

classify_with_svm(test_pos, test_neg, lin_clf)

[Negative]: Predicted: 284 / Actual 300 
[Positive]: Predicted: 351 / Actual 370 

True positive rate: 0.9486486486486486
True negative rate: 0.9466666666666667

Time to classify: 3.2379038333892822


In [20]:
tweet = '@MC_Odd Hmm. Can you double-check your location settings in Chrome: https://t.co/7fJod182X5? Keep us posted.'
x = generate_vector(tweet)

lin_clf.predict(x)

array([-1], dtype=int8)

## On the actual evaluation set
#### Let's first turn the tsv into a csv

In [21]:
import re

In [22]:
es_df = pd.read_csv('ES_full.tsv', sep='\t')
es_df

Unnamed: 0,classification,timestamp,text,tweet_id,tweeter_id
0,0,Sat Jan 28 00:19:27 +0000 2017,threatmeter: [webapps] - WordPress Plugin Online Hotel Booking System Pro 1.0 - SQL Injection https://t.co/ukfA1AMeoG,825136169210015744,43
1,0,Sat Jan 28 15:45:06 +0000 2017,Persistent Cross-Site Scripting #vulnerability in User Access Manager #WordPress Plugin https://t.co/3XXi4WI5QR #FullDisclosure,825369115984408577,25
2,0,Sat Jan 28 00:13:10 +0000 2017,WPBeginner Glossary - What is a Pingback in #WordPress - https://t.co/cow9wipDG9,825134585214275585,75
3,0,Sat Jan 28 03:21:17 +0000 2017,#cybersecurity Your Heartbeat May Soon Become Your Password https://t.co/SUsjOnTwqF #infosec,825181929649565696,33
4,0,Sat Jan 28 06:07:53 +0000 2017,"WordPress Releases Security Update: Original release date: January 26, 2017 WordPress 4.7.1 and prior… https://t.co/oVk1EJ1RJG #infosec",825223854578614272,48
5,0,Sat Jan 28 11:50:11 +0000 2017,"@wordpressdotcom @WordPress WordPress 4.7.2 release addresses XSS, SQL Injection vulnerabilities https://t.co/uMhVmgCA2s #securityaffairs",825309997181583361,30
6,0,Sat Jan 28 10:34:29 +0000 2017,"WORDPRESS 4.7.2 UPDATE FIXES XSS, SQL INJECTION BUGS https://t.co/ZpiTSE3JKQ",825290945495236608,79
7,0,Sat Jan 28 13:07:29 +0000 2017,RT @guedou: intelpt/WindowsIntelPT - driver for Intel Processor Trace functionality in Intel Skylake architecture #reconbrx https://t.co/…,825329452196433922,63
8,0,Sat Jan 28 00:19:30 +0000 2017,threatmeter: [webapps] - WordPress Plugin WP Private Messages 1.0.1 - SQL Injection https://t.co/I4vbBaW3ZF,825136180408762369,43
9,0,Sat Jan 28 13:01:20 +0000 2017,RT @FCE365: iOS 10.2 / 10.1.1 - How to Jailbreak On Windows: https://t.co/fPysusidl8 via @YouTube,825327901331566593,63


In [23]:
## Let's prepare our training sets
# Cybersec-relevant tweets
test_pos = es_df[es_df['classification'] == 1]
test_pos = test_pos['text']

# Cybersec-irrelevant tweets
test_neg = es_df[ es_df['classification'] == 0] # for some reason now it's 0
test_neg = test_neg['text']

In [25]:
classify_with_svm(test_pos, test_neg, lin_clf)

[Negative]: Predicted: 3049 / Actual 3422 
[Positive]: Predicted: 1604 / Actual 1853 

True positive rate: 0.8656233135456017
True negative rate: 0.890999415546464

Time to classify: 25.73776078224182
