In [1]:
import json
import pandas as pd
import nltk 
import time

In [2]:
nltk.download('stopwords')
pd.set_option('display.max_colwidth',0)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/filardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading the data

In [3]:
with open("./tweets_formated.json") as fd:
    data = json.loads(fd.read())

# with open("./subset.json") as fd:
#     data = json.loads(fd.read())

df = pd.DataFrame.from_dict(data)
df["text"] = [entry["text"] for entry in df.tweet]

all_hashtags = []

for entry in df.tweet:
    hashtags_list = entry['entities']['hashtags']
    hashtags = [hashtag['text'] for hashtag in hashtags_list]
    all_hashtags.append(' '.join(hashtags))

df['hashtags'] = all_hashtags

data = df[['text','hashtags','classification']]

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

## Let's take a look at the text data 

In [5]:
data

Unnamed: 0,text,hashtags,classification
0,[webapps] - SAP NetWeaver AS #JAVA - 'BC-BMT-BPM-DSK' XML External Entity #Injection https://t.co/QdKIyocwur #ExploitDB,JAVA Injection ExploitDB,1
1,#vulnerability #security : [webapps] - SAP NetWeaver AS JAVA - 'BC-BMT-BPM-DSK' XML External Entity Injection https://t.co/bLIiTMTbuc,vulnerability security,1
2,Vuln: Google Chrome Multiple Security Vulnerabilities https://t.co/U4or0GZirf,,1
3,Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier/RunType OOB read details https://t.co/vo81vJY8Pf,,1
4,Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier/RunType OOB read details https://t.co/JIHOzCvq5W,,1
5,@MC_Odd Hmm. Can you double-check your location settings in Chrome: https://t.co/7fJod182X5? Keep us posted.,,-1
6,threatmeter: Vuln: Microsoft Windows Kernel 'Win32k.sys' CVE-2016-7255 Local Privilege Escalation Vulnerability https://t.co/2zdn6GaGdK,,1
7,Fedora 25 Now Available -- Makes It Easier To Switch From Windows 10 Or Mac https://t.co/2FNWTOaPTd,,-1
8,#vulnerability #security : Bugtraq: CVE-2015-0050: Microsoft Internet Explorer 8 MSHTML SRunPointer::SpanQualifier… https://t.co/cYV2oV3bwY,vulnerability security,1
9,Vuln: Microsoft Windows Kernel 'Win32k.sys' CVE-2016-7255 Local Privilege Escalation Vulnerability https://t.co/XTqKDPTNK9,,1


#### Split between train and test data, and separate between 'positive' (relevant) and 'negative' (irrelevant) 

In [6]:
# Splitting the dataset into train and test set
train, test = train_test_split(data,test_size = 0.1)

# Cybersec-relevant tweets
train_pos = train[ train['classification'] == 1]
train_pos = train_pos['text']

test_pos = test[ test['classification'] == 1]
test_pos = test_pos['text']

# Cybersec-irrelevant tweets
train_neg = train[ train['classification'] == -1]
train_neg = train_neg['text']

test_neg = test[ test['classification'] == -1]
test_neg = test_neg['text']


#### Preprocess text: remove stopwords, hashtags, mentions, &c

In [7]:
tweets = []
stopwords_set = set(stopwords.words("english"))

for index, row in train.iterrows():
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
#     words_cleaned = words_without_stopwords # Uncomment to remove stopwords from data
#     words_cleaned.append(row.hashtags) # Uncomment to add hashtags to data
    tweets.append((words_cleaned,row.classification))

## Extracting word features

In [8]:
def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    features = wordlist.keys()
    return features

w_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document): # document = tweet1
    document_words = set(document)
    features = {}
    
    for word in w_features:
        features['contains({})'.format(word)] = (word in document_words) # is the word in this tweet?
    return features

## Using a Naive Bayes Classifier
> Running this will take a while (ca. 6 or 7 minutes)

#### Extract features from the tweets, then train the classifier 

In [9]:
t0 = time.time()
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
t1 = time.time()

print(t1-t0)

145.376953125


#### Classify the tweets on the test_set

In [10]:
neg_cnt = 0
pos_cnt = 0

for tweet in test_neg:
    res =  classifier.classify(extract_features(tweet.split()))
    if(res == -1):
        neg_cnt = neg_cnt + 1
        
for tweet in test_pos: 
    res =  classifier.classify(extract_features(tweet.split()))
    if(res == 1): 
        pos_cnt = pos_cnt + 1
        
print('[Negative]: Predicted: %s / Actual %s '  % (neg_cnt, len(test_neg)))        
print('[Positive]: Predicted: %s / Actual %s '  % (pos_cnt, len(test_pos)))

[Negative]: Predicted: 197 / Actual 304 
[Positive]: Predicted: 342 / Actual 366 


In [11]:
tpr = pos_cnt/len(test_pos)
tnr = neg_cnt/len(test_neg)

print('True positive rate:', tpr)
print('True negative rate:', tnr)

True positive rate: 0.9344262295081968
True negative rate: 0.6480263157894737


## Testing results on subset.json
------------------------------------
### Not cleaning stopwords:

#### Disregarding hashtags:
True positive rate: 0.9382716049382716 <br>
True negative rate: 0.7101449275362319

#### Using hashtags:
True positive rate: 0.8529411764705882 <br>
True negative rate: 0.5975609756097561

> Wow that was disappointing


### Cleaning stopwords:

#### Disregarding hashtags:

True positive rate: 0.5844155844155844 <br>
True negative rate: 0.9452054794520548

#### Using hashtags:
True positive rate: 0.6966292134831461 <br>
True negative rate: 0.9016393442622951


## Testing results on the whole dataset
----------------------------

### With stopwords:

#### Disregarding hashtags:
True positive rate: 0.9344262295081968 <br>
True negative rate: 0.6480263157894737

#### Using hashtags:
True positive rate: 0.8072625698324022 <br>
True negative rate: 0.7147435897435898

> Wow that looks disappointing


### Without stopwords:

#### Disregarding hashtags:
True positive rate: 0.6109589041095891 <br> 
True negative rate: 0.8918032786885246

#### Using hashtags:
True positive rate: 0.684931506849315 <br>
True negative rate: 0.8885245901639345

In [12]:
classifier.show_most_informative_features()

Most Informative Features
         contains(adobe) = True                1 : -1     =     54.2 : 1.0
           contains(hat) = True                1 : -1     =     52.4 : 1.0
        contains(thanks) = True               -1 : 1      =     45.4 : 1.0
        contains(there.) = True               -1 : 1      =     41.6 : 1.0
          contains(what) = True               -1 : 1      =     40.2 : 1.0
       contains(chrome?) = True               -1 : 1      =     38.4 : 1.0
          contains(rhel) = True                1 : -1     =     37.9 : 1.0
       contains(website) = True               -1 : 1      =     35.9 : 1.0
           contains(you) = True               -1 : 1      =     35.9 : 1.0
       contains(acrobat) = True                1 : -1     =     34.5 : 1.0


>For instance, "contains(thanks) = True  (-1 : 1)     =     (28.2 : 1.0)" means that 
a tweet that contains *thanks* is *28.2* times more likely to be *irrelevant* that *relevant*.