In [162]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Clean and load data

In [202]:
def build_dataset(isis_path='isis_tweets.csv', benign_path='benign_tweets_2.csv', benign_rows=17000):
    """Load both tweet corpora and merge them into one labelled dataset.

    Parameters
    ----------
    isis_path : str
        CSV file with a header row holding the ISIS-affiliated tweets. The
        metadata columns listed below are dropped; whatever remains is kept.
    benign_path : str
        Header-less CSV file holding the benign tweets. Columns 0-4 are
        dropped; whatever remains is kept.
    benign_rows : int
        Number of rows read from the top of ``benign_path`` (the full file is
        far larger than the ISIS set, so it is truncated).

    Returns
    -------
    data : numpy.ndarray
        2-D array: the ISIS rows stacked above the benign rows. Both files
        must leave the same number of columns after the drops, otherwise
        ``np.concatenate`` raises. NOTE(review): presumably a single
        tweet-text column remains in each -- confirm against the CSVs.
    labels : numpy.ndarray
        1-D float array aligned with ``data``: 0 for every ISIS row, 1 for
        every benign row.
    """
    # ISIS-affiliated tweets dataset: discard the account metadata columns.
    isis_df = pd.read_csv(isis_path, sep=',').drop(
        columns=['name', 'username', 'location', 'followers',
                 'numberstatuses', 'time', 'description']
    )

    # Benign tweets dataset, truncated to `benign_rows` rows; the file has no
    # header, so its columns are addressed by position.
    benign_df = pd.read_csv(
        benign_path, sep=',', header=None, nrows=benign_rows
    ).drop(columns=[0, 1, 2, 3, 4])

    # Merge data and generate labels. `DataFrame.as_matrix()` no longer exists
    # in pandas >= 1.0; `to_numpy()` is its replacement.
    data = np.concatenate((isis_df.to_numpy(), benign_df.to_numpy()))
    labels = np.hstack((np.zeros(len(isis_df)), np.ones(len(benign_df))))

    return data, labels

# Bernoulli-distributed Naive-Bayes method

In [224]:
def train(clf, vectorizer, X_train, y_train):
    """Fit the vectorizer and the classifier on the training documents.

    ``vectorizer.fit_transform`` is applied to ``X_train`` and its result,
    together with ``y_train``, is passed to ``clf.fit``.

    Returns
    -------
    tuple
        ``(clf, vectorizer)`` -- the same two objects that were passed in.
    """
    train_features = vectorizer.fit_transform(X_train)
    clf.fit(train_features, y_train)
    return clf, vectorizer

def test(clf, vectorizer, X_test, y_test):
    X_test = vectorizer.transform(X_test)
    return clf.predict(X_test)

# For evaluation and debugging purposes
def evaluate(X_train, y_train, X_test, y_test):
    """Train a fresh Bernoulli Naive Bayes model and score it on the test set.

    A new ``CountVectorizer`` (unigrams only, English stop words,
    undecodable bytes ignored) is fitted on ``X_train``; a new
    ``BernoulliNB`` is fitted on the resulting features and ``y_train``.

    Returns
    -------
    The value of ``clf.score`` on the vectorized ``X_test`` and ``y_test``.
    """
    bag_of_words = CountVectorizer(ngram_range=(1, 1), stop_words='english', decode_error='ignore')
    model = BernoulliNB()
    model.fit(bag_of_words.fit_transform(X_train), y_train)
    return model.score(bag_of_words.transform(X_test), y_test)

# Split data, run classifier

In [227]:
# Fixed seed so the shuffled train/test split -- and therefore the score
# displayed below -- is the same on every Restart & Run All.
SEED = 42

data, labels = build_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, random_state=SEED
)

# `data` is a 2-D array; the vectorizer is given a 1-D sequence of documents,
# so keep only the first column of each split.
X_train = X_train[:, 0]
X_test = X_test[:, 0]

evaluate(X_train, y_train, X_test, y_test)

0.95640802092415