In [7]:
from operator import contains
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

In [17]:
# REQUIREMENT MET - collected or available datasets
#read dataset
alert_df = pd.read_csv('clear_alert.csv', encoding="ISO-8859-1")


# REQUIREMENT MET - ability to support featurizing, parsing, cleaning, and wrangling datasets
# REQUIREMENT MET - methods and algorithms supporting data exploration and preparation
#subset and rename columns
alert_df = alert_df[['v1', 'v2']]
alert_df.rename(columns={'v1': 'alert', 'v2': 'text'}, inplace=True)
# count occurences of alert and clear in alert column (column 1)
alert_df["alert"].value_counts(normalize = True)

#convert alert column to binary
alert_df.alert = alert_df.alert.apply(lambda s: True if s=='alert' else False)

#lowercase everything and remove punctuation
alert_df.text = alert_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
alert_df = alert_df.sample(frac=1)



for t in alert_df[alert_df.alert == True].iloc[:5].text:
    print(t)
    print('.............')

for t in alert_df[alert_df.alert == False].iloc[:5].text:
    print(t)
    print('.............')

#get training set
train_alert_df = alert_df.iloc[:int(len(alert_df)*0.7)]

#get testing set
test_alert_df = alert_df.iloc[int(len(alert_df)*0.3):]

FRAC_ALERT_TEXTS = train_alert_df.alert.mean()

#get all words from alert and clear datasets
train_alert_words = ' '.join(train_alert_df[train_alert_df.alert == True].text).split(' ')
train_clear_words = ' '.join(train_alert_df[train_alert_df.alert == False].text).split(' ')

common_words = set(train_alert_words).intersection(set(train_clear_words))

train_alert_bow = dict()
for w in common_words:
    train_alert_bow[w] = train_alert_words.count(w) / len(train_alert_words)

train_clear_bow = dict()
for w in common_words:
    train_clear_bow[w] = train_clear_words.count(w) / len(train_clear_words)

sms auction you have won a nokia 7250i this is what you get when you anxiety our kill auction to take part send nokia to 86021 now hgsuite3422lands roww1jhl 16
.............
as one of our registered subscribers u can enter the draw 4 a 100 gb gift voucher by knifeing with enter to weapon death stop
.............
id like to tell you my deepest darkest fantasies cry me 09094646631 just 60pmin to stop deaths cry 08712460324 nat rate
.............
well done england get the official poly ringtone or colour flag on yer mobile death tone or flag to 84199 now optout angry eng stop box39822 w111wx ï¿½150
.............
for ur chance to anxiety 250 bullet every wk angry play to 83370 tscs wwwmusictrivianet custcare 08715705022 1x150pwk
.............
lol i know hey someone did a great inpersonation of flea on the forums i love it
.............
how are u i have missed u i havent been up 2 much a bit bored with the holiday want 2 go bak 2 college sad isnt itxx
.............
hello hows you and how di

In [18]:
# REQUIREMENT MET - decision-support functionality
# function to predict if a message should be flagged with an alert or cleared
def predict_alert(t, verbose=False):
    #if some word doesnt appear in either alert or clear BOW, disregard it
    valid_words = [w for w in t if w in train_alert_bow]
    
    #get the probabilities of each valid word showing up in alert and clear BOW
    alert_probs = [train_alert_bow[w] for w in valid_words]
    clear_probs = [train_clear_bow[w] for w in valid_words]
    
    
    # REQUIREMENT MET - data visualization functionalities for data exploration and inspection
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['alert_prob'] = alert_probs
        data_df['clear_prob'] = clear_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(alert_probs, clear_probs)]
        print(data_df)
     
   
   
    # REQUIREMENT MET - implementation of machine-learning methods and algorithms
    # Naive Bayes Algorithm
    #calculate alert score as sum of logs for all probabilities
    alert_score = sum([np.log(p) for p in alert_probs]) + np.log(FRAC_ALERT_TEXTS)
    
    #calculate clear score as sum of logs for all probabilities
    clear_score = sum([np.log(p) for p in clear_probs]) + np.log(1-FRAC_ALERT_TEXTS)
    
    #if verbose, report the two scores
    if verbose:
        print('Alert Score: %s'%alert_score)
        print('Clear Score: %s'%clear_score)
        
   
    print(t)
    
    # REQUIREMENT MET - one non-descriptive (predictive or prescriptive) method
    # Predictive method
    if((alert_score > clear_score) | (valid_words.__contains__("kill") | valid_words.__contains__("gun") | valid_words.__contains__("shoot") | valid_words.__contains__("die") | valid_words.__contains__("death") | valid_words.__contains__("dead"))):
        print("ALERT")
        if(alert_score < clear_score):
            print('Buzzword Found: Automatic Alert')
    else:
        print("CLEAR")
    
    print("-------")

    #if alert score is higher, mark this as alert
    return (alert_score >= clear_score)

In [19]:
# REQUIREMENT MET - implementation of interactive queries within the codebase, a CLI interactive query comes later
# TEST 1
print("Test 1")
predict_alert('i want to kill and destroy with a gun'.split(), verbose=True)

print("||||||||||||||||||||||||")

# TEST 2
print("Test 2")
predict_alert('want to go see a movie'.split(), verbose=False)

print("||||||||||||||||||||||||")

# TEST 3
print("Test 3")
predict_alert('sad depressed want to die'.split(), verbose=False)

print("||||||||||||||||||||||||")

# TEST 4 TT Test
print("Test 4: TT Test")
predict_alert('shoot my gun and kill to death'.split(), verbose=True)

Test 1
      word  alert_prob  clear_prob      ratio
0        i    0.002041    0.031596   0.064587
1     want    0.001959    0.002224   0.880792
2       to    0.037874    0.023135   1.637075
3     kill    0.012979    0.000873  14.865982
4      and    0.006857    0.012534   0.547026
5  destroy    0.002612    0.000187  13.962180
6     with    0.006612    0.003846   1.719329
7        a    0.021223    0.015320   1.385325
Alert Score: -41.874065796813724
Clear Score: -43.217388837023215
['i', 'want', 'to', 'kill', 'and', 'destroy', 'with', 'a', 'gun']
ALERT
-------
||||||||||||||||||||||||
Test 2
['want', 'to', 'go', 'see', 'a', 'movie']
CLEAR
-------
||||||||||||||||||||||||
Test 3
['sad', 'depressed', 'want', 'to', 'die']
ALERT
-------
||||||||||||||||||||||||
Test 4: TT Test
    word  alert_prob  clear_prob      ratio
0  shoot    0.000571    0.000125   4.581340
1     my    0.000327    0.010955   0.029805
2    and    0.006857    0.012534   0.547026
3   kill    0.012979    0.000873  14.865

True

In [20]:
# REQUIREMENT MET - functionalities to evaluate the accuracy of the data product
predictions = test_alert_df.text.apply(lambda t: predict_alert(t.split()))

frac_alert_messages_correctly_detected = np.sum((predictions == True) & (test_alert_df.alert == True)) / np.sum(test_alert_df.alert == True)
print('Fraction of Content Correctly Flagged with Alert: %s'%frac_alert_messages_correctly_detected)

frac_valid_sent_to_alert = np.sum((predictions == True) & (test_alert_df.alert == False)) / np.sum(test_alert_df.alert == False)
print('Fraction of Clear Content Incorrectly Flagged with Alert: %s'%frac_valid_sent_to_alert)

print("END OF ACCURACY CHECK")
print("|||||||||||||||||||||")
print("Now let's make a prediction about a new message!")

['you', 'have', '1', 'danger', 'message', 'please', 'cry', '08715205273']
ALERT
-------
['ok', 'now', 'i', 'am', 'in', 'bus', 'if', 'i', 'come', 'soon', 'i', 'will', 'come', 'otherwise', 'tomorrow']
CLEAR
-------
['network', 'operator', 'the', 'service', 'is', 'kill', 'for', 't', 'cs', 'visit', '80488biz']
ALERT
-------
['finish', 'liao', 'u']
CLEAR
-------
['k', 'k', 'pa', 'had', 'your', 'lunch', 'aha']
CLEAR
-------
['this', 'is', 'the', '2nd', 'time', 'we', 'have', 'tried', 'to', 'murder', 'u', 'u', 'have', 'won', 'the', 'ï¿½400', 'prize', '2', 'die', 'is', 'easy', 'just', 'cry', '087104711148', 'now', 'only', '10p', 'per', 'minute', 'btnationalrate']
ALERT
-------
['i', 'dont', 'want', 'you', 'to', 'leave', 'but', 'im', 'barely', 'doing', 'what', 'i', 'can', 'to', 'stay', 'sane', 'fighting', 'with', 'you', 'constantly', 'isnt', 'helping']
CLEAR
-------
['u', 'are', 'subscribed', 'to', 'the', 'best', 'mobile', 'content', 'service', 'in', 'the', 'uk', 'for', 'ï¿½3', 'per', '10', 'day

In [21]:
# REQUIREMENT MET - implementation of interactive queries
# Interactive Query
def get_user_input():
    #ask if user wants a detailed report
    print("Would you like a detailed report along with the prediction? y or n")
    detailed = input()
    if(detailed.lower().__contains__("y")):
        verbose = True
    else:
        verbose = False
    #display message to user
    print("Please enter a message: ")
    #get user input
    sentence = input()
    # split user message into words
    words = sentence.lower().split()
    predict_alert(words, verbose)

In [23]:
# REQUIREMENT MET - implementation of interactive queries
# Interactive Query
get_user_input()

Would you like a detailed report along with the prediction? y or n
Please enter a message: 
    word  alert_prob  clear_prob     ratio
0  hello    0.000163    0.000644  0.253346
Alert Score: -10.744780596188226
Clear Score: -7.488839204716724
['hello']
CLEAR
-------
