In [33]:
import pandas as pd
import en_core_web_sm
from spacymoji import Emoji
import warnings
warnings.filterwarnings("ignore")

import csv
import os
import datetime

from joblib import load
import plotly.express as px

import requests
from time import sleep

In [34]:
### REMOVE AFTER NEW ITERATION WORKS
def compile_urls(urls):
    """Turn stringified URL lists back into lists of URL strings.

    Each element of ``urls`` is text shaped like ``"['a', 'b']"``. The outer
    square brackets are stripped, the remainder is split on commas, and the
    quote characters around every piece are removed. A piece that ends up
    empty is stored as ``None`` (so ``"[]"`` yields ``[None]``).

    Returns:
        A list holding one inner list per element of ``urls``.
    """
    def _unquote(piece):
        # Stripping order matters: leading "'", then leading " '", then trailing "'".
        bare = piece.removeprefix("'").removeprefix(" '").removesuffix("'")
        return None if bare == '' else bare

    return [
        [_unquote(piece) for piece in raw.removeprefix('[').removesuffix(']').split(',')]
        for raw in urls
    ]

In [65]:
# Text preprocessing methods
# Load the spaCy pipeline shared by the tokenising helpers defined in this cell.
nlp = en_core_web_sm.load()
# NOTE(review): `emoji` is not referenced again in the visible cells; the pipeline
# component is registered by name on the next line — confirm this line is still needed.
emoji = Emoji(nlp)
# Register the 'emoji' component at the front of the pipeline; the helpers in this
# cell read `token._.is_emoji`, which presumably comes from it — TODO confirm.
nlp.add_pipe('emoji', first=True)

# Custom Tokenize
def custom_tokenize(string):
    """Run the shared ``nlp`` pipeline on ``string`` and keep the content tokens.

    Tokens flagged as emoji, stop-word, number-like or URL-like are discarded.

    Returns:
        The remaining tokens as a list, in document order.
    """
    def _is_noise(tok):
        return tok._.is_emoji or tok.is_stop or tok.like_num or tok.like_url

    return [tok for tok in nlp(string) if not _is_noise(tok)]

# Custom Normalize
def custom_normalize(tokens):
    """Join the normalised form of every token into one space-separated string.

    A token is normalised to its lower-cased, stripped lemma. When the lemma is
    the placeholder ``"-PRON-"`` the token's own lower-cased text is used instead.

    Returns:
        The normalised tokens joined with single spaces ('' for no tokens).
    """
    def _normalise(tok):
        if tok.lemma_ == "-PRON-":
            return tok.lower_
        return tok.lemma_.lower().strip()

    return ' '.join(_normalise(tok) for tok in tokens)

# Custom Tokenize and normalize
def custom_tokenize_normalize(string):
    """Tokenise ``string`` with ``custom_tokenize`` and normalise the result with ``custom_normalize``."""
    kept_tokens = custom_tokenize(string)
    return custom_normalize(kept_tokens)

def break_down_tweet(string, normalize=True):
    """Split a tweet into content tokens plus mention, hashtag, emoji and count features.

    Every token produced by the shared ``nlp`` pipeline is routed to exactly one
    bucket, checked in this order: URL-like (dropped), ``@`` prefix, ``#`` prefix,
    emoji, number-like, stop-word, otherwise content token. An exception raised
    while inspecting a token is printed and that token is skipped.

    Args:
        string: The tweet text.
        normalize: When true, the content tokens are passed through
            ``custom_normalize`` and returned as one string; otherwise the list
            of tokens is returned as-is.

    Returns:
        ``(tokens, nums, emojis, emoji_num, stop_word_num, ats, hashtags)`` where
        ``nums``, ``emoji_num`` and ``stop_word_num`` are counts and ``emojis``,
        ``ats`` and ``hashtags`` are lists of tokens.
    """
    content, emoji_tokens, mentions, tags = [], [], [], []
    number_count = 0
    emoji_count = 0
    stop_count = 0

    for tok in nlp(string):
        try:
            if tok.like_url:
                continue
            if tok.prefix_ == '@':
                mentions.append(tok)
            elif tok.prefix_ == '#':
                tags.append(tok)
            elif tok._.is_emoji:
                emoji_count += 1
                emoji_tokens.append(tok)
            elif tok.like_num:
                number_count += 1
            elif tok.is_stop:
                stop_count += 1
            else:
                content.append(tok)
        except Exception as e:
            print(e)

    result_tokens = custom_normalize(content) if normalize else content
    return result_tokens, number_count, emoji_tokens, emoji_count, stop_count, mentions, tags

def meta_tweets(df):
    """Return a copy of ``df`` extended with per-tweet features from ``break_down_tweet``.

    For every value in the 'Tweet' column the tweet is broken down and these
    columns are added (or overwritten if already present):

    * 'Tokens' - the normalised content tokens as one string
    * 'Numerical' - count of number-like tokens
    * 'Emojis' - list of emoji tokens
    * 'Emoji count' - number of emoji tokens
    * 'Stop-word count' - number of stop-word tokens
    * '@s' - list of tokens with an '@' prefix
    * '#s' - list of tokens with a '#' prefix

    The features are collected in plain lists and attached in one step. Growing
    the frame row by row with ``DataFrame.append`` is quadratic and that method
    no longer exists in pandas 2.x.

    Args:
        df: Frame with a 'Tweet' column. It is not modified.

    Returns:
        A new DataFrame with the same rows and index as ``df`` plus the feature columns.
    """
    feature_names = ['Tokens', 'Numerical', 'Emojis', 'Emoji count', 'Stop-word count', '@s', '#s']
    features = {name: [] for name in feature_names}

    for tweet in df['Tweet']:
        tokens, nums, emojis, emoji_num, stop_word_num, ats, hashtags = break_down_tweet(tweet)
        # break_down_tweet normalises by default, so `tokens` is already a string.
        features['Tokens'].append(tokens)
        features['Numerical'].append(nums)
        features['Emojis'].append(emojis)
        features['Emoji count'].append(emoji_num)
        features['Stop-word count'].append(stop_word_num)
        features['@s'].append(ats)
        features['#s'].append(hashtags)

    meta_df = df.copy()
    for name in feature_names:
        # Wrapping in a Series keeps list-valued cells (emojis, @s, #s) as one object per row.
        meta_df[name] = pd.Series(features[name], index=meta_df.index)
    return meta_df

In [36]:
def load_tweet_data(path='../Data/collected_tweet_data.txt'):
    """Load the collected tweets into a DataFrame with one row per unique TweetID.

    The file holds one tweet per line, written as a Python dict literal. For
    each tweet the user id and creation time, tweet id, text, creation time,
    expanded URLs, follower and status counts and the lengths of the screen
    name and profile description are extracted. 'Spam' is initialised to
    'Unknown' and 'TimeDelta(Days)' is the number of whole days between the
    account's creation and the tweet.

    Args:
        path: Location of the tweet dump. Defaults to the project data file.

    Returns:
        DataFrame with the columns listed in ``columns`` below, de-duplicated
        on 'TweetID'.
    """
    columns = ['UserID', 'UserCreatedAt', 'TweetID', 'Tweet', 'CreatedAt', 'Spam', 'URLs Expanded', 'NumberOfFollowers',
               'NumberOfTweets', 'LengthOfScreenName', 'LengthOfDescriptionInUserProfile', 'TimeDelta(Days)']
    timestamp_format = '%a %b %d %H:%M:%S %z %Y'

    # Collect plain dicts and build the frame once: appending to a DataFrame per
    # tweet is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no tweet.
                continue
            # SECURITY(review): eval() executes whatever the file contains. Only use
            # this on a trusted dump; consider ast.literal_eval or JSON for storage.
            t = eval(line)
            user = t['user']
            tweeted_at = t['created_at']
            joined_at = user['created_at']
            account_age = (datetime.datetime.strptime(tweeted_at, timestamp_format)
                           - datetime.datetime.strptime(joined_at, timestamp_format))
            records.append({
                'UserID': user['id'],
                'UserCreatedAt': joined_at,
                'TweetID': t['id'],
                'Tweet': t['text'],
                'CreatedAt': tweeted_at,
                'Spam': 'Unknown',
                'URLs Expanded': [url['expanded_url'] for url in t['entities']['urls']],
                'NumberOfFollowers': user['followers_count'],
                'NumberOfTweets': user['statuses_count'],
                'LengthOfScreenName': len(user['screen_name']),
                # A missing (None) description counts as length 0.
                'LengthOfDescriptionInUserProfile': len(user['description'] or ''),
                'TimeDelta(Days)': account_age.days,
            })

    collected_tweet_data_df = pd.DataFrame(records, columns=columns)
    return collected_tweet_data_df.drop_duplicates(subset='TweetID')

In [37]:
# Build the tweet frame from the raw dump (load_tweet_data de-duplicates on TweetID)
# and print how many rows it has.
collected_tweets_df = load_tweet_data()
print(len(collected_tweets_df))

98617


In [11]:
### REMOVE AFTER NEW ITERATION WORKS
# Load tweets from the legacy CSV export.
try:
    legacy_tweets_df = pd.read_csv('../Data/collected_tweets.csv')
except Exception as e:
    # Best effort: report the problem and leave any previously loaded frame untouched.
    # Post-processing must not run on a frame whose 'URLs Expanded' values are already
    # lists, because compile_urls only accepts the CSV's string form.
    print(e)
else:
    collected_tweets_df = legacy_tweets_df.drop_duplicates(subset=['TweetID'])
    # The CSV stores each URL list as text; turn it back into lists.
    urls = collected_tweets_df['URLs Expanded']
    collected_tweets_df['URLs Expanded'] = compile_urls(urls)
    print('Tweets Collected:', len(collected_tweets_df))

Tweets Collected: 43286


In [69]:
# Load classifiers
# Five models from ../Models/Basic plus the model from ../Models/Advanced, read from joblib files.
# NOTE(review): the first load that fails skips all the loads after it and only prints
# the error, so later cells can hit a NameError or silently reuse an object left in
# the kernel — consider letting the exception propagate.
# NOTE(review): joblib files are pickle-based; only load model files from a trusted source.
try:
    lr_tfidf = load('../Models/Basic/p_lr_tfidf_m.joblib')
    knn_tfidf = load('../Models/Basic/p_knn_tfidf_m.joblib')
    svm_tfidf = load('../Models/Basic/p_svm_tfidf_m.joblib')
    mnb_tfidf = load('../Models/Basic/p_mnb_tfidf_m.joblib')
    dt_tfidf = load('../Models/Basic/p_dt_tfidf_m.joblib')
    adv_model = load('../Models/Advanced/advanced_model.joblib')
except Exception as e:
    print(e)

In [66]:
def predict_tweets_text(model, collected_tweets_df, plot=True, save_plot=False):
    """Classify the lower-cased 'Tweet' column with ``model`` and report the result.

    Args:
        model: Object exposing ``predict``; it receives the lower-cased tweet text.
        collected_tweets_df: Frame with a 'Tweet' column.
        plot: Forwarded to ``show_results``.
        save_plot: Forwarded to ``show_results``.

    Returns:
        Whatever ``model.predict`` returns.
    """
    lowered_text = collected_tweets_df['Tweet'].str.lower()
    predictions = model.predict(lowered_text)
    show_results(predictions, plot, save_plot)

    return predictions

def predict_tweets_adv(model, collected_tweets_df, plot=True, save_plot=False):
    """Classify tweets with ``model`` using the features produced by ``meta_tweets``.

    ``meta_tweets`` is run exactly once. A second pass that copied 'Tokens' into
    'Tweet' was redundant, because 'Tweet' is dropped before prediction. It
    doubled the (expensive) NLP work without changing the features.

    Args:
        model: Object exposing ``predict``; it receives the feature frame.
        collected_tweets_df: Frame with the columns produced by ``load_tweet_data``.
        plot: Forwarded to ``show_results``.
        save_plot: Forwarded to ``show_results``.

    Returns:
        Whatever ``model.predict`` returns.
    """
    features = meta_tweets(collected_tweets_df)
    # Drop identifiers, raw text and list-valued columns; every other column is passed on.
    # NOTE(review): columns added by other cells ('LR', 'Score', ...) are not dropped
    # here — confirm the model selects only the columns it was trained on.
    features = features.drop(columns=['Tweet', 'UserID', 'UserCreatedAt', 'CreatedAt', 'TweetID', 'URLs Expanded', 'Emojis', '@s', '#s'])
    predictions = model.predict(features)
    show_results(predictions, plot, save_plot)

    return predictions

def show_results(predictions, plot=True, save_plot=False):
    """Print how many predictions equal True and how many equal False.

    Values equal to neither are ignored. When ``plot`` is true the two counts are
    handed to ``plot_pie_chart`` together with ``save_plot``.
    """
    tally = [0, 0]  # [not spam, spam]

    for label in predictions:
        if label == True:
            tally[1] += 1
        elif label == False:
            tally[0] += 1

    not_spam_total, spam_total = tally
    print('Spam:', spam_total, '\tNot Spam:', not_spam_total)

    if not plot:
        return
    plot_pie_chart(spam_total, not_spam_total, save_plot)
    
def plot_pie_chart(spam, not_spam, save_plot=False):
    """Show a two-slice pie chart of the spam / not-spam counts.

    Args:
        spam: Size of the 'Spam' slice.
        not_spam: Size of the 'Not Spam' slice.
        save_plot: When true, the figure is also written to "pie.html".
    """
    slice_names = 'Spam', 'Not Spam'
    slice_colors = ['#ef553b', '#636efa']

    chart = px.pie(values=[spam, not_spam], names=slice_names)
    chart.update_traces(marker={'colors': slice_colors})
    chart.show()

    if not save_plot:
        return
    # Save as html
    chart.write_html("pie.html")


In [54]:
# Classify the tweet text with the `lr_tfidf` model; predict_tweets_text also prints the
# spam / not-spam counts and draws the pie chart (plot defaults to True).
lr_predictions = predict_tweets_text(lr_tfidf, collected_tweets_df)

Spam: 64219 	Not Spam: 34398


In [67]:
# Classify the tweet text with the `knn_tfidf` model.
# NOTE(review): the recorded output flags 98533 of 98617 tweets as spam — check this
# model before trusting its vote in the weighted ensemble.
knn_predictions = predict_tweets_text(knn_tfidf, collected_tweets_df)

Spam: 98533 	Not Spam: 84


In [70]:
# Classify the tweet text with the `svm_tfidf` model (prints counts and plots the split).
svm_predictions = predict_tweets_text(svm_tfidf, collected_tweets_df)

Spam: 56181 	Not Spam: 42436


In [71]:
# Classify the tweet text with the `mnb_tfidf` model (prints counts and plots the split).
mnb_predictions = predict_tweets_text(mnb_tfidf, collected_tweets_df)

Spam: 79006 	Not Spam: 19611


In [72]:
# Classify the tweet text with the `dt_tfidf` model (prints counts and plots the split).
dt_predictions = predict_tweets_text(dt_tfidf, collected_tweets_df)

Spam: 34099 	Not Spam: 64518


In [73]:
# Weighted vote of the five text classifiers.
# Each model contributes +weight when it predicts spam and -weight when it predicts
# not spam (a prediction equal to 0); the signed contributions are summed into 'Score'.

# ALTER WEIGHTING AFTER MODEL RESULTS
ENSEMBLE_WEIGHTS = {'LR': 0.858, 'KNN': 0.753, 'SVM': 0.870, 'MNB': 0.844, 'DT': 0.827}
ensemble_predictions = {'LR': lr_predictions, 'KNN': knn_predictions, 'SVM': svm_predictions,
                        'MNB': mnb_predictions, 'DT': dt_predictions}

for model_name, weight in ENSEMBLE_WEIGHTS.items():
    votes = pd.Series(ensemble_predictions[model_name], index=collected_tweets_df.index).astype(int)
    # Build the signed, weighted column in one step. Writing -1 into a column of
    # booleans is an incompatible-dtype assignment that pandas is phasing out.
    collected_tweets_df[model_name] = votes.where(votes != 0, -1) * weight

# Sum once, in the fixed model order, and reuse the result for both the label and 'Score'.
ensemble_score = sum(collected_tweets_df[model_name] for model_name in ENSEMBLE_WEIGHTS)

collected_tweets_df.loc[ensemble_score > 0, 'Spam'] = True
collected_tweets_df.loc[ensemble_score < 0, 'Spam'] = False
# A score of exactly 0 leaves the existing 'Spam' value untouched.
collected_tweets_df['Score'] = ensemble_score

In [74]:
# Final ensemble label: a tweet is spam when the summed weight of the models voting
# spam (positive scores) is strictly greater than the summed weight of the models
# voting not spam (absolute value of the non-positive scores). Ties are not spam.
# Computed column-wise, which avoids a Python-level loop over every row.
model_columns = ['LR', 'KNN', 'SVM', 'MNB', 'DT']
# Cast to float so the arithmetic works even when the columns hold Python objects.
weighted_votes = collected_tweets_df[model_columns].astype(float)

spam_weight = weighted_votes.where(weighted_votes > 0, 0.0).sum(axis=1)
not_spam_weight = weighted_votes.where(weighted_votes <= 0, 0.0).abs().sum(axis=1)

classifications = (spam_weight > not_spam_weight).tolist()

collected_tweets_df['Spam'] = classifications
show_results(classifications)

Spam: 79116 	Not Spam: 19501


In [75]:
# Classify with the advanced model; predict_tweets_adv derives its features via meta_tweets.
# NOTE(review): at this point the frame also carries the 'LR', 'KNN', 'SVM', 'MNB', 'DT'
# and 'Score' columns added by the earlier cells, and predict_tweets_adv does not drop
# them — confirm the model ignores columns it was not trained on.
adv_predictions = predict_tweets_adv(adv_model, collected_tweets_df)

Spam: 12282 	Not Spam: 86335


Test for malicious URLs via VirusTotal API

In [None]:
# VirusTotal API Key
# Read from the VIRUSTOTAL_API_KEY environment variable so the real secret never has
# to be saved inside the notebook; the placeholder string is the fallback.
virustotal_api = os.environ.get('VIRUSTOTAL_API_KEY', "virustotal_api")
# Number of URL submissions still allowed today; analyse_urls counts this down.
quota = 500

In [40]:
def _classify_url_stats(harmless_count, malicious_count):
    """Map the harmless / malicious counts of a VirusTotal analysis to a verdict label."""
    if malicious_count + harmless_count == 0:
        return 'Unknown'
    if malicious_count > 10:
        return 'Malicious'
    if malicious_count > 0:
        return 'Potentially Malicious'
    return 'Harmless'


def _submit_url_for_scan(url):
    """POST ``url`` to VirusTotal and return the analysis id from the reply."""
    reply = requests.post('https://www.virustotal.com/api/v3/urls', headers={'x-apikey': virustotal_api}, data={'url': url})
    return reply.json()['data']['id']


def _fetch_scan_stats(scan_id):
    """GET the analysis ``scan_id`` and return its ``(harmless, malicious)`` counts."""
    reply = requests.get('https://www.virustotal.com/api/v3/analyses/'+scan_id, headers={'x-apikey': virustotal_api}, params={'id': scan_id})
    stats = reply.json()['data']['attributes']['stats']
    return stats['harmless'], stats['malicious']


def analyse_urls(urls,quota):
    """Label every URL of every tweet, using VirusTotal for URLs not seen before.

    For each URL, in order:

    * ``None`` stays ``None``.
    * URLs starting with the Twitter, Instagram or youtu.be prefixes are labelled
      'Twitter' without being tested.
    * URLs already present in '../Data/tested_urls.csv' (or tested earlier in this
      call) reuse the stored 'Analysis' label. Matching is on the exact URL,
      because URL paths are case-sensitive.
    * Otherwise, while ``quota`` is positive, the URL is submitted to VirusTotal,
      the result is appended to the CSV and the function sleeps 15 seconds.
    * URLs that cannot be tested (quota used up, or the request failed) get ``None``
      so that every URL keeps its position in the result.

    The quota is reset to 500 once at least one day has passed since the last reset.

    NOTE(review): the analysis is fetched right after submission; if VirusTotal has
    not finished scanning, the counts may both be 0 and the URL is stored as
    'Unknown' — confirm whether polling is needed.

    Args:
        urls: Iterable with one list of URLs (possibly empty) per tweet.
        quota: Number of submissions still allowed.

    Returns:
        ``(responses, tested_urls, quota)`` where ``responses`` holds exactly one
        list per element of ``urls``, ``tested_urls`` is the DataFrame of all
        known results and ``quota`` is the remaining allowance.
    """
    cache_path = '../Data/tested_urls.csv'
    cache_columns = ['URL', 'Analysis', 'Harmless', 'Malicious']
    skipped_prefixes = ('https://twitter.com/', 'https://www.instagram.com/', 'https://youtu.be/')
    start = datetime.date.today()

    if not os.path.exists(cache_path):
        tested_urls = pd.DataFrame(columns=cache_columns)
    else:
        tested_urls = pd.read_csv(cache_path)
        tested_urls = tested_urls.drop_duplicates(subset=['URL'])

    # Dict lookup instead of rebuilding a list of all known URLs for every URL.
    known_verdicts = dict(zip(tested_urls['URL'], tested_urls['Analysis']))
    new_records = []
    responses = []

    for url_list in urls:
        if (datetime.date.today() - start).days >= 1:
            quota = 500
            start = datetime.date.today()

        url_list_responses = []
        for url in url_list:
            if url is None:
                url_list_responses.append(None)
            elif url.lower().startswith(skipped_prefixes):
                url_list_responses.append('Twitter')
            elif url in known_verdicts:
                url_list_responses.append(known_verdicts[url])
            elif quota <= 0:
                print('Used up quota for the day. Skipping URL.')
                url_list_responses.append(None)
            else:
                if quota % 100 == 0:
                    print(quota, "requests left today.")

                try:
                    scan_id = _submit_url_for_scan(url)
                    # The submission went through, so it counts against the quota
                    # even if fetching the analysis fails afterwards.
                    quota -= 1
                    harmless_count, malicious_count = _fetch_scan_stats(scan_id)
                except Exception as e:
                    print(e)
                    url_list_responses.append(None)
                    continue

                verdict = _classify_url_stats(harmless_count, malicious_count)
                record = {'URL': url, 'Analysis': verdict, 'Harmless': harmless_count, 'Malicious': malicious_count}
                # Write the header when the file is created, so it can be read back
                # with pd.read_csv on the next run.
                write_header = not os.path.exists(cache_path)
                pd.DataFrame([record], columns=cache_columns).to_csv(cache_path, mode='a', index=False, header=write_header)
                new_records.append(record)
                known_verdicts[url] = verdict
                url_list_responses.append(verdict)
                sleep(15)

        # Exactly one entry per tweet keeps `responses` aligned with `urls`.
        responses.append(url_list_responses)

    if new_records:
        fresh = pd.DataFrame(new_records, columns=cache_columns)
        tested_urls = fresh if tested_urls.empty else pd.concat([tested_urls, fresh], ignore_index=True)

    return responses, tested_urls, quota

In [None]:
# Label the expanded URLs of every tweet; the returned quota replaces the module-level
# `quota`, so re-running this cell continues from the remaining allowance.
responses, tested_urls, quota = analyse_urls(collected_tweets_df['URLs Expanded'], quota)

In [None]:
# Pie chart of the stored URL verdicts.
url_results = pd.read_csv('../Data/tested_urls.csv')
url_results = url_results.drop_duplicates(subset=['URL'])
analysis_counts = url_results['Analysis'].value_counts()
labels = analysis_counts.index.values.astype(str)
values = analysis_counts.values

# Pin each verdict to a fixed colour. A plain colour sequence is assigned in
# value_counts (frequency) order, so the colours would change meaning with the data.
# NOTE(review): the verdict-to-colour pairing is inferred from the order of the
# original colour list (grey, green, orange, red) — confirm it is the intended one.
verdict_colors = {'Unknown': 'grey', 'Harmless': 'green', 'Potentially Malicious': 'orange', 'Malicious': 'red'}

fig = px.pie(values=values, names=labels, color=labels, color_discrete_map=verdict_colors)
fig.show()