# Tweet & URL Classifier

### Loads and preprocesses collected tweet data, then performs binary classification into Spam/Not-Spam by utilizing the trained models.
### Utilizes `VirusTotal API` to lookup URLs embedded into tweets to check for malicious URLs.
#### `Python 3.9.9`

In [1]:
import pandas as pd
import en_core_web_sm
from spacymoji import Emoji
import warnings
warnings.filterwarnings("ignore")

import csv
import os
import datetime

from joblib import load
import plotly.express as px

import requests
from time import sleep

***
## Natural Language Processing, Text Preprocessing and Metadata Extraction Method Definitions
>### - `custom_tokenize`(`string`):
>Splits the string into tokens using SpaCy's custom NLP tokenization. Removes emoji, stop-word, numeric and URL tokens.
***
>### - `custom_normalize`(`tokens`):
>Takes a list of tokens, lowers and lemmatizes them, and joins them into a string.
***
>### - `custom_tokenize_normalize`(`string`):
>Performs custom tokenization and normalization on a string.
***
>### - `break_down_tweet`(`string`):
>Splits the string into tokens using SpaCy's custom NLP tokenization. Extracts URL, `@`s, `#`tags, emoji, numeric, and stop-word tokens. Retains and returns the normalized left-over tokens, the number of numeric, emoji and stop-word tokens, and the lists of emoji, `@`s, and `#`tags tokens.
***
>### - `meta_tweets`(`df`):
>Extracts the metadata of tweets in df dataframe and returns an updated dataframe including the metadata.

In [2]:
# Load spaCy's small English pipeline; every tokenization helper below shares it.
nlp = en_core_web_sm.load()
# NOTE(review): this instance is not referenced again in the visible cells; the
# pipe on the next line is added by name. Confirm whether constructing it here
# is still required by the installed spacymoji version.
emoji = Emoji(nlp)
# Put the emoji component first in the pipeline; the helpers below rely on the
# `token._.is_emoji` extension being available.
nlp.add_pipe('emoji', first=True)

def custom_tokenize(string):
    """Tokenize `string` with the shared spaCy pipeline and keep content tokens.

    Tokens flagged as emoji, stop-word, number-like or URL-like are dropped.
    Returns the remaining spaCy tokens in their original order.
    """
    return [
        tok
        for tok in nlp(string)
        if not (tok._.is_emoji or tok.is_stop or tok.like_num or tok.like_url)
    ]

def custom_normalize(tokens):
    """Lower-case and lemmatize `tokens`, returning them joined by single spaces.

    A token whose lemma is the placeholder "-PRON-" contributes its lower-cased
    surface form; every other token contributes its lower-cased lemma with
    surrounding whitespace stripped.
    """
    def _normal_form(tok):
        if tok.lemma_ == "-PRON-":
            return tok.lower_
        return tok.lemma_.lower().strip()

    return ' '.join(_normal_form(tok) for tok in tokens)

def custom_tokenize_normalize(string):
    """Tokenize `string`, then normalize the kept tokens into a single string."""
    kept_tokens = custom_tokenize(string)
    return custom_normalize(kept_tokens)

def break_down_tweet(string, normalize=True):
    """Split a tweet into content tokens plus per-category metadata.

    URL-like tokens are discarded. Every other token goes into the first
    category it matches, checked in this order: @-mention (prefix '@'),
    hashtag (prefix '#'), emoji, number-like, stop-word. Tokens matching none
    of these are kept as content tokens.

    Returns a 7-tuple:
        (tokens, nums, emojis, emoji_num, stop_word_num, ats, hashtags)
    where `tokens` is the string produced by `custom_normalize` when
    `normalize` is true and the raw list of content tokens otherwise.
    """
    content = []
    emojis, ats, hashtags = [], [], []
    nums = stop_word_num = 0

    for tok in nlp(string):
        try:
            if tok.like_url:
                continue
            if tok.prefix_ == '@':
                ats.append(tok)
            elif tok.prefix_ == '#':
                hashtags.append(tok)
            elif tok._.is_emoji:
                emojis.append(tok)
            elif tok.like_num:
                nums += 1
            elif tok.is_stop:
                stop_word_num += 1
            else:
                content.append(tok)
        except Exception as e:
            # Best-effort: a token that cannot be inspected is reported and skipped.
            print(e)

    # Every emoji token is collected exactly once, so the count is the list length.
    emoji_num = len(emojis)
    tokens = custom_normalize(content) if normalize else content

    return tokens, nums, emojis, emoji_num, stop_word_num, ats, hashtags

def meta_tweets(df):
    """Return a copy of `df` extended with metadata extracted from each tweet.

    `df` must have a 'Tweet' column of strings. The result keeps the rows,
    index and columns of `df` and adds, in this order:
        'Tokens'          normalized content tokens as one string
        'Numerical'       count of number-like tokens
        'Emojis'          list of emoji tokens
        'Emoji count'     number of emoji tokens
        'Stop-word count' number of stop-word tokens
        '@s'              list of @-mention tokens
        '#s'              list of hashtag tokens

    The input frame is not modified. An empty input yields an empty frame that
    still carries the metadata columns.
    """
    # Collect each new column in a plain list and attach them once at the end.
    # Growing a DataFrame row by row with DataFrame.append is quadratic, and the
    # method no longer exists in pandas >= 2.0.
    tokens_col, nums_col, emojis_col = [], [], []
    emoji_count_col, stop_count_col, ats_col, hashtags_col = [], [], [], []

    for tweet in df['Tweet']:
        tokens, nums, emojis, emoji_num, stop_word_num, ats, hashtags = break_down_tweet(tweet)
        # break_down_tweet normalizes by default, so `tokens` is already a string.
        tokens_col.append(tokens)
        nums_col.append(nums)
        emojis_col.append(emojis)
        emoji_count_col.append(emoji_num)
        stop_count_col.append(stop_word_num)
        ats_col.append(ats)
        hashtags_col.append(hashtags)

    meta_df = df.copy()
    index = meta_df.index
    meta_df['Tokens'] = pd.Series(tokens_col, index=index, dtype=object)
    meta_df['Numerical'] = nums_col
    # dtype=object keeps each per-tweet list intact as a single cell value.
    meta_df['Emojis'] = pd.Series(emojis_col, index=index, dtype=object)
    meta_df['Emoji count'] = emoji_count_col
    meta_df['Stop-word count'] = stop_count_col
    meta_df['@s'] = pd.Series(ats_col, index=index, dtype=object)
    meta_df['#s'] = pd.Series(hashtags_col, index=index, dtype=object)
    return meta_df

>### - `load_tweet_data`():
>Loads the collected tweets into memory, performs preprocessing and extracts metadata of the first 100000 tweets collected. Returns a dataframe with data and extracted metadata.

In [3]:
def load_tweet_data(path='../Data/collected_tweet_data.txt', limit=100000):
    """Load collected tweets from `path` into a dataframe.

    Each non-blank line of the file must hold one tweet written as a Python
    dict literal. At most `limit` tweets are read; rows sharing a 'TweetID'
    are then de-duplicated, so the result can hold fewer than `limit` rows.

    Parameters:
        path:  text file with one tweet dict per line (UTF-8).
        limit: maximum number of tweets to read before de-duplication.

    Returns a dataframe with the columns
        UserID, UserCreatedAt, TweetID, Tweet, CreatedAt, Spam, URLs Expanded,
        NumberOfFollowers, NumberOfTweets, LengthOfScreenName,
        LengthOfDescriptionInUserProfile, TimeDelta(Days)
    where 'Spam' is initialised to 'Unknown' and 'TimeDelta(Days)' is the
    whole number of days between account creation and the tweet.

    Raises ValueError/SyntaxError if a line is not a valid literal and
    KeyError if a tweet lacks an expected field.
    """
    # Local import keeps this cell self-contained.
    import ast

    columns = ['UserID', 'UserCreatedAt', 'TweetID', 'Tweet', 'CreatedAt', 'Spam', 'URLs Expanded',
               'NumberOfFollowers', 'NumberOfTweets', 'LengthOfScreenName',
               'LengthOfDescriptionInUserProfile', 'TimeDelta(Days)']
    timestamp_format = '%a %b %d %H:%M:%S %z %Y'

    # Rows are gathered in a list and turned into a DataFrame once; appending
    # to a DataFrame per row is quadratic and DataFrame.append is gone in
    # pandas >= 2.0.
    records = []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            if len(records) >= limit:
                break
            line = line.strip()
            if not line:
                continue
            # SECURITY: the file holds text collected from Twitter, so it must
            # never be passed to eval(). literal_eval only accepts literals.
            # NOTE(review): assumes every line is a pure dict literal — confirm
            # against the collector that wrote the file.
            t = ast.literal_eval(line)
            user = t['user']
            ucat = user['created_at']
            cat = t['created_at']
            account_age = (datetime.datetime.strptime(cat, timestamp_format)
                           - datetime.datetime.strptime(ucat, timestamp_format))
            records.append({
                'UserID': user['id'],
                'UserCreatedAt': ucat,
                'TweetID': t['id'],
                'Tweet': t['text'],
                'CreatedAt': cat,
                'Spam': 'Unknown',
                'URLs Expanded': [url['expanded_url'] for url in t['entities']['urls']],
                'NumberOfFollowers': user['followers_count'],
                'NumberOfTweets': user['statuses_count'],
                'LengthOfScreenName': len(user['screen_name']),
                # A profile without a description may carry None instead of ''.
                'LengthOfDescriptionInUserProfile': len(user['description'] or ''),
                'TimeDelta(Days)': account_age.days,
            })

    collected_tweet_data_df = pd.DataFrame(records, columns=columns)
    return collected_tweet_data_df.drop_duplicates(subset='TweetID')

Load collected tweets with their metadata in `collected_tweets_df` dataframe.

In [4]:
# Parse the collected tweets once; the classification and URL-analysis cells below reuse this dataframe.
collected_tweets_df = load_tweet_data()

***
## Tweet Classification (Spam/Not Spam)
Load classifiers:
- Logistic Regression w/ TF-IDF vectorization: `lr_tfidf`
- KNearest Neighbors w/ TF-IDF vectorization: `knn_tfidf`
- SVM (Support Vector Machine) w/ TF-IDF vectorization: `svm_tfidf`
- MNB (Multinomial Naive Bayes) w/ TF-IDF vectorization: `mnb_tfidf`
- DT (Decision Tree) w/ TF-IDF vectorization: `dt_tfidf`

- Advanced SVM Model w/ TF-IDF vectorization: `adv_model`

In [9]:
# Load the pre-trained classifiers from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load model
# files that come from a trusted source.
try:
    lr_tfidf = load('../Models/Basic/p_lr_tfidf_m.joblib')
    knn_tfidf = load('../Models/Basic/p_knn_tfidf_m.joblib')
    svm_tfidf = load('../Models/Basic/p_svm_tfidf_m.joblib')
    mnb_tfidf = load('../Models/Basic/p_mnb_tfidf_m.joblib')
    dt_tfidf = load('../Models/Basic/p_dt_tfidf_m.joblib')
    adv_model = load('../Models/Advanced/advanced_model.joblib')
except Exception as e:
    # Best-effort: the error is only printed, so the failing model and every
    # model after it stay undefined and later cells will raise NameError.
    print(e)

***
## Tweet Classification Method Definitions:
>### - `predict_tweets_basic`(`model`, `collected_tweets_df`):
>Utilizes the specified basic model with the tweets' textual data to classify them into Spam/Not-Spam.
***
>### - `predict_tweets_adv`(`model`, `collected_tweets_df`):
>Utilizes the specified advanced model with the tweets' data to classify them into Spam/Not-Spam.
***
>### - `show_results`(`predictions`):
>Displays the number of Spam and Not-Spam tweets.
***
>### - `plot_pie_chart`(`spam`, `not_spam`):
>Displays an interactive pie chart with the results.

In [None]:
def predict_tweets_basic(model, collected_tweets_df, plot=True, save_plot=False):
    """Classify tweets with a basic model using only their lower-cased text.

    Parameters:
        model:               fitted estimator exposing `predict`.
        collected_tweets_df: dataframe with a 'Tweet' column of strings.
        plot, save_plot:     forwarded to `show_results`.

    Prints the spam / not-spam counts (and optionally plots them), then
    returns the predictions produced by `model.predict`.
    """
    lowered_text = collected_tweets_df['Tweet'].str.lower()
    predictions = model.predict(lowered_text)
    show_results(predictions, plot, save_plot)

    return predictions

def predict_tweets_adv(model, collected_tweets_df, plot=True, save_plot=False):
    """Classify tweets with the advanced model using text and numeric metadata.

    Metadata is extracted with `meta_tweets`; the identifier, raw-text and
    token-list columns are then dropped and what remains is passed to
    `model.predict`.

    Parameters:
        model:               fitted estimator exposing `predict`.
        collected_tweets_df: dataframe as returned by `load_tweet_data`.
        plot, save_plot:     forwarded to `show_results`.

    Prints the spam / not-spam counts (and optionally plots them), then
    returns the predictions. Raises KeyError if one of the dropped columns is
    missing from the input.
    """
    # Extract metadata exactly once. The previous version ran meta_tweets a
    # second time only to fill a 'Tweet' column that was dropped immediately
    # afterwards, doubling the NLP cost for no effect on the features.
    features = meta_tweets(collected_tweets_df).drop(
        columns=['Tweet', 'UserID', 'UserCreatedAt', 'CreatedAt', 'TweetID',
                 'URLs Expanded', 'Emojis', '@s', '#s'])
    predictions = model.predict(features)
    show_results(predictions, plot, save_plot)

    return predictions

def show_results(predictions, plot=True, save_plot=False):
    """Print how many predictions are spam / not spam and optionally plot them.

    A prediction counts as spam when it compares equal to True and as not
    spam when it compares equal to False; any other value is ignored.
    When `plot` is true the counts are handed to `plot_pie_chart` together
    with `save_plot`.
    """
    spam = not_spam = 0

    for label in predictions:
        # Equality (not identity) so that 1/0 and numpy booleans are counted too.
        if label == True:
            spam += 1
            continue
        if label == False:
            not_spam += 1
    print('Spam:', spam, '\tNot Spam:', not_spam)

    if not plot:
        return
    plot_pie_chart(spam, not_spam, save_plot)
    
def plot_pie_chart(spam, not_spam, save_plot=False):
    """Show an interactive pie chart of the spam / not-spam counts.

    When `save_plot` is true the figure is also written to "pie.html" in the
    current working directory.
    """
    fig = px.pie(values=[spam, not_spam], names=('Spam', 'Not Spam'))
    # Fixed colours: first slice (Spam) red, second slice (Not Spam) blue.
    fig.update_traces(marker={'colors': ['#ef553b', '#636efa']})
    fig.show()

    if not save_plot:
        return
    # Save as html
    fig.write_html("pie.html")


### Basic Logistic Regression with TF-IDF Predictions

In [11]:
# Classify every collected tweet with the Logistic Regression model; prints the counts and shows the pie chart.
lr_predictions = predict_tweets_basic(lr_tfidf, collected_tweets_df)

Spam: 65116 	Not Spam: 34884


### Basic K-Nearest Neighbors with TF-IDF Predictions

In [12]:
# Classify every collected tweet with the K-Nearest Neighbors model; prints the counts and shows the pie chart.
knn_predictions = predict_tweets_basic(knn_tfidf, collected_tweets_df)

Spam: 99916 	Not Spam: 84


### Basic SVM (Support Vector Machine) with TF-IDF Predictions

In [13]:
# Classify every collected tweet with the SVM model; prints the counts and shows the pie chart.
svm_predictions = predict_tweets_basic(svm_tfidf, collected_tweets_df)

Spam: 57029 	Not Spam: 42971


### Basic Multinomial Naive Bayes with TF-IDF Predictions

In [14]:
# Classify every collected tweet with the Multinomial Naive Bayes model; prints the counts and shows the pie chart.
mnb_predictions = predict_tweets_basic(mnb_tfidf, collected_tweets_df)

Spam: 80139 	Not Spam: 19861


### Basic Decision Tree with TF-IDF Predictions

In [15]:
# Classify every collected tweet with the Decision Tree model; prints the counts and shows the pie chart.
dt_predictions = predict_tweets_basic(dt_tfidf, collected_tweets_df)

Spam: 34575 	Not Spam: 65425


In [27]:
# Weighted vote of the five basic models.
# Each model's prediction becomes a signed vote (+weight for a non-zero / True
# prediction, -weight for 0 / False) stored in a column named after the model.
# NOTE(review): the weights are presumably each model's evaluation score —
# confirm where these numbers come from.
model_votes = {
    'LR': (lr_predictions, 0.906),
    'KNN': (knn_predictions, 0.727),
    'SVM': (svm_predictions, 0.913),
    'MNB': (mnb_predictions, 0.893),
    'DT': (dt_predictions, 0.797),
}

for vote_column, (model_predictions, vote_weight) in model_votes.items():
    # Cast to float first so boolean and integer predictions are handled alike
    # and the column ends up numeric instead of a mixed object column.
    raw_votes = pd.Series(model_predictions, index=collected_tweets_df.index).astype(float)
    collected_tweets_df[vote_column] = raw_votes.where(raw_votes != 0, -1.0) * vote_weight

# Compute the combined score once, adding the columns in a fixed order.
vote_score = (collected_tweets_df['LR'] + collected_tweets_df['KNN'] + collected_tweets_df['SVM']
              + collected_tweets_df['MNB'] + collected_tweets_df['DT'])

# A positive score marks spam, a negative one not-spam; an exact tie leaves
# the existing 'Spam' value untouched.
collected_tweets_df.loc[vote_score > 0, 'Spam'] = True
collected_tweets_df.loc[vote_score < 0, 'Spam'] = False
# The score is stored for every row (the former `!= None` mask was always true).
collected_tweets_df['Score'] = vote_score

In [28]:
# Decide each tweet's final label by comparing the total weight of the models
# voting "spam" (positive scores) with the total weight voting "not spam"
# (all remaining scores, taken as absolute values).
classifications = []
vote_columns = ['LR', 'KNN', 'SVM', 'MNB', 'DT']

for _, tweet_row in collected_tweets_df.iterrows():
    votes = tweet_row[vote_columns]
    spam_weight = sum(vote for vote in votes if vote > 0)
    not_spam_weight = sum(abs(vote) for vote in votes if not vote > 0)
    # Spam only on a strict majority of weight; a tie counts as not spam.
    classifications.append(bool(spam_weight > not_spam_weight))

collected_tweets_df['Spam'] = classifications
show_results(classifications)

Spam: 80248 	Not Spam: 19752


### Advanced SVM (Support Vector Machine) with TF-IDF vectorization and One-Hot Encoding for numeric metadata Predictions

In [18]:
# Classify with the advanced model; predict_tweets_adv first extracts metadata from every tweet via meta_tweets.
adv_predictions = predict_tweets_adv(adv_model, collected_tweets_df)

Spam: 12465 	Not Spam: 87535


***
## Test for malicious URLs via VirusTotal API
VirusTotal API Keys [*Retrieve and replace keys from `API Keys.zip` using the password provided*]

In [5]:
# Placeholder value — replace with a real VirusTotal API key before running the lookups.
virustotal_api = "virustotal_api"
# Requests still available today; analyse_urls decrements it and returns the remainder.
quota = 500

>### - `analyse_urls`(`urls`, `quota`):
>Utilizes VirusTotal API to lookup a list of URLs (`urls`), and saves the results in `tested_urls.csv`. Excludes Twitter, Instagram and YouTube domains to increase efficiency, as they are considered safe.

In [6]:
def _vt_verdict(harmless_count, malicious_count):
    """Translate VirusTotal engine counts into the label stored under 'Analysis'."""
    if malicious_count + harmless_count == 0:
        return 'Unknown'
    if malicious_count > 10:
        return 'Malicious'
    if malicious_count > 0:
        return 'Potentially Malicious'
    return 'Harmless'


def _vt_lookup(url):
    """Submit `url` to VirusTotal and return its (harmless, malicious) counts."""
    headers = {'x-apikey': virustotal_api}
    submission = requests.post('https://www.virustotal.com/api/v3/urls', headers=headers,
                               data={'url': url}, timeout=30)
    scan_id = submission.json()['data']['id']
    # NOTE(review): the analysis is fetched right after submission. If the scan
    # has not finished yet the stats may still be all zero, which is labelled
    # 'Unknown' — consider polling until the analysis reports completion.
    analysis = requests.get('https://www.virustotal.com/api/v3/analyses/' + scan_id, headers=headers,
                            params={'id': scan_id}, timeout=30)
    stats = analysis.json()['data']['attributes']['stats']
    return stats['harmless'], stats['malicious']


def analyse_urls(urls, quota, results_path='../Data/tested_urls.csv', daily_quota=500):
    """Look up tweet URLs on VirusTotal, caching verdicts in a CSV file.

    Parameters:
        urls:         iterable of URL lists, one list per tweet (items may be None).
        quota:        number of lookups still allowed today.
        results_path: CSV file holding earlier verdicts; new ones are appended.
        daily_quota:  value `quota` is reset to when the calendar day changes
                      during a run.

    Returns (responses, tested_urls, quota):
        responses:   one list per input list, holding one entry per URL —
                     None for a missing URL, a skipped lookup (quota used up)
                     or a failed lookup; 'Twitter' for the domains treated as
                     safe; otherwise the verdict label.
        tested_urls: dataframe of all cached and newly obtained verdicts.
        quota:       lookups remaining after this run.
    """
    # Domains considered safe are never looked up. The label 'Twitter' is used
    # for all of them.
    safe_prefixes = ('https://twitter.com/', 'https://www.instagram.com/', 'https://youtu.be/')

    if os.path.exists(results_path):
        tested_urls = pd.read_csv(results_path).drop_duplicates(subset=['URL'])
    else:
        tested_urls = pd.DataFrame(columns=['URL', 'Analysis', 'Harmless', 'Malicious'])
    columns = list(tested_urls.columns)

    # Verdicts keyed by the URL exactly as stored. URL paths are case-sensitive,
    # so the key is not lower-cased.
    known = dict(zip(tested_urls['URL'], tested_urls['Analysis']))
    new_rows = []
    responses = []
    start = datetime.date.today()

    for url_list in urls:
        if (datetime.date.today() - start).days >= 1:
            quota = daily_quota
            start = datetime.date.today()

        url_list_responses = []
        for url in url_list:
            if url is None:
                url_list_responses.append(None)
            elif url.lower().startswith(safe_prefixes):
                url_list_responses.append('Twitter')
            elif url in known:
                url_list_responses.append(known[url])
            elif quota <= 0:
                print('Used up quota for the day. Skipping URL.')
                url_list_responses.append(None)
            else:
                if quota % 100 == 0:
                    print(quota, "requests left today at", datetime.datetime.now(), ".")
                # Count the attempt up front so the quota can never be exceeded.
                quota -= 1
                try:
                    harmless_count, malicious_count = _vt_lookup(url)
                except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                    # Network failure or unexpected payload: report it, record
                    # no verdict for this URL and carry on with the next one.
                    print(e)
                    url_list_responses.append(None)
                else:
                    verdict = _vt_verdict(harmless_count, malicious_count)
                    row = {'URL': url, 'Analysis': verdict,
                           'Harmless': harmless_count, 'Malicious': malicious_count}
                    # Persist immediately so an interrupted run keeps its results.
                    # The header is written only when the file is created.
                    needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0
                    pd.DataFrame([row], columns=columns).to_csv(
                        results_path, mode='a', index=False, header=needs_header)
                    new_rows.append(row)
                    known[url] = verdict
                    url_list_responses.append(verdict)
                # Pause after every attempt, successful or not, to respect the API rate limit.
                sleep(15)
        # Exactly one entry per tweet keeps `responses` aligned with `urls`.
        responses.append(url_list_responses)

    if new_rows:
        tested_urls = pd.concat([tested_urls, pd.DataFrame(new_rows, columns=columns)],
                                ignore_index=True)

    return responses, tested_urls, quota

Analyse URLs

In [None]:
# Look up every tweet's expanded URLs; the remaining quota is stored back so a later run continues from it.
responses, tested_urls, quota = analyse_urls(collected_tweets_df['URLs Expanded'], quota)

### Displays results of analysed URLs by 83 contributors:
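>- ### `Potentially Malicious`: If `Malicious` count is between `1` and `10`.
>- ### `Malicious`: If `Malicious` count is greater than `10`.
>- ### `Harmless`: If `Malicious` count is `0` and `Harmless` count is greater than `0` (`83` when every contributor reports the URL as harmless).
>- ### `Unknown`: If no contributor reports the URL as either harmless or malicious.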

In [13]:
# Summarise the cached VirusTotal verdicts: print the count per label and show
# them as a pie chart.
url_results = pd.read_csv('../Data/tested_urls.csv')
url_results = url_results.drop_duplicates(subset=['URL'])
# Count once and reuse; labels and values stay in the same (descending) order.
analysis_counts = url_results['Analysis'].value_counts()
labels = analysis_counts.index.values.astype(str)
values = analysis_counts.values
print(len(url_results), 'URLs analysed:')
for label, count in zip(labels, values):
    print(label, ':', count)

# Tie each colour to its label. A positional colour list would follow the
# frequency ranking, so e.g. 'Harmless' would turn grey whenever it became the
# most frequent label.
label_colors = {
    'Unknown': 'grey',
    'Harmless': 'green',
    'Potentially Malicious': 'orange',
    'Malicious': 'red',
}
fig = px.pie(values=values, names=labels, color=labels, color_discrete_map=label_colors)
fig.show()

3085 URLs analysed:
Unknown : 2888
Harmless : 151
Potentially Malicious : 46
