In [1]:
# Import analysis tools
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import re

from sklearn.feature_extraction.text import TfidfVectorizer

# Import algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm

# Import metrics for comparing approaches
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from IPython.display import display

# Jupyter Notebook inline graph fix
%matplotlib inline

In [2]:
# Fetch the NLTK resources this notebook relies on. Each call logs its progress
# and returns a success flag; the last flag is what the cell displays.
nltk.download('words')  # presumably needed by the named-entity chunker - TODO confirm
nltk.download('stopwords')  # stop-word lists read via stopwords.words(...)
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer
nltk.download('punkt')  # presumably the tokenizer model behind word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.tag.pos_tag
nltk.download('maxent_ne_chunker')  # presumably the model behind nltk.ne_chunk

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\JackC\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JackC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JackC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JackC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JackC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\JackC\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-

True

# Import the Data
## Training Data

In [26]:
# Load the tab-separated training set. The separator is written as the two-character
# pattern '\\t', which pandas treats as a regular expression, hence engine='python'.
train_data = pd.read_csv('mediaeval-2015-trainingset.txt', sep='\\t', engine='python', encoding='utf-8')

display(train_data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
train_data.info()
display(train_data['label'].value_counts())

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¬øSe acuerdan de la pel√≠cula: ‚ÄúEl d√≠a despu√©s de ma√±ana‚Äù? Me recuerda a lo que est√° pasando con el hurac√°n #Sandy. http://t.co/JQQeRPwN,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,"@milenagimon: Miren a Sandy en NY! Tremenda imagen del hurac√°n. Parece el ""D√≠a de la Independencia 2"" http://t.co/41jUweux REAL! RT.",192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Hurac√°n Sandy, me recuerda a la pel√≠cula D√≠a de la Independencia #ID4 #Sandy http://t.co/PTdAXABZ",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sandy #statueofliberty üóΩ http://t.co/Ex61doZk,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14483 entries, 0 to 14482
Data columns (total 7 columns):
tweetId       14483 non-null int64
tweetText     14483 non-null object
userId        14483 non-null int64
imageId(s)    14483 non-null object
username      14483 non-null object
timestamp     14483 non-null object
label         14483 non-null object
dtypes: int64(2), object(5)
memory usage: 792.2+ KB


None

fake     6841
real     5009
humor    2633
Name: label, dtype: int64

## Testing Data

In [25]:
# Load the tab-separated test set. The separator is written as the two-character
# pattern '\\t', which pandas treats as a regular expression, hence engine='python'.
test_data = pd.read_csv('mediaeval-2015-testset.txt', sep='\\t', engine='python', encoding='utf-8')

display(test_data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
test_data.info()
display(test_data['label'].value_counts())

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,578854927457349632,kereeen RT @Shyman33: Eclipse from ISS.... http://t.co/je2hcFpVfN,70824972,eclipse_01,peay_s,Fri Mar 20 09:45:43 +0000 2015,fake
1,578874632670953472,Absolutely beautiful! RT @Shyman33: Eclipse from ISS.... http://t.co/oqwtTL0ThS,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake
2,578891261353984000,‚Äú@Shyman33: Eclipse from ISS.... http://t.co/C0VfboScRj‚Äù Ïö∞Ï£ºÏóêÏÑúÎ≥∏ 3.20 ÏùºÏãù Wow! amazing!,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake
3,578846612312748032,Eclipse from ISS.... http://t.co/En87OtvsU6,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake
4,578975333841551360,"""@ebonfigli: √âclipse vue de l'ISS... Autre chose... http://t.co/yNBN7c4O51""\n\nLa cr√©ation divine n'a pas de limite üòç",1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3781 entries, 0 to 3780
Data columns (total 7 columns):
tweetId       3781 non-null int64
tweetText     3781 non-null object
userId        3781 non-null int64
imageId(s)    3781 non-null object
username      3781 non-null object
timestamp     3781 non-null object
label         3781 non-null object
dtypes: int64(2), object(5)
memory usage: 206.9+ KB


None

fake    2564
real    1217
Name: label, dtype: int64

# Visualisation

In [None]:
# Class balance of the training labels. The Series is passed as `x=` because
# seaborn expects the plotted variable as a keyword argument, not positionally.
sns.countplot(x=train_data['label']).set_title("Training Dataset")

In [5]:
# Replace the humor labels with fake.
# The result is assigned back to the column instead of using replace(..., inplace=True)
# on `train_data['label']`: the in-place form mutates an intermediate object and is
# not guaranteed to reach train_data (pandas Copy-on-Write).
train_data['label'] = train_data['label'].replace("humor", "fake")
train_data['label'].value_counts()

fake    9474
real    5009
Name: label, dtype: int64

In [None]:
# Class balance of the training labels after relabelling. The Series is passed as
# `x=` because seaborn expects the plotted variable as a keyword argument.
sns.countplot(x=train_data['label']).set_title("Training Dataset (humor converted to fake)")

In [None]:
# Class balance of the test labels. The Series is passed as `x=` because seaborn
# expects the plotted variable as a keyword argument, not positionally.
sns.countplot(x=test_data['label']).set_title("Test Dataset")

## Length of Text

### All Tweets

In [None]:
# Working frame for the length histograms: the raw tweet text, its label (needed
# to split the plots into real/fake) and the character count of every tweet.
# The count is taken before any pre-processing.
length = train_data.loc[:, ['tweetText', 'label']].assign(
    length=lambda frame: frame['tweetText'].str.len()
)

def plotLengths(data, title):
    """Draw a 100-bin histogram of the `length` column of `data`.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide the tweet lengths under the key/column ``'length'``.
    title : str
        Title placed on the plot.

    The current figure is cleared first and the finished plot is shown
    immediately; nothing is returned.
    """
    plt.clf()

    axes = sns.distplot(
        data['length'],
        bins=100,
        hist=True,
        kde=False,
        vertical=True,
    )
    axes.set_title(title)

    # Cap the length axis at 150 to hide anomalous lengths (due to character encoding)
    plt.ylim(0, 150)
    plt.show()
    
# Length histogram over every training tweet, regardless of label
plotLengths(length, "All Tweet Lengths Histogram")

### Fake Tweets

In [None]:
# Length histogram restricted to the rows whose label is "fake"
plotLengths(length.loc[length['label'] == "fake"], "Fake Tweet Lengths Histogram")

### Real Tweets

In [None]:
# Length histogram restricted to the rows whose label is "real"
plotLengths(length.loc[length['label'] == "real"], "Real Tweet Lengths Histogram")

# Pre-Processing

In [10]:
# Extract the tweetText and convert the labels to binary
def injestData(old, train):
    """Reduce a raw tweet frame to its text and a binary fake/real label.

    Parameters
    ----------
    old : pandas.DataFrame
        Frame with at least the ``tweetText`` and ``label`` columns. It is not
        modified.
    train : bool
        When true the frame is treated as training data and de-noised:
        duplicate tweet texts and direct retweets are dropped and the index is
        rebuilt as 0..n-1.

    Returns
    -------
    pandas.DataFrame
        New frame with ``tweetText`` and an integer ``label`` column
        (1 = fake or humor, 0 = anything else).
    """
    # Extract only the tweetText and label
    new = old[['tweetText', 'label']].copy()

    # Convert the text labels to numeric. "humor" is counted as fake here as well,
    # so the encoding does not depend on an earlier cell having relabelled it.
    new['label'] = new['label'].isin(['fake', 'humor']).astype('int64')

    # Remove noise from the training data
    if train:
        # Remove duplicate tweetText, keeping the first occurrence
        new = new.drop_duplicates(subset='tweetText', keep='first')

        # Remove direct retweets (tweets that start with RT). na=False keeps rows
        # with missing text instead of putting NaN into the boolean mask.
        is_retweet = new['tweetText'].str.startswith('RT', na=False)

        # Filter and rebuild the index in one chain so no slice is mutated in place
        new = new[~is_retweet].reset_index(drop=True)

    return new

# Only the training frame gets the noise removal (train=True); the test frame is
# just reduced to tweetText plus the numeric label.
train = injestData(train_data, True)
test = injestData(test_data, False)

## Data Cleaning

In [11]:
# Clean the tweetText
def cleanText(data):
    """Normalise raw tweet texts into lower-case, lemmatised word strings.

    Parameters
    ----------
    data : iterable
        The tweets to clean, e.g. a pandas Series or a list. Items are converted
        with ``str()`` and visited in iteration order, so a Series may carry any
        index.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    tweets = []

    # Iterate over the values themselves rather than data[0..n-1]: positional
    # look-ups by label break as soon as the index is not a clean 0..n-1 range.
    for raw in data:
        # Remove URLs
        tweet = re.sub(r'https?://[^\s]+', '', str(raw))

        # Remove mentions
        tweet = re.sub(r'@\w+', '', tweet)

        # Remove non-english (non-ASCII) characters
        tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)

        # Replace every remaining special character with a space (this also splits
        # the '#' off hashtags while keeping the tag word)
        tweet = re.sub(r'\W', ' ', tweet)

        # Remove single characters surrounded by whitespace
        tweet = re.sub(r'\s+[a-zA-Z]\s+', ' ', tweet)

        # Remove a single character at the very start. The anchor must be an
        # unescaped '^'; an escaped caret would look for a literal '^' character.
        tweet = re.sub(r'^[a-zA-Z]\s+', ' ', tweet)

        # Collapse runs of whitespace into a single space
        tweet = re.sub(r'\s+', ' ', tweet)

        # Convert to lower case before lemmatising
        tweet = tweet.lower()

        # Lemmatise word by word; split()/join also trims leading/trailing spaces
        tweet = ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())

        tweets.append(tweet)

    return tweets

# Clean the tweet text content for the training and test data
train['tweetText'] = cleanText(train['tweetText'])
test['tweetText'] = cleanText(test['tweetText'])

# Prevent pandas from truncating the data so we can confirm the URLs are removed.
# A large explicit width is used instead of the -1 sentinel, which has been
# deprecated since pandas 1.0; an integer width is accepted by old and new versions.
pd.set_option('display.max_colwidth', 10000)

display(train.head())
display(test.head())

Unnamed: 0,tweetText,label
0,se acuerdan de la pelcula el da despus de maana me recuerda lo que est pasando con el huracn sandy,1
1,miren sandy en ny tremenda imagen del huracn parece el da de la independencia 2 real rt,1
2,buena la foto del huracn sandy me recuerda la pelcula da de la independencia id4 sandy,1
3,scary shit hurricane ny,1
4,my fave place in the world nyc hurricane sandy statueofliberty,1


Unnamed: 0,tweetText,label
0,kereeen rt eclipse from i,1
1,absolutely beautiful rt eclipse from i,1
2,eclipse from i 3 20 wow amazing,1
3,eclipse from i,1
4,clipse vue de i autre chose cration divine a pa de limite,1


### Word Frequency (For Visualisation)

In [None]:
def plotWordFrequency(data, title):
    """Plot the 30 most frequent words in `data`, ignoring uninformative ones.

    Parameters
    ----------
    data : iterable of str
        Tweet texts; each one is split with ``nltk.tokenize.word_tokenize``.
    title : str
        Title passed on to the frequency plot.

    English and Spanish stop words, event-specific words and platform words
    ("rt") are excluded before counting. Nothing is returned.
    """
    # Words that are specific to events and so aren't helpful in spotting patterns.
    # 'huracn' is the form left behind once non-ASCII characters have been stripped
    # from the accented spelling during cleaning.
    event_words = ['sandy', 'hurricane', 'hurricanesandy', 'new', 'nyc', 'ny', 'york', 'statue', 'statueofliberty',
                   'shark', 'newyork', 'tomb', 'sochi', 'soldier', 'liberty', 'jersey', 'nj', 'mh370', 'hurac√°n',
                   'huracn', 'boston', 'manhattan', 'bringbackourgirls', 'columbianchemicals', 'flooding', 'flood',
                   'cuba']
    platform_words = ['rt']
    en_stop_words = nltk.corpus.stopwords.words('english')
    sp_stop_words = nltk.corpus.stopwords.words('spanish')

    # A set gives constant-time membership tests for the per-word filter below
    block_words = set(en_stop_words + sp_stop_words + event_words + platform_words)

    # Lower-case before filtering so capitalised variants of blocked words are
    # excluded as well
    all_words = (
        word.lower()
        for tweet in data
        for word in nltk.tokenize.word_tokenize(tweet)
    )

    fd = nltk.FreqDist(word for word in all_words if word not in block_words)
    fd.plot(30, title=title)

In [None]:
# label == 1 selects the tweets encoded as fake by injestData
plotWordFrequency(train['tweetText'].loc[train['label'] == 1], "Most Common Words in Fake Tweets")

In [None]:
# label == 0 selects the tweets that injestData did not encode as fake
plotWordFrequency(train['tweetText'].loc[train['label'] == 0], "Most Common Words in Real Tweets")

## Attributes

### Tokenisation

In [21]:
# Tokenise the tweetText
def tokenize(data):
    """Split every text in `data` into word tokens.

    `data` is expected to offer ``.apply`` (e.g. a pandas Series of strings);
    the result holds one ``nltk.tokenize.word_tokenize`` output per element.
    """
    word_tokenizer = nltk.tokenize.word_tokenize
    return data.apply(word_tokenizer)

# Tokenise the tweetText column of both splits
train_tokens = tokenize(train['tweetText'])
test_tokens = tokenize(test['tweetText'])

# Preview a few tokenised training tweets
display(train_tokens[3:6])

## Transformers

In [22]:
# POS tag the tokenized text
def pos(data):
    """Attach part-of-speech tags to every token list in `data`.

    `data` is expected to offer ``.apply`` (e.g. a pandas Series of token
    lists); the result holds one ``nltk.tag.pos_tag`` output per element.
    """
    tagger = nltk.tag.pos_tag
    return data.apply(tagger)

train_pos = pos(train_tokens)
# The test tags must come from test_tokens; tagging train_tokens here would make
# test_pos (and everything derived from it) describe the training tweets instead.
test_pos = pos(test_tokens)

display(train_pos[3:6])

In [23]:
# NER tagging the POS text - ineffective
def ner(data):
    """Run named-entity chunking over every POS-tagged token list in `data`.

    `data` is expected to offer ``.apply`` (e.g. a pandas Series produced by
    the POS-tagging step); the result holds one ``nltk.ne_chunk`` output per
    element.
    """
    chunker = nltk.ne_chunk
    return data.apply(chunker)

# Chunk the POS-tagged tokens of both splits
train_ner = ner(train_pos)
test_ner = ner(test_pos)

# Preview a few chunked training tweets
display(train_ner[3:6])

0    [(I, PRP), (am, VBP), (in, IN), (new, JJ), (york, NN), (and, CC), (hurricanesandy, NN), (is, VBZ), (kicking, VBG), (off, RP)]
Name: tweetText, dtype: object


In [None]:
# Generate n-grams
# `s` must be bound before it is used, otherwise this cell raises NameError on a
# fresh kernel. The tokens of the first training tweet serve as the example
# sequence - NOTE(review): swap in whichever token list was actually intended.
s = train_tokens.iloc[0]
list_ngram = list(ngrams(sequence = s, n = 2))

# Implementation

In [None]:
# Removes stop words - test using Spanish ones as well
# TF-IDF features: keep at most 1500 terms, each occurring in at least 5 tweets
# (min_df) and in no more than 70% of all tweets (max_df).
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
# The vocabulary is fitted on the training text only; the test text is merely
# transformed with it.
X_train = tfidfconverter.fit_transform(train['tweetText']).toarray()
X_test = tfidfconverter.transform(test['tweetText']).toarray()

y_train = train['label']
y_test = test['label']

# n_jobs=-1 spreads the 1000 trees over all CPU cores; random_state keeps the
# fitted forest reproducible between runs.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Evaluation on the held-out test set: confusion matrix, per-class report, accuracy
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))