In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import nltk
import re
from textblob import TextBlob

In [2]:
def CountHashtags(data):
    """
    count words that start with # given a data string
    """
    assert isinstance(data, str)
        
    count = len([s for s in data.split() if s.startswith('#')])
    return count
    

def CountMentions(data):
    """
    count words that start with @ given a data string
    """
    assert isinstance(data, str)
    
    count = len([s for s in data.split() if s.startswith('@')])
    return count

def RemoveHashtags(data):
    """
    Removes hashtags from a given string
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    hashtag_regex = '#[A-Za-z0-9]+'
    
    data = ' '.join(re.sub(hashtag_regex, ' ', data).split())
    
    return data
    

def RemoveMentions(data):
    """
    Removes mentions from a given string
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    mention_regex = '@[A-Za-z0-9]+'
    
    data = ' '.join(re.sub(mention_regex, ' ', data).split())
    
    return data
    

def CountStopWords(data, stop_words):
    """
    Given a text and a list of stop words, return count of step words in text
    """
    assert isinstance(data, str)
    assert isinstance(stop_words, set)
    
    count = len([s for s in data.split() if s in stop_words])
    return count

def RemoveStopWords(data, stop_words):
    """
    Given a test and a list of stop words, remove stop words from text
    """
    assert isinstance(data, str)
    assert isinstance(stop_words, set)
    
    data = ' '.join([s for s in data.split() if s not in stop_words])
    return data

def GetWordCount(data):
    """
    count number of words in a given text
    """
    assert isinstance(data, str)
    
    count = len([s for s in data.split()])
    return count

def GetCharCount(data):
    """
    count number of characters in a given text
    """
    assert isinstance(data, str)
    
    return len(data)

def GetAvgWordLength(data):
    """
    Given a text, return average word length
    """
    assert isinstance(data, str)
    
    words = data.split()
    
    Len = 0
    for w in words:
        Len += len(w)
    
    Ans = int(Len / len(words))
    return Ans

def GetNumericDigitsCount(data):
    """
    Given a text, count number of numerical digits (not imbedded)
    """
    assert isinstance(data, str)
    
    count = len([s for s in data.split() if s.isdigit()])
    return count

def CountContractions(data, contractions):
    """
    Given a text and map of contractions, return count of contractions in text
    
    Make sure all characters are lowercase
    """
    assert isinstance(data, str)
    assert isinstance(contractions, dict)
    
    count = len([s for s in data.split() if s in contractions])
    return count

def ExpandContractions(data, contractions):
    """
    Given a text, and a map of contractions, replace contraction in the given text
    
    Make sure all character are lowercase
    """
    assert isinstance(data, str)
    assert isinstance(contractions, dict)
    
    for key in contractions:
        value = contractions[key]
        data = data.replace(key, value)
    
    return data

def GetEmailCount(data):
    """
    Given a text, count number of emails 
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    email_regex = '([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
    
    count = len(re.findall(email_regex, data))
    return count

def RemoveEmails(data):
    """
    Given a text, remove all emails from it
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    email_regex = email_regex = '([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
        
    data = re.sub(email_regex, '', data)
    
    return data

def ExtractEmails(data):
    """
    Given a text, return all emails in this text
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    email_regex = '([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
    
    emails = re.findall(email_regex, data)
    
    return emails

def GetUrlCount(data):
    """
    Given a text, count Urls it contains
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    url_regex = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    
    count = len(re.findall(url_regex, data))
    
    return count

def RemoveUrls(data):
    """
    Given a text, remove Urls it contains
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    url_regex = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    
    data = re.sub(url_regex, '', data)
    
    return data

def ExtractUrls(data):
    """
    Given a text, return all Urls it contains
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    url_regex = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    
    urls = re.findall(url_regex, data)
    
    return urls

def RemoveSpecialChars(data):
    """
    Given a text, remove special (non-alphanumeric) characters
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    special_regex = '[^A-Z a-z 0-9-]+'
    
    data = re.sub(special_regex, '', data)
    
    return data

def RemoveMultipleSpaces(data):
    """
    Given a text, remove multiple whistepaces
    """
    assert isinstance(data, str)
    
    data = ' '.join(data.split())
    
    return data

def RemoveHTMLTags(data):
    """
    Given a text remove any existing html tags
    
    Assuming BeautifulSoup is imported from bs4
    """
    assert isinstance(data, str)
    
    data = BeautifulSoup(data, 'lxml').getText()
    
    return data

def RemoveAccentedChars(data):
    """
    Given a text, remove accented chars
    """
    assert isinstance(data, str)
    
    data = unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return data

def CorrectSpelling(data):
    """
    Given a text, correct spelling of words using textblob
    
    Assuming TextBlob is imported from textblob
    """
    assert isinstance(data, str)
    data = TextBlob(data).correct()
    return data
    

def RemovePunctuations(data):
    """
    Given a text, remove its punctuation
    
    Assuming re is imported
    """
    assert isinstance(data, str)
    
    punc_regex = '[\.\,\!\?\:\;\-\=]'
    
    data = re.sub(punc_regex, '', data)
    return data

contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"btw": "by the way",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"gonna": "going to",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [3]:
def CountSentiments(sentiments):
    cnt0 = 0
    cnt1 = 0
    for sent in sentiments:
        if sent == 1:
            cnt1 += 1
        else:
            cnt0 += 1
            
    print(cnt0,cnt1)

In [4]:
Columns = ['sentiment', 'IDs', 'date', 'flag', 'user', 'text']
RawData = pd.read_csv('twitter_data.csv', encoding = 'ISO-8859-1', names = Columns)
RawData.head(10)

Unnamed: 0,sentiment,IDs,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [5]:
# drop unnecessary columns
RawData = RawData.drop(columns = ['IDs','date','flag','user'])

In [6]:
# define constants
N = 15000
K = 300

In [7]:
# shuffle data
RawData = RawData.sample(frac = 1).reset_index(drop = True)
RawData.head()

Unnamed: 0,sentiment,text
0,0,Another case of the Monday blues. Exhausted fr...
1,4,@JonathanRKnight rain + NKOTB = 5 sexy wet gro...
2,4,just back from a gr8 walk under a cloudless bl...
3,4,Name a common filler used to simulate coconut ...
4,0,@MafiaShe i only has 12


In [8]:
# take a subset of the data points
Data = RawData.iloc[0:N]

In [9]:
# normalize sentiment
Data.sentiment = Data.sentiment.apply(lambda x : (1 if x == 4 else 0))
Data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,sentiment,text
0,0,Another case of the Monday blues. Exhausted fr...
1,1,@JonathanRKnight rain + NKOTB = 5 sexy wet gro...
2,1,just back from a gr8 walk under a cloudless bl...
3,1,Name a common filler used to simulate coconut ...
4,0,@MafiaShe i only has 12


In [10]:
# turn to lowercase
Data.text = Data.text.apply(lambda x : x.lower())

In [12]:
FullText = ' '.join([s for s in Data.text])

In [13]:
from spacy.lang.en.stop_words import STOP_WORDS

print(f"Number of Hashtags: {CountHashtags(FullText)}")
print(f"Number of Mentions: {CountMentions(FullText)}")
print(f"Number of Stop Words: {CountStopWords(FullText,STOP_WORDS)}")
print(f"Number of Emails: {GetEmailCount(FullText)}")
print(f"Number of Urls: {GetUrlCount(FullText)}")
print(f"Number of Words: {GetWordCount(FullText)}")
print(f"Number of chars: {GetCharCount(FullText)}")
print(f"Stop word percentage: {(CountStopWords(FullText,STOP_WORDS) / GetWordCount(FullText) * 100)}")

Number of Hashtags: 424
Number of Mentions: 7382
Number of Stop Words: 83400
Number of Emails: 6
Number of Urls: 693
Number of Words: 196494
Number of chars: 1121195
Stop word percentage: 42.444044092949405


In [14]:
print(f"Average Word Length: {GetAvgWordLength(FullText)}")

Average Word Length: 4


In [15]:
from nltk.corpus import stopwords

stop_words_nltk = set(stopwords.words('english'))
print(f"Stop Words for nltk:{CountStopWords(FullText,stop_words_nltk)}")

Stop Words for nltk:75552


In [16]:
Data.text = Data.text.apply(lambda x : RemoveHashtags(x))
Data.text = Data.text.apply(lambda x : RemoveMentions(x))
Data.text = Data.text.apply(lambda x : ExpandContractions(x, contractions))
Data.text = Data.text.apply(lambda x : RemoveStopWords(x, stop_words_nltk))
Data.text = Data.text.apply(lambda x : RemoveEmails(x))
Data.text = Data.text.apply(lambda x : RemoveUrls(x))
Data.text = Data.text.apply(lambda x : RemoveSpecialChars(x))
Data.text = Data.text.apply(lambda x : RemoveMultipleSpaces(x))
Data.text = Data.text.apply(lambda x : RemovePunctuations(x))

In [20]:
Data.head(20)

Unnamed: 0,sentiment,text
0,0,another case monday blues exhausted thinking m...
1,1,rain nkotb 5 sexy wet grown men wish concert
2,1,back gr8 walk cloudless blue heaven lifes wond...
3,1,name common filler used simulate coconut milk ...
4,0,12
5,1,im ya love rice green beans cheese fries crazi...
6,0,0robertpatt now
7,0,watching tele went dentist
8,1,lolls thats pretty ace
9,1,sinners would loved girly night in plan sumthi...


In [21]:
nlp = spacy.load('en_core_web_md')

In [22]:
def Vectorize(x):
    x = x.lower()
    document = nlp(x)
    VecSum = np.zeros(shape = (1,K))
    count = 0
    for word in document:
        if word.has_vector:
            VecSum += word.vector.T
            count += 1
    if count == 0:
        return VecSum
    return (VecSum / count)

In [23]:
DataMatrix = np.zeros(shape = (1,300))
print(DataMatrix.shape)
for text in Data.text:
    DataMatrix = np.append(DataMatrix, Vectorize(text), axis = 0)
DataMatrix = DataMatrix[1:]
print(DataMatrix)

(1, 300)
[[ 2.75647527e-02  2.65810617e-01 -1.20142538e-01 ...  7.30983850e-02
  -1.34846074e-01  2.14938232e-02]
 [-6.86142244e-02  1.23579780e-01 -6.00801477e-03 ... -4.20674475e-02
   9.10800017e-02 -1.07077440e-01]
 [ 2.08118141e-01  1.75659141e-01 -8.58965028e-02 ...  1.16581925e-01
  -4.68709982e-02  6.23359997e-02]
 ...
 [ 1.29627411e-04  2.58806168e-01 -1.29274338e-01 ... -7.29843341e-02
  -3.50386677e-02  1.39453499e-01]
 [ 2.32800022e-02  4.42194998e-01  4.57715012e-01 ... -2.16530003e-02
  -2.07570001e-01  6.07280009e-02]
 [-6.45246680e-02 -3.39993323e-01 -4.13046663e-01 ... -1.70766662e-01
   1.10639667e-01  1.86332002e-01]]


In [24]:
# store labels in numpy array
Labels = Data['sentiment'].values

In [25]:
from sklearn.model_selection import train_test_split

DataTrain, DataTest, LabelTrain, LabelTest = train_test_split(DataMatrix, Labels, test_size = 0.1, random_state = 1)

In [26]:
from sklearn.svm import SVC

SVM_Model = SVC(kernel = 'linear')
SVM_Model.fit(DataTrain, LabelTrain)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
print(SVM_Model.score(DataTest,LabelTest))

0.7193333333333334


In [28]:
import torch.nn as nn
import torch
from torch.autograd import Variable
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn import preprocessing
import torch.nn.functional as F
import torch.utils.data as Dataa
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

H1 = 100
H2 = 100
OP = 1
LR = 0.01
BATCH_SIZE = 32
EPOCH = 100


In [29]:
class NeuralNet(nn.Module):

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(K, H1)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(H1, H2)
        self.prelu = nn.PReLU(1)
        self.out = nn.Linear(H2, OP)
        self.out_act = nn.Sigmoid()
        
    def forward(self, input_):
        a1 = self.fc1(input_)
        h1 = self.relu1(a1)
        dout = self.dout(h1)
        a2 = self.fc2(dout)
        h2 = self.prelu(a2)
        a4 = self.out(h2)
        y = self.out_act(a4)
        return y

In [32]:
def Attack(X,Y):
    Net = NeuralNet()
    Loss_Func = nn.BCELoss()
    Optimizer = torch.optim.Adam(Net.parameters(), lr = LR, betas = (0.9, 0.999))
    Torch_Dataset = Dataa.TensorDataset(X,Y)
    Loader = Dataa.DataLoader(
        dataset = Torch_Dataset,
        batch_size = BATCH_SIZE,
        shuffle = True,
        num_workers = 4,
    )
    for Epoch in range(EPOCH):
        cnt = 0
        s = 0
        for step, (Batch_X,Batch_Y) in enumerate(Loader):
            Optimizer.zero_grad()
            B_X = Variable(Batch_X)
            B_Y = Variable(Batch_Y)
            Prediction = Net(B_X)
            Loss = Loss_Func(Prediction.squeeze(),B_Y)
            s = s + Loss.data.item()
            cnt = cnt + 1
            Loss.backward()
            Optimizer.step()
        print(s / cnt)
    return Net
    
    

In [33]:
DataTensor = torch.from_numpy(DataTrain).float()
LabelTensor = torch.from_numpy(LabelTrain).float()
Network = Attack(DataTensor,LabelTensor)

0.5771439375611843
0.5445202512763688
0.5377944417615638
0.52883053623952
0.5150119730081604
0.502000504124786
0.49285336823966264
0.4811704672202115
0.47142542556140093
0.4538855300201059
0.4544689401419242
0.44755798797189345
0.43437821266210475
0.42465571663673457
0.4133352088207882
0.4005848298211233
0.40396378397659105
0.3931547707714741
0.39392606296104277
0.3837767635688398
0.3755260880030162
0.36621894020039886
0.36242619571739465
0.3532877889642783
0.35858480192685577
0.3514748357624804
0.34043269579726937
0.3477782951005827
0.3480149954557419
0.33174603646047307
0.3340433440502221
0.3254505306023275
0.3191419964329609
0.31766173088155086
0.31460421103366176
0.3242412507534027
0.31583655562022284
0.3038435759137592
0.30057318981789866
0.30093487382146983
0.2988256442744585
0.2901618489333521
0.30224935502096373
0.29718956263003193
0.2842427042105469
0.2812114057982985
0.28180506389358595
0.2826674779726996
0.2944898374060884
0.2773452321535321
0.29675351450511067
0.29026584592

In [34]:
DataTestTensor = torch.from_numpy(DataTest).float()
LabelTestTensor = torch.from_numpy(LabelTest).float()

In [35]:
Valid = Network(DataTestTensor).squeeze()

In [36]:
Output = Valid.data.numpy()

In [37]:
Mask = [(1 if val > 0.5 else 0) for val in Output]
Accuracy = sum(Mask == LabelTest) / len(Mask)

In [38]:
print(Accuracy)

0.6993333333333334
