In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import nltk
import re
from textblob import TextBlob

In [2]:
def CountHashtags(data):
    """
    count words that start with # given a data string
    """
    assert isinstance(data, str)
        
    count = len([s for s in data.split() if s.startswith('#')])
    return count
    

def CountMentions(data):
    """
    count words that start with @ given a data string
    """
    assert isinstance(data, str)
    
    count = len([s for s in data.split() if s.startswith('@')])
    return count

def RemoveHashtags(data):
    """
    Removes hashtags from a given string
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    hashtag_regex = '#[A-Za-z0-9]+'
    
    data = ' '.join(re.sub(hashtag_regex, ' ', data).split())
    
    return data
    

def RemoveMentions(data):
    """
    Removes mentions from a given string
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    mention_regex = '@[A-Za-z0-9]+'
    
    data = ' '.join(re.sub(mention_regex, ' ', data).split())
    
    return data
    

def CountStopWords(data, stop_words):
    """
    Given a text and a list of stop words, return count of step words in text
    """
    assert isinstance(data, str)
    assert isinstance(stop_words, set)
    
    count = len([s for s in data.split() if s in stop_words])
    return count

def RemoveStopWords(data, stop_words):
    """
    Given a test and a list of stop words, remove stop words from text
    """
    assert isinstance(data, str)
    assert isinstance(stop_words, set)
    
    data = ' '.join([s for s in data.split() if s not in stop_words])
    return data

def GetWordCount(data):
    """
    count number of words in a given text
    """
    assert isinstance(data, str)
    
    count = len([s for s in data.split()])
    return count

def GetCharCount(data):
    """
    count number of characters in a given text
    """
    assert isinstance(data, str)
    
    return len(data)

def GetAvgWordLength(data):
    """
    Given a text, return average word length
    """
    assert isinstance(data, str)
    
    words = data.split()
    
    Len = 0
    for w in words:
        Len += len(w)
    
    Ans = int(Len / len(words))
    return Ans

def GetNumericDigitsCount(data):
    """
    Given a text, count number of numerical digits (not imbedded)
    """
    assert isinstance(data, str)
    
    count = len([s for s in data.split() if s.isdigit()])
    return count

def CountContractions(data, contractions):
    """
    Given a text and map of contractions, return count of contractions in text
    
    Make sure all characters are lowercase
    """
    assert isinstance(data, str)
    assert isinstance(contractions, dict)
    
    count = len([s for s in data.split() if s in contractions])
    return count

def ExpandContractions(data, contractions):
    """
    Given a text, and a map of contractions, replace contraction in the given text
    
    Make sure all character are lowercase
    """
    assert isinstance(data, str)
    assert isinstance(contractions, dict)
    
    for key in contractions:
        value = contractions[key]
        data = data.replace(key, value)
    
    return data

def GetEmailCount(data):
    """
    Given a text, count number of emails 
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    email_regex = '([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
    
    count = len(re.findall(email_regex, data))
    return count

def RemoveEmails(data):
    """
    Given a text, remove all emails from it
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    email_regex = email_regex = '([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
        
    data = re.sub(email_regex, '', data)
    
    return data

def ExtractEmails(data):
    """
    Given a text, return all emails in this text
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    email_regex = '([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
    
    emails = re.findall(email_regex, data)
    
    return emails

def GetUrlCount(data):
    """
    Given a text, count Urls it contains
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    url_regex = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    
    count = len(re.findall(url_regex, data))
    
    return count

def RemoveUrls(data):
    """
    Given a text, remove Urls it contains
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    url_regex = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    
    data = re.sub(url_regex, '', data)
    
    return data

def ExtractUrls(data):
    """
    Given a text, return all Urls it contains
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    url_regex = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    
    urls = re.findall(url_regex, data)
    
    return urls

def RemoveSpecialChars(data):
    """
    Given a text, remove special (non-alphanumeric) characters
    
    Assuming re library is imported
    """
    assert isinstance(data, str)
    
    special_regex = '[^A-Z a-z 0-9-]+'
    
    data = re.sub(special_regex, '', data)
    
    return data

def RemoveMultipleSpaces(data):
    """
    Given a text, remove multiple whistepaces
    """
    assert isinstance(data, str)
    
    data = ' '.join(data.split())
    
    return data

def RemoveHTMLTags(data):
    """
    Given a text remove any existing html tags
    
    Assuming BeautifulSoup is imported from bs4
    """
    assert isinstance(data, str)
    
    data = BeautifulSoup(data, 'lxml').getText()
    
    return data

def RemoveAccentedChars(data):
    """
    Given a text, remove accented chars
    """
    assert isinstance(data, str)
    
    data = unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return data

def CorrectSpelling(data):
    """
    Given a text, correct spelling of words using textblob
    
    Assuming TextBlob is imported from textblob
    """
    assert isinstance(data, str)
    data = TextBlob(data).correct()
    return data
    

def RemovePunctuations(data):
    """
    Given a text, remove its punctuation
    
    Assuming re is imported
    """
    assert isinstance(data, str)
    
    punc_regex = '[\.\,\!\?\:\;\-\=]'
    
    data = re.sub(punc_regex, '', data)
    return data

contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"btw": "by the way",
"cuz": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"gonna": "going to",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [3]:
def CountSentiments(sentiments):
    cnt0 = 0
    cnt1 = 0
    for sent in sentiments:
        if sent == 1:
            cnt1 += 1
        else:
            cnt0 += 1
            
    print(cnt0,cnt1)

In [4]:
Columns = ['sentiment', 'IDs', 'date', 'flag', 'user', 'text']
RawData = pd.read_csv('twitter_data.csv', encoding = 'ISO-8859-1', names = Columns)
RawData.head(10)

Unnamed: 0,sentiment,IDs,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [5]:
RawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
sentiment    1600000 non-null int64
IDs          1600000 non-null int64
date         1600000 non-null object
flag         1600000 non-null object
user         1600000 non-null object
text         1600000 non-null object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [6]:
# drop unnecessary columns
RawData = RawData.drop(columns = ['IDs','date','flag','user'])

In [7]:
# define constants
N = 10000
K = 300

In [8]:
# shuffle data
RawData = RawData.sample(frac = 1).reset_index(drop = True)
RawData.head()

Unnamed: 0,sentiment,text
0,4,@decryption Excellent - Will you be in this W...
1,4,@hokeypokeyjones Get 100 followers a day using...
2,4,"Somehow, i think any David Bowie song would go..."
3,0,Has had a great weekend and is now back in wor...
4,4,@BarelySeeAtAll I love you! I'm sure that Ren...


In [9]:
# take a subset of the data points
Data = RawData.iloc[0:N]

In [10]:
# normalize sentiment
Data.sentiment = Data.sentiment.apply(lambda x : (1 if x == 4 else 0))
Data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,sentiment,text
0,1,@decryption Excellent - Will you be in this W...
1,1,@hokeypokeyjones Get 100 followers a day using...
2,1,"Somehow, i think any David Bowie song would go..."
3,0,Has had a great weekend and is now back in wor...
4,1,@BarelySeeAtAll I love you! I'm sure that Ren...


In [11]:
# turn to lowercase
Data.text = Data.text.apply(lambda x : x.lower())

In [12]:
FullText = ' '.join([s for s in Data.text])

In [13]:
from spacy.lang.en.stop_words import STOP_WORDS

print(f"Number of Hashtags: {CountHashtags(FullText)}")
print(f"Number of Mentions: {CountMentions(FullText)}")
print(f"Number of Stop Words: {CountStopWords(FullText,STOP_WORDS)}")
print(f"Number of Emails: {GetEmailCount(FullText)}")
print(f"Number of Urls: {GetUrlCount(FullText)}")
print(f"Number of Words: {GetWordCount(FullText)}")
print(f"Number of chars: {GetCharCount(FullText)}")
print(f"Stop word percentage: {(CountStopWords(FullText,STOP_WORDS) / GetWordCount(FullText) * 100)}")

Number of Hashtags: 283
Number of Mentions: 4971
Number of Stop Words: 56619
Number of Emails: 4
Number of Urls: 459
Number of Words: 132364
Number of chars: 754153
Stop word percentage: 42.77522589223656


In [14]:
print(f"Average Word Length: {GetAvgWordLength(FullText)}")

Average Word Length: 4


In [15]:
from nltk.corpus import stopwords

stop_words_nltk = set(stopwords.words('english'))
print(f"Stop Words for nltk:{CountStopWords(FullText,stop_words_nltk)}")

Stop Words for nltk:51198


In [16]:
Data.text = Data.text.apply(lambda x : RemoveHashtags(x))
Data.text = Data.text.apply(lambda x : RemoveMentions(x))
Data.text = Data.text.apply(lambda x : ExpandContractions(x, contractions))
#Data.text = Data.text.apply(lambda x : RemoveStopWords(x, stop_words_nltk))
Data.text = Data.text.apply(lambda x : RemoveEmails(x))
Data.text = Data.text.apply(lambda x : RemoveUrls(x))
Data.text = Data.text.apply(lambda x : RemoveSpecialChars(x))
Data.text = Data.text.apply(lambda x : RemoveMultipleSpaces(x))
Data.text = Data.text.apply(lambda x : RemovePunctuations(x))

In [17]:
Data.head(20)

Unnamed: 0,sentiment,text
0,1,excellent will you be in this weekends edition
1,1,get 100 followers a day using wwwtweeterfollow...
2,1,somehow i think any david bowie song would go ...
3,0,has had a great weekend and is now back in wor...
4,1,i love you i am sure that reno loves you
5,1,quotstatistical mechanics the theory of wiggl...
6,0,honestly i do think i might have a problem i p...
7,1,heheh hey whatever youre comfortable with do ...
8,0,im football illiterate
9,1,good morning new followers time for my facesiz...


In [18]:
nlp = spacy.load('en_core_web_md')

In [19]:
def Vectorize(x):
    x = x.lower()
    document = nlp(x)
    VecSum = np.zeros(shape = (1,K))
    count = 0
    for word in document:
        if word.has_vector:
            VecSum += word.vector.T
            count += 1
    if count == 0:
        return VecSum
    return (VecSum / count)

In [20]:
DataMatrix = np.concatenate([Vectorize(text) for text in Data.text])
print(DataMatrix.shape)
print(DataMatrix)

(10000, 300)
[[-5.08024981e-02  1.57572871e-01  2.56343745e-02 ... -1.50913117e-01
   1.49098399e-02 -6.73260365e-05]
 [-9.93935556e-02  1.31370086e-01 -2.34986665e-01 ... -8.78941647e-02
   9.42981668e-02  1.61424389e-01]
 [-4.33379243e-02  2.11437843e-01 -1.47872621e-01 ... -7.36254631e-02
   5.44357681e-02  3.29791536e-01]
 ...
 [ 1.02721431e-02  2.90677138e-01 -1.40037101e-01 ... -1.08468000e-01
  -8.72590026e-02  1.27690667e-01]
 [ 1.42092832e-02  1.97957242e-01 -2.84860964e-01 ... -5.25510386e-02
  -1.52867605e-02  1.34494128e-01]
 [-2.19953119e-02  1.72259445e-01 -1.93147374e-01 ...  4.92433300e-03
   5.32588513e-02  1.74703232e-01]]


In [21]:
# store labels in numpy array
Labels = Data['sentiment'].values

In [22]:
from sklearn.model_selection import train_test_split

DataTrain, DataTest, LabelTrain, LabelTest = train_test_split(DataMatrix, Labels, test_size = 0.1, random_state = 1)

In [23]:
from sklearn.svm import SVC

SVM_Model = SVC(kernel = 'linear')
SVM_Model.fit(DataTrain, LabelTrain)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
print(SVM_Model.score(DataTest,LabelTest))

0.73


In [25]:
import torch.nn as nn
import torch
from torch.autograd import Variable
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn import preprocessing
import torch.nn.functional as F
import torch.utils.data as Dataa
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

H1 = 100
H2 = 100
OP = 1
LR = 0.001
BATCH_SIZE = 32
EPOCH = 100


In [26]:
class NeuralNet(nn.Module):

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(K, H1)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(H1, H2)
        self.prelu = nn.PReLU(1)
        self.out = nn.Linear(H2, OP)
        self.out_act = nn.Sigmoid()
        
    def forward(self, input_):
        a1 = self.fc1(input_)
        h1 = self.relu1(a1)
        dout = self.dout(h1)
        a2 = self.fc2(dout)
        h2 = self.prelu(a2)
        a4 = self.out(h2)
        y = self.out_act(a4)
        return y

In [27]:
def Attack(X,Y):
    Net = NeuralNet()
    Loss_Func = nn.BCELoss()
    Optimizer = torch.optim.Adam(Net.parameters(), lr = LR, betas = (0.9, 0.999))
    Torch_Dataset = Dataa.TensorDataset(X,Y)
    Loader = Dataa.DataLoader(
        dataset = Torch_Dataset,
        batch_size = BATCH_SIZE,
        shuffle = True,
        num_workers = 4,
    )
    for Epoch in range(EPOCH):
        cnt = 0
        s = 0
        for step, (Batch_X,Batch_Y) in enumerate(Loader):
            Optimizer.zero_grad()
            B_X = Variable(Batch_X)
            B_Y = Variable(Batch_Y)
            Prediction = Net(B_X)
            Loss = Loss_Func(Prediction.squeeze(),B_Y)
            s = s + Loss.data.item()
            cnt = cnt + 1
            Loss.backward()
            Optimizer.step()
        print(s / cnt)
    return Net
    
    

In [28]:
DataTensor = torch.from_numpy(DataTrain).float()
LabelTensor = torch.from_numpy(LabelTrain).float()
Network = Attack(DataTensor,LabelTensor)

0.5727913952465599
0.5283152029869405
0.5129052806407848
0.503301259050978
0.4889334204560476
0.4790828196292228
0.46563216536603075
0.45349914089162296
0.44014260843924596
0.43403868419481506
0.4207893597318771
0.40985356292403335
0.40633054130466273
0.3899292562764587
0.3788814920377224
0.3701172758501472
0.3654082367289151
0.35705000500307016
0.34057804833807
0.33067188773911893
0.32433915767052496
0.31450869581589463
0.3083854521102939
0.30008496090452724
0.2934312970238797
0.2840999811095126
0.27070400969567876
0.2667044886249177
0.25998048357507014
0.2569349026901925
0.24514316764812097
0.2390230536751502
0.23990552277958138
0.230963454628033
0.22715159031357748
0.216809527803186
0.21771765859625863
0.20950479443508682
0.19402031268898054
0.19178557027369103
0.20118439887115297
0.1924921896123717
0.18685515933002986
0.17738680529626125
0.18042389102650028
0.18695806612835286
0.1747049019004859
0.1689807542249666
0.1640797345139456
0.1593276139064725
0.1539257605566729
0.160072116

In [29]:
DataTestTensor = torch.from_numpy(DataTest).float()
LabelTestTensor = torch.from_numpy(LabelTest).float()

In [30]:
Valid = Network(DataTestTensor).squeeze()

In [31]:
Output = Valid.data.numpy()

In [32]:
Mask = [(1 if val > 0.5 else 0) for val in Output]
Accuracy = sum(Mask == LabelTest) / len(Mask)

In [33]:
print(Accuracy)

0.694


In [34]:
import time

append_test = np.zeros(1);
concat_test = np.zeros(1);
print(append_test.shape)

append_start = time.time();

for i in range(100000):
    append_test = np.append(append_test, [5], axis = 0)

append_time = time.time() - append_start

concat_start = time.time()

concat_test = np.concatenate([[el] for el in range(100000)])

concat_time = time.time() - concat_start

print(append_time)
print(concat_time)

print(append_test)
print(concat_test)

print(append_test.shape)
print(concat_test.shape)


(1,)
2.2415590286254883
0.2165079116821289
[0. 5. 5. ... 5. 5. 5.]
[    0     1     2 ... 99997 99998 99999]
(100001,)
(100000,)
