In [5]:
%%time
#user tweet restructuring and concatenation
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', 40) #cap displayed cell text at 40 characters; longer tweets are shown truncated

#read in single user tweet set - illegitimate or legitimate
def readIn(userTweets):
    """Load one tweet CSV and collapse it to a single document per user.

    Parameters
    ----------
    userTweets : str, path object or file-like
        Anything accepted by ``pandas.read_csv``. The data must provide the
        ``UserID``, ``TweetID``, ``Tweet`` and ``CreatedAt`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID`` and ``Tweet`` with one row per user; ``Tweet`` holds
        that user's tweets joined with single spaces.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    #low_memory=False parses every column in one pass, so a column mixing
    #numbers and stray text no longer triggers a DtypeWarning on import
    tweets = pd.read_csv(userTweets, low_memory=False)

    cleaned = (
        tweets
        .drop(['TweetID', 'CreatedAt'], axis=1) #strip unnecessary variables
        .assign(UserID=lambda df: pd.to_numeric(df['UserID'], errors='coerce')) #malformed ids become NaN
        .dropna(subset=['UserID', 'Tweet']) #remove rows with a malformed UserID or no tweet text
        .assign(Tweet=lambda df: df['Tweet'].astype(str)) #' '.join raises TypeError on non-string values
    )

    #groupby userID, join each user's tweets into a single document
    return cleaned.groupby('UserID')['Tweet'].agg(' '.join).reset_index()

#read in both tweet sets, concat into single df
def readAll(cp_path='../../data-sets/honey-pot/raw/content_polluters_tweets.csv',
            lu_path='../../data-sets/honey-pot/raw/legitimate_users_tweets.csv'):
    """Load both tweet sets, label them and stack them into one frame.

    Parameters
    ----------
    cp_path : str, optional
        CSV of content-polluter tweets; defaults to the honey-pot raw file.
    lu_path : str, optional
        CSV of legitimate-user tweets; defaults to the honey-pot raw file.

    Returns
    -------
    pandas.DataFrame
        The rows produced by ``readIn`` for both files, with a ``UserType``
        column (1 for ``cp_path`` rows, 0 for ``lu_path`` rows) and a fresh
        0..n-1 row index. The ``cp_path`` rows come first.
    """
    cp_tweets = readIn(cp_path)
    lu_tweets = readIn(lu_path)

    #flag as illegitimate/legitimate users
    cp_tweets['UserType'] = 1
    lu_tweets['UserType'] = 0

    #merge data frames; ignore_index=True because each half carries its own
    #0..n-1 labels, which would otherwise be duplicated in the result
    return pd.concat([cp_tweets, lu_tweets], ignore_index=True)

#build the combined, labelled tweet frame; kept at notebook level as allTweets
allTweets = readAll()
print("Finish restructuring and concatenating tweet sets")

  exec(code, glob, local_ns)


Finish restructuring and concatenating tweet sets
CPU times: user 22.4 s, sys: 1.13 s, total: 23.5 s
Wall time: 23.6 s


In [6]:
%%time
#Stopword generation, Lemmatization and Tokenizer object generation
import nltk
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag

# initialize the shared lemmatizer and the punctuation lookup set
lemmatizer = WordNetLemmatizer()  # single instance, called from lemmatize()
punct = set(string.punctuation)  # cab_tokenizer() skips tokens made only of these characters

# define/return stopwords
def define_sw():
    """Return the stopword set: NLTK's English list plus a hand-picked set of extra tokens."""
    extra_tokens = (
        '–', '\u2019', 'u', '\u201d', '\u201d.',
        '\u201c', 'say', 'saying', 'sayings',
        'says', 'us', 'un', '.\"', 'would',
        'let', '.”', 'said', ',”', 'ax','max',
        'b8f','g8v','a86','pl','145','ld9','0t',
        '34u',
    )
    stop_words = set(sw.words('english'))
    stop_words.update(extra_tokens)
    return stop_words

# collapse word inflections into single representation
def lemmatize(token, tag):
    """Lemmatize ``token`` with the WordNet part of speech implied by ``tag``.

    Only the first character of ``tag`` is inspected: N, V, R and J select
    noun, verb, adverb and adjective; anything else is treated as a noun.
    """
    first = tag[0]
    if first == 'V':
        pos = wordnet.VERB
    elif first == 'R':
        pos = wordnet.ADV
    elif first == 'J':
        pos = wordnet.ADJ
    else:
        # covers 'N' as well as every unrecognised tag
        pos = wordnet.NOUN

    return lemmatizer.lemmatize(token, pos)

# tokenize corpus
def cab_tokenizer(document):
    """Split ``document`` into lowercase, lemmatized tokens.

    The text is split into sentences, each sentence is tokenized and POS
    tagged, and every token is lowercased and stripped of surrounding
    whitespace, underscores and asterisks. Tokens consisting only of
    punctuation and tokens found in the stopword set are dropped; the rest are
    lemmatized.

    Parameters
    ----------
    document : str
        Raw text of one document.

    Returns
    -------
    list of str
        Lemmas in document order.
    """
    # This function is called once per document, so build the stopword set on
    # the first call only and keep it on the function object instead of
    # rebuilding it every time.
    stop_words = getattr(cab_tokenizer, '_stop_words', None)
    if stop_words is None:
        stop_words = define_sw()
        cab_tokenizer._stop_words = stop_words

    tokens = []

    # split the document into sentences
    for sent in sent_tokenize(document):
        # tokenize and tag each sentence
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # preprocess and remove unnecessary characters
            token = token.lower().strip().strip('_').strip('*')

            # If punctuation, ignore token and continue (all() is also true
            # for a token the stripping left empty, so those are skipped too)
            if all(char in punct for char in token):
                continue

            # If stopword, ignore token and continue
            if token in stop_words:
                continue

            # Lemmatize the token and keep it
            tokens.append(lemmatize(token, tag))
    return tokens

# progress message for this cell; NOTE(review): "Utilty" is a typo in the emitted text
print("Load Stemming, Lemmatization, Tokenization Utilty Functions")

Load Stemming, Lemmatization, Tokenization Utilty Functions
CPU times: user 192 ms, sys: 121 ms, total: 313 ms
Wall time: 523 ms


In [12]:
%%time
#Vector fitting, bag of words model utilizing count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

#slice main df for faster prototyping
#NOTE(review): both statements rebind allTweets, so after this cell only the
#first 100 rows and the 'Tweet' column remain (UserID and UserType are gone);
#re-running this cell slices the already-sliced frame, and the full data can
#only be recovered by re-running the load cell
allTweets = allTweets[0:100]
allTweets = allTweets.reindex(columns=['Tweet'])

#generate vectorizer
def gen_vector(ngram_range=(1,2), min_df=0.15, max_df=0.85):
    """Build the bag-of-words vectorizer used for the tweet corpus.

    Parameters
    ----------
    ngram_range : tuple of int, optional
        Smallest and largest n-gram size to extract; defaults to unigrams and
        bigrams.
    min_df, max_df : float or int, optional
        Document-frequency cut-offs passed straight to ``CountVectorizer``.

    Returns
    -------
    CountVectorizer
        An unfitted vectorizer that tokenizes with ``cab_tokenizer``.
    """
    return CountVectorizer(tokenizer=cab_tokenizer, ngram_range=ngram_range,
                           min_df=min_df, max_df=max_df)

#fit count vectorizer to the supplied corpus, return term frequency matrix/feature names
def vectorize(tf_vectorizer, df):
    """Fit ``tf_vectorizer`` on the ``Tweet`` column of ``df``.

    Parameters
    ----------
    tf_vectorizer : object
        A vectorizer exposing ``fit_transform`` and either
        ``get_feature_names_out`` or ``get_feature_names``.
    df : pandas.DataFrame
        Frame whose ``Tweet`` column holds one document per row.

    Returns
    -------
    tuple
        ``(tf_matrix, tf_feature_names)``: whatever ``fit_transform`` returned,
        and the feature names as a list.
    """
    #use the frame that was passed in rather than the notebook-level allTweets
    tf_matrix = tf_vectorizer.fit_transform(df['Tweet'])

    #scikit-learn 1.0 added get_feature_names_out and 1.2 removed
    #get_feature_names; support both and always hand back a plain list
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    return tf_matrix, tf_feature_names

#generate and apply count vector to corpus
cv = gen_vector()
tf_matrix, tf_feature_names = vectorize(cv, allTweets) #fitted count matrix and the matching feature names

CPU times: user 26.2 s, sys: 281 ms, total: 26.4 s
Wall time: 26.6 s


In [None]:
%%time
#Topic Distribution generation
from sklearn.decomposition import LatentDirichletAllocation

def create_lda(num_topics,tf_matrix):
    """Fit an LDA model with ``num_topics`` components on ``tf_matrix`` and return it.

    The model uses online learning with 5 iterations, a learning offset of 50
    and a fixed random state of 0.
    """
    lda_settings = dict(
        n_components=num_topics,
        max_iter=5,
        learning_method='online',
        learning_offset=50,
        random_state=0,
    )
    model = LatentDirichletAllocation(**lda_settings)
    return model.fit(tf_matrix)

#return normalized topic-word distribution
def create_tw_dist(model):
    """Return the row-normalized topic-word matrix of ``model``.

    Parameters
    ----------
    model : object
        Fitted model exposing a 2-D ``components_`` array (one row per topic).

    Returns
    -------
    numpy.ndarray
        ``components_`` with every row divided by its own sum. The shape, the
        matrix and the sum of the first row are printed as a sanity check.
    """
    #read from the model that was passed in, not the notebook-level `lda`
    weights = model.components_
    normTWDist = weights / weights.sum(axis=1, keepdims=True)

    #check for validity..
    print("Topic Word Dist Overview")
    print(normTWDist.shape)
    print(normTWDist)
    print(sum(normTWDist[0]))

    return normTWDist

#return normalized document-topic distribution
def create_dt_dist(model, doc_term_matrix=None):
    """Return the document-topic distribution produced by ``model``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``transform``.
    doc_term_matrix : array-like or sparse matrix, optional
        Matrix to transform. When omitted, the notebook-level ``tf_matrix``
        is used.

    Returns
    -------
    numpy.ndarray
        The result of ``model.transform``. Its shape, the sum of its first row
        and the matrix itself are printed as a sanity check.
    """
    if doc_term_matrix is None:
        #the original read an undefined global `tf`; the count matrix the
        #notebook actually builds is called tf_matrix
        doc_term_matrix = tf_matrix

    #use the model that was passed in, not the notebook-level `lda`
    normDTDist = model.transform(doc_term_matrix)

    print("\nDocument Topic Dist Overview")
    print(normDTDist.shape)
    print(normDTDist[0].sum())
    print(normDTDist)

    return normDTDist

#fit LDA model using count vector, retrieve distributions
lda = create_lda(5, tf_matrix) #5 topics
norm_tw = create_tw_dist(lda) #topic-word distribution
norm_dt = create_dt_dist(lda) #document-topic distribution

print("Models fitted, distributions retrieved")

In [None]:
#calculate topic distribution entropy for each document (user)
import scipy as scipy

#calculate entropy for a given sequence of values
def returnEnt(x):
    """Return the entropy of the sequence ``x`` via ``scipy.stats.entropy``.

    Parameters
    ----------
    x : array-like
        Non-negative values; ``scipy.stats.entropy`` normalizes them to sum to
        one before computing the entropy (natural logarithm).

    Returns
    -------
    float
        Entropy of the normalized values.
    """
    #import the submodule explicitly: a bare `import scipy` only exposes
    #scipy.stats if something else has already loaded it (or on SciPy releases
    #with lazy submodule loading), so scipy.stats.entropy can fail on a fresh kernel
    from scipy.stats import entropy

    return entropy(x)

#apply function across entire axis (all row entry entropy calculated)
allEnt = np.apply_along_axis(returnEnt, axis=1, arr=norm_dt) #one entropy value per row of norm_dt

In [20]:
#calculate GOSS/LOSS values for each document (user)
from math import sqrt

sampleDT = norm_dt[:5, :] #first five rows only, for prototyping; the scores printed in this cell are computed on this subset

#calculate GOSS score for a particular user/topic (i/k) combination
def GOSS(topicDist,i,k):
    """Return the GOSS score for the user/topic pair (``i``, ``k``).

    The score is the deviation of ``topicDist[i, k]`` from the mean of column
    ``k`` (taken over all users), divided by the square root of the summed
    squared deviations of that column::

        (x[i,k] - mu_k) / sqrt(sum_j (x[j,k] - mu_k) ** 2)

    Parameters
    ----------
    topicDist : array-like, shape (users, topics)
        Topic probabilities, one row per user.
    i : int
        Row (user) index.
    k : int
        Column (topic) index.

    Returns
    -------
    float
        The score, or ``0.0`` when the column has (numerically) no spread, in
        which case the ratio would be 0/0.
    """
    column = np.asarray(topicDist, dtype=float)[:, k]

    #1.0 mu(xk): mean probability of topic k across all users
    muxk = column.mean()

    #2.0 numerator: this user's deviation from the topic mean
    muxkDiff = column[i] - muxk

    #3.0 denominator: square root of the summed squared deviations
    gossLower = np.sqrt(np.sum((column - muxk) ** 2))

    #4.0 every user has the same probability for this topic -> no outlier signal
    if np.isclose(gossLower, 0.0):
        return 0.0

    return muxkDiff / gossLower
    
#calculate GOSS scores for all users
def allGOSS(topicDist):
    """Return the GOSS score of every user/topic pair.

    For each topic (column) the column mean is subtracted from every entry and
    the result is divided by the square root of the column's summed squared
    deviations. The statistics are computed once per column instead of once
    per user.

    Parameters
    ----------
    topicDist : array-like, shape (users, topics)
        Topic probabilities, one row per user.

    Returns
    -------
    numpy.ndarray, shape (users, topics)
        Score matrix. A column with (numerically) no spread scores 0.0 for
        every user rather than 0/0.
    """
    dist = np.asarray(topicDist, dtype=float)

    #per-topic deviation from the mean over all users
    deviations = dist - dist.mean(axis=0, keepdims=True)
    spread = np.sqrt((deviations ** 2).sum(axis=0, keepdims=True))

    #divide only where the spread is non-zero; the rest keeps the 0.0 default
    scores = np.zeros_like(deviations)
    np.divide(deviations, spread, out=scores, where=~np.isclose(spread, 0.0))
    return scores

#calculate LOSS score for a particular user/topic (i/k) combination
def LOSS(topicDist,i,k):
    """Return the LOSS score for the user/topic pair (``i``, ``k``).

    The score is the deviation of ``topicDist[i, k]`` from the mean of row
    ``i`` (taken over all topics), divided by the square root of the summed
    squared deviations of that row::

        (x[i,k] - mu_i) / sqrt(sum_t (x[i,t] - mu_i) ** 2)

    Parameters
    ----------
    topicDist : array-like, shape (users, topics)
        Topic probabilities, one row per user.
    i : int
        Row (user) index.
    k : int
        Column (topic) index.

    Returns
    -------
    float
        The score, or ``0.0`` when the row has (numerically) no spread, in
        which case the ratio would be 0/0.
    """
    row = np.asarray(topicDist, dtype=float)[i, :]

    #1.0 mu(xi): mean topic probability for this user
    muxi = row.mean()

    #2.0 numerator: this topic's deviation from the user's mean
    muxiDiff = row[k] - muxi

    #3.0 denominator: square root of the summed squared deviations
    lossLower = np.sqrt(np.sum((row - muxi) ** 2))

    #4.0 the user's topics are all equally likely -> no outlier signal
    if np.isclose(lossLower, 0.0):
        return 0.0

    return muxiDiff / lossLower
    
#calculate LOSS scores for all users
def allLOSS(topicDist):
    """Return the LOSS score of every user/topic pair.

    For each user (row) the row mean is subtracted from every entry and the
    result is divided by the square root of the row's summed squared
    deviations.

    Parameters
    ----------
    topicDist : array-like, shape (users, topics)
        Topic probabilities, one row per user.

    Returns
    -------
    numpy.ndarray, shape (users, topics)
        Score matrix. A row with (numerically) no spread scores 0.0 for every
        topic rather than 0/0.
    """
    dist = np.asarray(topicDist, dtype=float)

    #per-user deviation from the mean over all topics
    deviations = dist - dist.mean(axis=1, keepdims=True)
    spread = np.sqrt((deviations ** 2).sum(axis=1, keepdims=True))

    #divide only where the spread is non-zero; the rest keeps the 0.0 default
    scores = np.zeros_like(deviations)
    np.divide(deviations, spread, out=scores, where=~np.isclose(spread, 0.0))
    return scores

#show both score matrices for the prototype sample (rows = users, columns = topics)
print(allGOSS(sampleDT))
print(allLOSS(sampleDT))

[[ 0.10974236  0.09999156  0.71152595  0.02700171 -0.22358714]
 [ 0.58049525  0.5740843   0.34687809 -0.87238109  0.89442719]
 [-0.40004423 -0.40429824 -0.35285085  0.28184004 -0.22362552]
 [ 0.3286334   0.34499756 -0.35263383  0.28163541 -0.22356904]
 [-0.61882678 -0.61477518 -0.35291936  0.28190394 -0.22364549]]
[[-0.29616453 -0.29616467  0.03063414  0.85785877 -0.29616371]
 [-0.26920132 -0.2692018  -0.07394533 -0.26919862  0.88154707]
 [-0.22360785 -0.22360702 -0.22360693  0.89442719 -0.22360539]
 [-0.22360788 -0.22360648 -0.22360617  0.89442719 -0.22360666]
 [-0.22360725 -0.22360562 -0.22360734  0.89442719 -0.22360698]]


In [18]:
#pre-classification clustering/feature generation
import pandas as pd
import scipy as scipy

# #convert sparse matrix to pandas df for intuition
# pdTD = pd.DataFrame(tf.toarray(), columns=tf_vectorizer.get_feature_names())
# pdTD.head(5)

#calculate entropy for a given sequence of values
def returnEnt(x):
    """Return the entropy of the sequence ``x`` via ``scipy.stats.entropy``.

    Parameters
    ----------
    x : array-like
        Non-negative values; ``scipy.stats.entropy`` normalizes them to sum to
        one before computing the entropy (natural logarithm).

    Returns
    -------
    float
        Entropy of the normalized values.
    """
    #import the submodule explicitly: a bare `import scipy` only exposes
    #scipy.stats if something else has already loaded it (or on SciPy releases
    #with lazy submodule loading), so scipy.stats.entropy can fail on a fresh kernel
    from scipy.stats import entropy

    return entropy(x)

#retrieve entropy across all row entries
def appendEnt(TDVec):
    """Apply ``returnEnt`` to every row of ``TDVec`` and return the results."""
    row_axis = 1
    row_entropies = np.apply_along_axis(returnEnt, row_axis, TDVec)
    return row_entropies

#count the entries equal to 1 in a row (for a term-count row: terms that occur exactly once)
def returnUni(x):
    """Count how many entries of ``x`` are equal to 1."""
    is_one = (x == 1)
    return np.count_nonzero(is_one)

#retrieve for all docs, append to feature matrix
def appendUni(featDF,TDVec,column='uniqueCount'):
    """Return ``featDF`` with a per-document count of entries equal to 1.

    The original body computed the counts and then discarded them (the
    function returned ``None``); this version attaches them to the frame.

    Parameters
    ----------
    featDF : pandas.DataFrame
        Feature frame with one row per document. It is not modified.
    TDVec : array-like or sparse matrix, shape (documents, terms)
        Term-count matrix; sparse input is densified first.
    column : str, optional
        Name of the added column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``featDF`` with ``column`` holding, for each row of
        ``TDVec``, the number of entries equal to 1.

    Raises
    ------
    ValueError
        If ``TDVec`` does not have one row per row of ``featDF``.
    """
    #sparse matrices do not support the element-wise comparison below
    if hasattr(TDVec, 'toarray'):
        TDVec = TDVec.toarray()

    #same count returnUni produces per row, done for all rows at once
    tempUni = np.count_nonzero(np.asarray(TDVec) == 1, axis=1)

    return featDF.assign(**{column: tempUni})
    
#small hand-made sample; only referenced by the commented-out call at the bottom of this cell
x = np.array([[1,2,3,1,1],
              [1,2,3,1,1]])

#apply returnUni across every row of the dense term matrix
# allEnt = np.apply_along_axis(returnEnt, axis=1, arr=normDTDist)
#NOTE(review): `tf` is not assigned in any cell shown here (the vectorizer cell
#names its matrix tf_matrix), so this presumably relies on leftover kernel
#state - confirm before a Restart & Run All
#NOTE(review): the captured output also shows a shape line that no statement
#in this cell prints, so the output appears to predate the current code
a = np.apply_along_axis(returnUni, 1, scipy.sparse.csr_matrix.todense(tf)) #currently using dense representation.. may run into issues
print(a)
print(type(tf))
# b = np.apply_along_axis(returnEnt, axis=1, arr=tf)

# r = np.apply_along_axis(returnUni,axis=1, arr=x)

[173 172 183 188  76 146 189 218 171 238  53 180 218 225 195 226  23 261
 237 172 179  69 242 202 185 141 183 180 138 203  32 175 174 229 107 236
 189 166 213 237 145 187  32 178 124 152 151 117 238 149 168 123 193 180
 231   6  20  84 174 210  92  66 212 187 172 173 210 201 152 140 217 119
 238 167  63   0 187 114 154 197 113 218 190  70 231 165 245  65 205 119
 150 135 153 210 217 204 197 191  11 195]
(100,)
<class 'scipy.sparse.csr.csr_matrix'>


In [25]:
import json

#load the parameter file (path is relative to the notebook's working directory)
with open('../app/input_parameters.json') as data_file:
    data = json.load(data_file)
#list the top-level keys of the loaded JSON object
print(data.keys())

dict_keys(['feature_engineering', 'initial_clustering', 'final_classification'])
