In [1]:
import scipy
import pandas
import numpy
import sklearn
import nltk

print(scipy.__version__)
print(pandas.__version__)
print(numpy.__version__)
print(sklearn.__version__)
print(nltk.__version__)

0.19.1


In [1]:
%%time
# raw data preprocessing

from scipy.io import arff
import pandas as pd
import numpy as np

# define import/export directories
dirs = {'static_import': '../../data_sets/honey_pot/preprocessed/FinalDataFull.csv.arff',
       'static_export': '../../data_sets/honey_pot/final_features/static_features.csv',
       'dynamic_imports': ['../../data_sets/honey_pot/raw/cp_tweets.csv',
                             '../../data_sets/honey_pot/raw/lu_tweets.csv'],
       'dynamic_export':'../../data_sets/honey_pot/preprocessed/dynamic_features_intermediate.csv'}

#preprocess all static features
def preprocess_static_features(import_path):
    data = arff.loadarff(import_path)
    df = pd.DataFrame(data[0])
    
    #correct for usertype boolean type
    df['UserType'] = df['UserType'].astype(int)
    
    #drop any duplicate entries
    return df.drop_duplicates(['UserID'])

#preprocess all dynamic features
def preprocess_dynamic_features(import_paths):
    cp_tweets = preprocess_tweet_set(import_paths[0])
    lu_tweets = preprocess_tweet_set(import_paths[1])
    
    #ensure cp and lu tweet ID's are complimentary - disjoint cp/lu users groups
    cp_tweets_set = cp_tweets.loc[~cp_tweets['UserID'].isin(lu_tweets['UserID'])] #negated match
    lu_tweets_set = lu_tweets.loc[~lu_tweets['UserID'].isin(cp_tweets['UserID'])]
    
    #flag as illegitimate/legitimate users
    cp_tweets_set['UserType'] = 1
    lu_tweets_set['UserType'] = 0
    
    #merge data frames
    return pd.concat([cp_tweets_set, lu_tweets_set])

#read in single user tweet set
def preprocess_tweet_set(tweets_path):
    tweets = pd.read_csv(tweets_path)
    
    #variable removal
    tweets = tweets.drop(['TweetID','CreatedAt'], axis=1) #strip unnecessary variables
    tweets['UserID'] = pd.to_numeric(tweets['UserID'], errors='coerce') #flag NAN entries in UserID column
    tweets = tweets.dropna(subset=['UserID']) #remove row entries with malformed UserID

    #tweet collating
    tweets = tweets.groupby(['UserID'])['Tweet'].apply(list) #groupby userID, pool tweets into single list
    tweetJoin = lambda x: ' '.join(x) #join tweets together into single document
    tweets = tweets.apply(tweetJoin)

    #dataframe reformatting
    tweets = tweets.to_frame() #cast back to frame
    tweets = tweets.reset_index() #reset/adjust index
    
    return tweets

#find union of two dataframes based upon given value
def square_frames(df_a, df_b):
    df_a_set = df_a.loc[df_a['UserID'].isin(df_b['UserID'])] #ensure for match
    df_b_set = df_b.loc[df_b['UserID'].isin(df_a['UserID'])]
    
    df_a_set = df_a_set.sample(n=100, axis=0) #return small sample for prototyping speed
    df_b_set = df_b_set.loc[df_b['UserID'].isin(df_a_set['UserID'])] #match user type
    
    return df_a_set, df_b_set #retain original frame housing

#export frames
def export_frames(frames, locations):
    for frame,location in zip(frames,locations):
        export_csv(frame,location)
    
# preprocess static and dynamic dataframes
static_df = preprocess_static_features(dirs['static_import'])
dynamic_df = preprocess_dynamic_features(dirs['dynamic_imports'])

#square frames, export as intermediate csv files
static_df,dynamic_df = square_frames(static_df, dynamic_df)
export_frames([static_df,dynamic_df], [dirs['static_export'],dirs['dynamic_export']])

  exec(code, glob, local_ns)


CPU times: user 27.4 s, sys: 3.04 s, total: 30.4 s
Wall time: 33.1 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [2]:
%%time
#Stopword generation, Lemmatization, Tokenizer object generation
#Vector fitting => bag of words model utilizing count vectorizer
import nltk
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag
from sklearn.feature_extraction.text import CountVectorizer

# initialize constants, lematizer, punctuation and stopwords
lemmatizer = WordNetLemmatizer()
punct = set(string.punctuation)

# define/return stopwords
def define_sw():
    custom_stop_words = ['–', '\u2019', 'u', '\u201d', '\u201d.',
                         '\u201c', 'say', 'saying', 'sayings',
                         'says', 'us', 'un', '.\"', 'would',
                         'let', '.”', 'said', ',”', 'ax','max',
                         'b8f','g8v','a86','pl','145','ld9','0t',
                         '34u']
    return set(sw.words('english') + custom_stop_words)

# collapse word inflections into single representation
def lemmatize(token, tag):
    tag = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ
    }.get(tag[0], wordnet.NOUN)

    return lemmatizer.lemmatize(token, tag)

# tokenize corpus
def cab_tokenizer(document):
    tokens = []
    sw = define_sw()

    # split the document into sentences
    for sent in sent_tokenize(document):
        # tokenize each sentence
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # preprocess and remove unnecessary characters
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation, ignore token and continue
            if all(char in punct for char in token):
                continue

            # If stopword, ignore token and continue
            if token in sw:
                continue

            # Lemmatize the token and add back to the token
            lemma = lemmatize(token, tag)

            # Append lemmatized token to list
            tokens.append(lemma)
    return tokens

#generate term frequency vector
def generate_vector():
    return CountVectorizer(tokenizer=cab_tokenizer,ngram_range=(1,2),
                                   min_df=0.15, max_df=0.85)

#fit count vectoizer to the supplied corpus, return term frequency matrix/feature names
def vectorize(tf_vectorizer, df):
    df = df.reindex(columns=['Tweet']) #reindex on tweet
    
    tf_matrix = tf_vectorizer.fit_transform(df['Tweet'])
    tf_feature_names = tf_vectorizer.get_feature_names()
    
    return tf_matrix, tf_feature_names

#generate and apply count vector to corpus
cv = generate_vector()
tf_matrix, tf_feature_names = vectorize(cv, dynamic_df)

print("done")

done
CPU times: user 22.3 s, sys: 457 ms, total: 22.7 s
Wall time: 23.2 s


In [3]:
%%time
#Dynamic Feature Generation - Document/Topic entropy, GOSS/LOSS scores

from sklearn.decomposition import LatentDirichletAllocation
import scipy as scipy
from math import sqrt

# create lda model
def create_lda(num_topics,tf_matrix):
    return LatentDirichletAllocation(n_components=num_topics,max_iter=5,
                                     learning_method='online',learning_offset=50,
                                     random_state=0).fit(tf_matrix)

#return normalized topic-word distribution
def create_tw_dist(model):
    normTWDist = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
     
    return normTWDist

#return normalized document-topic distribution
def create_dt_dist(model, tf_matrix):
    normDTDist = model.transform(tf_matrix)
    
    return normDTDist

#calculate entropy for a given sequence of values
def entropy_single(x):
    return scipy.stats.entropy(x)

#calculate entropy for an entire document-topic distribution
def entropy_all(dt_dist):
    np_entropy = np.apply_along_axis(entropy_single, axis=1, arr=dt_dist)
    
    return pd.DataFrame(np_entropy, columns=['dt_entropy'])

#GOSS/LOSS features
#calculate GOSS score for a single particular user/topic (i/k) combination
def single_goss(topic_dist,i,k):
    
    #1.0 return mu(xk) for specific topic, sum topic probabilities for all users, average across all users
    mu_xk = np.sum(topic_dist[:,k]) / topic_dist.shape[0]
    
    #2.0 GOSS equation numerator
    goss_numerator = topic_dist[i,k] - mu_xk
    
    #3.0 for all users specific topic probability:
    # - sum the squared difference of their relevant topic probability
    # - find the square of this sum
    goss_denominator = 0
    for user_prob in topic_dist[:,k]:
        goss_denominator += (user_prob - mu_xk) ** 2
    
    #3.1 find sqrt of goss_denominator
    goss_denominator = sqrt(goss_denominator)
    
    #4.0 divide numerator/denominator to find final GOSS score for user/topic combination
    return goss_numerator / goss_denominator
    
#calculate GOSS scores for a particular topic distribution
def all_goss(topic_dist):  
    goss=[]
    topics = range(topic_dist.shape[1])
    topic_labels = list('goss' + str(each) for each in topics)
    
    for user in range(topic_dist.shape[0]): #each user
        temp_goss = list(single_goss(topic_dist,user,topic) for topic in topics) #calculate all GOSS scores per topic
        goss.append(temp_goss) #store all GOSS via nested lists

    np_goss = np.array(goss) #recast as np array..
    return pd.DataFrame(goss, columns=topic_labels) #and then to pandas df..

#calculate loss score for a particular user/topic (i/k) combination
def single_loss(topic_dist,i,k):
    #1.0 return mu(xi) for specific user, sum topic probabilities, return average
    mu_xi = np.sum(topic_dist[i,:]) / topic_dist.shape[1]
    
    #2.0 calculate muXI diff - GOSS equation numerator
    loss_numerator = topic_dist[i,k] - mu_xi
    
    #3.0 for all topics (k) and a specific user (i):
    # - sum the squared difference of all associated topic probabilities and mu(xi)
    # - find the square of this sum
    loss_denominator = 0
    for user_prob in topic_dist[i,:]:
        loss_denominator += (user_prob - mu_xi) ** 2
    
    #3.1 find sqrt of loss denominator
    loss_denominator = sqrt(loss_denominator)
    
    #4.0 divide loss numerator by loss denominator to find loss score for specific user
    return loss_numerator / loss_denominator
    
#calculate GOSS scores for all users
def all_loss(topic_dist):  
    loss=[]
    topics = range(topic_dist.shape[1])
    topic_labels = list('loss' + str(each) for each in topics)
    
    for user in range(topic_dist.shape[0]): #each user
        temp_loss = list(single_loss(topic_dist,user,topic) for topic in topics) #calculate all loss scores per topic
        loss.append(temp_loss) #store all loss scores for each user via nested lists
    
    np_loss = np.array(loss) #cast to np array..
    return pd.DataFrame(np_loss, columns=topic_labels) #and finally to pandas df..

def generate_dynamic_features(tf_matrix, lda_topics):
    #fit LDA model using count vector
    lda = create_lda(lda_topics, tf_matrix)
    
    #retrieve document/topic distribution
    dt_dist = create_dt_dist(lda, tf_matrix)
    
    #retrieve entropy for topic distribution
    dt_entropy = entropy_all(dt_dist)
    
    #retrieve GOSS and LOSS scores
    goss_df = all_goss(dt_dist)
    loss_df = all_loss(dt_dist)
    
    #glue new features together into single df
    dynamic_features = pd.concat([dt_entropy, goss_df, loss_df], axis=1)
    return export_csv(dynamic_features,'../../data_sets/honey_pot/final_features/dynamic_features.csv')
    
df = generate_dynamic_features(tf_matrix, 5)

print(df.head(50))

    dt_entropy     goss0     goss1     goss2     goss3     goss4     loss0  \
0     0.693860  0.058200  0.063699 -0.072929 -0.011895 -0.054851  0.627644   
1     0.012444  0.182154 -0.068166 -0.072883 -0.011720 -0.054798  0.894427   
2     0.007742 -0.091207 -0.068207 -0.072924 -0.011877  0.277282 -0.223606   
3     0.440353  0.141404 -0.067990 -0.072710 -0.011064 -0.005917  0.881105   
4     0.611038 -0.010845  0.137057 -0.072924 -0.011876 -0.054845  0.151183   
5     0.751819  0.110673 -0.020287 -0.045272 -0.011869 -0.054843  0.871325   
6     0.009398  0.182257 -0.068193 -0.072911 -0.011822 -0.054828  0.894427   
7     1.203447 -0.043998  0.085158 -0.020264 -0.011844 -0.016669 -0.069472   
8     0.671484  0.013153 -0.068192 -0.072909 -0.011819  0.150507  0.317059   
9     0.890678 -0.091043  0.094644 -0.052922 -0.011229  0.067494 -0.395252   
10    0.750238  0.112128 -0.068154 -0.042278 -0.011677 -0.005460  0.874273   
11    0.006640 -0.091216  0.222495 -0.072933 -0.011911 -0.054856

In [4]:
#preliminary clustering
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# define import directories
dirs = {'static_features': '../../data_sets/honey_pot/final_features/static_features.csv',
       'dynamic_features':'../../data_sets/honey_pot/final_features/dynamic_features.csv'}

#read in and join static and dynamic features 
def consolidate_features(static_features, dynamic_features):
    static = pd.read_csv(static_features, index_col=0).reset_index(drop=True)
    dynamic = pd.read_csv(dynamic_features, index_col=0)
    
    return pd.concat([static, dynamic], axis=1) #join features along axis
    
#customize features contained within df
def choose_features(df, feature_list):
    return df.loc[:, feature_list]

#scale features to ensure clustering is not skewed
def scale_features(df):
    scaler = StandardScaler()
    df = df.as_matrix()
    return scaler.fit_transform(df)

#perform clustering on df_matrix, append resultant cluster labels to original df
def kmeans(df, df_matrix, n_clusters):
    km = KMeans(n_clusters=n_clusters, random_state=42).fit(df_matrix) #create/fit kmeans to matrix
    
    df['cluster'] = km.labels_ #augment cluster result as attribute
#     df['user_id'] = df.index #ensuer user ID retained before segmentation
    
    return segment_df(df, n_clusters)

#segment df based upon cluster allocation
def segment_df(df, n_clusters):
    segmented_df = [] #contain segmented df as list
    
    #segment df based upon cluster
    for cluster in range(n_clusters):
        segmented_df.append(df.loc[df['cluster'] == cluster])

    return segmented_df

#perform preliminary clustering, return list of cluster-based df's
def distil_clusters():
    all_df = consolidate_features(dirs['static_features'], dirs['dynamic_features'])
    select_df = choose_features(all_df, ['dt_entropy','totalNumOfUniqWords'])
    
    df_matrix = scale_features(select_df)
    return kmeans(all_df, df_matrix, 3)

all_df = distil_clusters()

In [41]:
import os
import glob

#export single dataframe
def export_frame(df, file_path):
    df.to_csv(file_path)
    return df

#clear parent directory of all files, export clustered frames
def export_frames_destructive(frames, folder_location):
    files = glob.glob('./test/*') #clear holding dir
    for f in files:
        print(f)
        os.remove(f)
    
    for idx,df in enumerate(frames):
        f_name = folder_location + 'cluster' + str(idx) + '.csv'
        export_frame(df, f_name)

export_frames_destructive(all_df, './test/')

# for df in all_df:
#     print(type(df))

#apply final classification
# print(all_df[0].columns)

# for each in all_df:
#     print(each['UserType'].astype(bool).sum(axis=0)) #retrieve number of non-zeros in usertype column

./test/cluster0.csv
./test/cluster1.csv
./test/cluster2.csv


In [28]:
l = [1,2,3,4]

for df, val in enumerate(l):
    print('cluster' + str(val) + '.csv')

cluster1.csv
cluster2.csv
cluster3.csv
cluster4.csv


In [6]:
%%time
#utility functions
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#entire x/y partition for cross fold validation
def course_split(df):
    return df.drop(['UserType'], axis=1), df['UserType']

#x/y splits further partitioned into train/test
def fine_split(df):
    #partition data
    y = df['UserType']
    X = df.drop(['UserType'], axis=1)
    X_mat = X.as_matrix()
    return train_test_split(X_mat, y, test_size=0.4, random_state=42, stratify=y)

#evaluate models performance using kfold cross validation
def kfold(model, model_name, X, y):
    cv = ShuffleSplit(n_splits=10, test_size=0.4, random_state=42) #define cv iterator parameters (stratify?)
    scores = cross_val_score(model, X, y, cv=cv)
    
    #retrieve trained model accuracy using cross fold validation score - using all data 
    print("{0} Accuracy: {1:.2f} (+/- {2:.2f})".format(model_name, scores.mean(), scores.std() * 2))
    
#evaluate models performance using classification report and confusion matrix
def metrics(model, X_train, X_test, y_train, y_test):
    #classification report and confusion matrix - using train/test partitions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Classification report:\n", classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    
print('loaded utilities')

loaded utilities
CPU times: user 167 µs, sys: 11 µs, total: 178 µs
Wall time: 186 µs


In [14]:
%%time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC

#generate models for a given df
def generate_models(df, df_name):
    print('Generating models for {}'.format(df_name))
    
    models = [DecisionTreeClassifier(), RandomForestClassifier(), AdaBoostClassifier(), LinearSVC()]
    model_names = ["Decision Tree", "Random Forest", "Adaboost", "LinearSVC"]

    #partion dataframe
    X,y = course_split(df)
    X_train,X_test,y_train,y_test = fine_split(df)

    #evaluate all models
    for model, model_name in zip(models, model_names):
        kfold(model, model_name, X, y)
        metrics(model,X_train,X_test,y_train,y_test)
    
#generate models for a list of dataframes
def generate_all(df_list):
    for df in df_list:
        generate_models(df, 'test_name')
    

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 19.8 µs


In [42]:
a = []
b = [1,2,3,4]

def c(d):
    return d + 1

for each in b:
    a.append(c(each))
    
print(a)

[2, 3, 4, 5]


In [43]:
generate_all(all_df)

Generating models for test_name
Decision Tree Accuracy: 0.79 (+/- 0.22)
Classification report:
              precision    recall  f1-score   support

          0       0.71      0.91      0.80        11
          1       0.50      0.20      0.29         5

avg / total       0.65      0.69      0.64        16

Confusion matrix:
 [[10  1]
 [ 4  1]]
Random Forest Accuracy: 0.79 (+/- 0.15)
Classification report:
              precision    recall  f1-score   support

          0       0.73      1.00      0.85        11
          1       1.00      0.20      0.33         5

avg / total       0.82      0.75      0.69        16

Confusion matrix:
 [[11  0]
 [ 4  1]]
Adaboost Accuracy: 0.80 (+/- 0.18)
Classification report:
              precision    recall  f1-score   support

          0       0.71      0.91      0.80        11
          1       0.50      0.20      0.29         5

avg / total       0.65      0.69      0.64        16

Confusion matrix:
 [[10  1]
 [ 4  1]]
LinearSVC Accuracy: 0.

  'precision', 'predicted', average, warn_for)


Random Forest Accuracy: 0.59 (+/- 0.31)
Classification report:
              precision    recall  f1-score   support

          0       0.33      0.25      0.29         4
          1       0.67      0.75      0.71         8

avg / total       0.56      0.58      0.57        12

Confusion matrix:
 [[1 3]
 [2 6]]
Adaboost Accuracy: 0.63 (+/- 0.23)
Classification report:
              precision    recall  f1-score   support

          0       0.50      0.25      0.33         4
          1       0.70      0.88      0.78         8

avg / total       0.63      0.67      0.63        12

Confusion matrix:
 [[1 3]
 [1 7]]
LinearSVC Accuracy: 0.47 (+/- 0.40)
Classification report:
              precision    recall  f1-score   support

          0       0.00      0.00      0.00         4
          1       0.67      1.00      0.80         8

avg / total       0.44      0.67      0.53        12

Confusion matrix:
 [[0 4]
 [0 8]]
Generating models for test_name
Decision Tree Accuracy: 0.75 (+/- 0.30

  'precision', 'predicted', average, warn_for)


Random Forest Accuracy: 0.72 (+/- 0.21)
Classification report:
              precision    recall  f1-score   support

          0       0.33      0.25      0.29         4
          1       0.70      0.78      0.74         9

avg / total       0.59      0.62      0.60        13

Confusion matrix:
 [[1 3]
 [2 7]]
Adaboost Accuracy: 0.75 (+/- 0.22)
Classification report:
              precision    recall  f1-score   support

          0       1.00      0.50      0.67         4
          1       0.82      1.00      0.90         9

avg / total       0.87      0.85      0.83        13

Confusion matrix:
 [[2 2]
 [0 9]]
LinearSVC Accuracy: 0.55 (+/- 0.36)
Classification report:
              precision    recall  f1-score   support

          0       0.00      0.00      0.00         4
          1       0.69      1.00      0.82         9

avg / total       0.48      0.69      0.57        13

Confusion matrix:
 [[0 4]
 [0 9]]


  'precision', 'predicted', average, warn_for)
