In [3]:
# -*- coding: utf-8 -*-
'''
Feature Engineering Overview
1. import the intermediate dynamic features dataframe (tweet corpus)
2. configure and generate the count vector
3. create the term frequency matrix by vectorizing the tweet corpus (applying lemmatization, stemming, etc.)
4. generate dynamic features from the term frequency matrix (GOSS, LOSS, document-topic distribution entropy)
5. export the dynamic features dataframe
'''
import time
import sys
sys.path.insert(0, 'C:/Users/KARTHIK/Desktop/spam detection using clusering and classification/code/scripts/feature_engineering')
import nlp_vector_config
import dynamic_features
from dynamic_features import generate_dynamic_features
from nlp_vector_config import generate_vector, vectorize
sys.path.append('../util/.')
import util
import nltk
# fetch the NLTK packages at import time; nltk.download only reports
# "already up-to-date" when a package is present locally
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# file names consumed by main():
#   dynamic_import -> handed to util.import_frame
#   dynamic_export -> handed to util.export_frame
#   param_import   -> handed to util.parse_params
dirs = dict(
    dynamic_import='dynamic_features_intermediate.csv',
    dynamic_export='dynamic_features.csv',
    param_import='hp_fe_config.json',
)


def main():
    '''
    Run the dynamic feature generation pipeline end to end.

    The steps mirror the module overview: parameters are obtained through
    util.parse_params (section name 'Feature Engineering'), the frame named by
    dirs['dynamic_import'] is loaded, a count vector is configured from
    params 'count_vector', the frame is vectorized into a term frequency
    matrix, dynamic features are generated from that matrix with params
    'lda_modelling', and the result is exported to dirs['dynamic_export'].
    Progress and the elapsed wall-clock time are printed to stdout.
    '''
    started = time.time()
    print('\nGenerating dynamic features for honeypot dataset..\n')
    config = util.parse_params(dirs['param_import'], 'Feature Engineering')

    # step 1: load the intermediate frame
    frame = util.import_frame(dirs['dynamic_import'])

    # step 2: set up the count vector from its parameter section
    vectorizer = generate_vector(config['count_vector'])

    # step 3: vectorize the frame; the feature names returned alongside the
    # matrix are not used in this function
    tf_matrix, _feature_names = vectorize(vectorizer, frame)

    # step 4: derive the dynamic features (replaces the imported frame)
    frame = generate_dynamic_features(tf_matrix, config['lda_modelling'])

    # step 5: write the result out
    export_path = dirs['dynamic_export']
    util.export_frame(frame, export_path)

    elapsed = time.time() - started
    summary = '\nDynamic feature generation completed in {0} seconds. Features saved to:\n {1}'
    print(summary.format(elapsed, export_path))

# entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module
if __name__ == '__main__':
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KARTHIK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KARTHIK\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KARTHIK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Generating dynamic features for honeypot dataset..

Executing Feature Engineering with the following params:
 {
    "count_vector": {
        "max_doc_frequency": 0.85, 
        "min_doc_frequency": 0.15, 
        "ngram_range": [
            1, 
            2
        ]
    }, 
    "lda_modelling": {
        "iterations": 5, 
        "lda_topics": 5
    }
}

Dynamic feature generation completed in 4183.26900005 seconds. Features saved to:
 dynamic_features.csv
