In [1]:
import json
import os
import glob
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
def recursive_items(tupl):
    """Yield every leaf value found in a nested tuple structure, depth first.

    Intended for JSON loaded with ``object_pairs_hook=tuple``, where each JSON
    object arrives as a tuple of ``(key, value)`` tuples.

    Args:
        tupl: An iterable (normally a tuple) whose items are either nested
            tuples or leaf values.

    Yields:
        Each non-tuple item, in traversal order. Keys and values are both
        yielded because they are both plain items of a pair tuple. Lists are
        treated as leaves and yielded unchanged.

    Notes:
        Any tuple whose first element equals ``'launch'`` or ``'comment'`` is
        skipped together with everything nested under it.
        Empty tuples (e.g. an empty JSON object ``{}``) contribute nothing;
        previously they raised ``IndexError`` on ``value[0]``.
    """
    for value in tupl:
        if isinstance(value, tuple):
            # Guard the empty tuple first: there is no value[0] to inspect.
            if not value or value[0] in ('launch', 'comment'):
                continue
            yield from recursive_items(value)
        else:
            yield value

In [3]:
# Work from the folder holding the "Opening Chrome" training logs so the bare
# file names collected here can be opened as-is by the loading cell.
# NOTE(review): absolute, machine-specific path; the notebook only runs where it exists.
chrome_log_dir = r'C:\Users\datta\AppData\Local\UsageProbeLogs\TrainingData\OpeningChrome'
os.chdir(chrome_log_dir)
files1 = glob.glob('*.json')

In [4]:
# Build one row per "Opening Chrome" log file: every leaf of the JSON log is
# flattened into a single space-separated string.
# Rows are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
chrome_records = []
for file in files1:
    with open(file, "r") as upoLogFile:
        # object_pairs_hook=tuple keeps JSON objects as tuples of (key, value)
        # pairs, which is the shape recursive_items walks.
        logdata = json.load(upoLogFile, object_pairs_hook=tuple)
    log_str = ""
    for value in recursive_items(logdata):
        # Exact type checks on purpose: only str, int, bool and list leaves
        # are kept; anything else (e.g. float, None) is skipped.
        if type(value) in (str, int, bool):
            log_str += str(value) + " "
        elif type(value) == list:
            log_str += " ".join(value) + " "
    chrome_records.append({'log': log_str, 'label': "Opening Chrome"})

data = pd.DataFrame(chrome_records, columns=['log', 'label'])


In [5]:
# Switch to the folder holding the "Opening Eclipse" training logs so the bare
# file names collected here can be opened as-is by the loading cell.
# NOTE(review): absolute, machine-specific path; the notebook only runs where it exists.
eclipse_log_dir = r'C:\Users\datta\AppData\Local\UsageProbeLogs\TrainingData\OpeningEclipse'
os.chdir(eclipse_log_dir)
files2 = glob.glob('*.json')

In [6]:
# Build one row per "Opening Eclipse" log file and add them to `data` in a
# single concat. DataFrame.append was removed in pandas 2.0 and re-copied the
# whole frame on every call.
# NOTE(review): re-running this cell adds the Eclipse rows to `data` again.
eclipse_records = []
for file in files2:
    with open(file, "r") as upoLogFile:
        # object_pairs_hook=tuple keeps JSON objects as tuples of (key, value)
        # pairs, which is the shape recursive_items walks.
        logdata = json.load(upoLogFile, object_pairs_hook=tuple)
    log_str = ""
    for value in recursive_items(logdata):
        # Exact type checks on purpose: only str, int, bool and list leaves
        # are kept; anything else (e.g. float, None) is skipped.
        if type(value) in (str, int, bool):
            log_str += str(value) + " "
        elif type(value) == list:
            log_str += " ".join(value) + " "
    # Trailing separator kept so the stored `log` text is unchanged; it is
    # whitespace only and disappears when the text is tokenised later.
    log_str = log_str + "\n \n"
    eclipse_records.append({'log': log_str, 'label': "Opening Eclipse"})

data = pd.concat(
    [data, pd.DataFrame(eclipse_records, columns=['log', 'label'])],
    ignore_index=True,
)

In [7]:
# Shuffle all rows so the two labels are interleaved, then renumber them 0..n-1.
# A fixed random_state makes the order (and everything derived from it)
# repeatable after Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Show the combined, shuffled frame: one row per log file with its `log` text and `label`.
data

Unnamed: 0,log,label
0,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome
1,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome
2,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome
3,activity probe FILERESOURCE_PROBE status Creat...,Opening Eclipse
4,activity probe PROCESS_PROBE status ProcessCre...,Opening Eclipse
...,...,...
82,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome
83,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome
84,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome
85,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome


In [9]:
# English stop-word list and Snowball stemmer used by the text-cleaning cell.
words = stopwords.words("english")
stemmer = SnowballStemmer(language='english')

In [10]:
# Set lookup instead of scanning the stop-word list for every token.
stop_set = set(words)


def clean_log(text):
    """Normalise one raw log string into space-separated, stemmed tokens.

    Steps: replace every non-letter with a space, split on whitespace, drop
    stop words, stem what is left, and lower-case the result.

    The stop-word test lower-cases the token first. Previously the raw token
    was compared, so capitalised stop words such as "The" were never removed.

    Args:
        text: Raw log text.

    Returns:
        The cleaned text as a single lower-case string.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).split()
    kept = [stemmer.stem(token) for token in tokens if token.lower() not in stop_set]
    return " ".join(kept).lower()


data['cleaned'] = data['log'].apply(clean_log)

In [11]:
# Show the frame again, now including the `cleaned` column next to the raw `log` text.
data

Unnamed: 0,log,label,cleaned
0,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...
1,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...
2,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...
3,activity probe FILERESOURCE_PROBE status Creat...,Opening Eclipse,activ probe fileresourc probe status creat pat...
4,activity probe PROCESS_PROBE status ProcessCre...,Opening Eclipse,activ probe process probe status processcr pro...
...,...,...,...
82,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...
83,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...
84,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...
85,activity probe PROCESS_PROBE status ProcessCre...,Opening Chrome,activ probe process probe status processcr pro...


In [12]:
# Move back up to the UsageProbeLogs root; the export that follows writes to a
# relative path and therefore lands in this directory.
# NOTE(review): absolute, machine-specific path.
usage_probe_dir = r'C:\Users\datta\AppData\Local\UsageProbeLogs'
os.chdir(usage_probe_dir)

In [13]:
# Write the full frame (raw log, label, cleaned text) as out.csv inside out.zip
# in the current working directory; the row index is not written.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}
data.to_csv(path_or_buf='out.zip', index=False, compression=compression_opts)

In [14]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the reported accuracy, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned'], data.label, test_size=0.2, random_state=42
)

In [15]:
# Pair each training text with its label in one frame; both series carry the
# same row labels from the split, so they line up row for row.
data_train = X_train.to_frame(name='training_data').assign(label=y_train)

In [16]:
# Renumber the training rows from 0 while keeping their pre-split row labels
# in an `index` column.
# NOTE(review): self-referential; re-running this cell resets the index again.
data_train = data_train.reset_index(drop=False)

In [17]:
# Show the training frame: original row label (`index`), cleaned text and label.
data_train

Unnamed: 0,index,training_data,label
0,49,activ probe fileresourc probe status creat pat...,Opening Eclipse
1,79,activ probe process probe status processcr pro...,Opening Chrome
2,35,activ probe process probe status processdelet ...,Opening Chrome
3,47,activ probe fileresourc probe status creat pat...,Opening Eclipse
4,26,activ probe process probe status processcr pro...,Opening Chrome
...,...,...,...
64,62,activ probe process probe status processcr pro...,Opening Chrome
65,43,activ probe process probe status processcr pro...,Opening Chrome
66,38,activ probe process probe status processdelet ...,Opening Eclipse
67,24,activ probe fileresourc probe status creat pat...,Opening Eclipse


In [18]:
# Pair each held-out text with its label in one frame; both series carry the
# same row labels from the split, so they line up row for row.
data_test = X_test.to_frame(name='testing_data').assign(label=y_test)

In [19]:
# Renumber the held-out rows from 0 while keeping their pre-split row labels
# in an `index` column.
# NOTE(review): self-referential; re-running this cell resets the index again.
data_test = data_test.reset_index(drop=False)

In [20]:
# Show the held-out frame: original row label (`index`), cleaned text and label.
data_test

Unnamed: 0,index,testing_data,label
0,68,activ probe process probe status processcr pro...,Opening Chrome
1,83,activ probe process probe status processcr pro...,Opening Chrome
2,72,activ probe process probe status processcr pro...,Opening Eclipse
3,3,activ probe fileresourc probe status creat pat...,Opening Eclipse
4,34,activ probe process probe status processcr pro...,Opening Chrome
5,17,activ probe process probe status processcr pro...,Opening Chrome
6,23,activ probe fileresourc probe status chang pat...,Opening Eclipse
7,85,activ probe process probe status processcr pro...,Opening Chrome
8,7,activ probe process probe status processcr pro...,Opening Chrome
9,22,activ probe fileresourc probe status creat pat...,Opening Eclipse


In [21]:
# Text-classification pipeline: TF-IDF over 1- to 3-grams -> chi-squared feature
# scoring -> linear SVM with an L1 penalty.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), stop_words="english", sublinear_tf=True)
# k='all' keeps every feature, so nothing is dropped at this step.
chi2_step = SelectKBest(chi2, k='all')
svm_step = LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False)

pipeline = Pipeline(steps=[
    ('vect', tfidf_step),
    ('chi', chi2_step),
    ('clf', svm_step),
])

In [22]:
# Fit every pipeline stage on the training split only. `model` is whatever
# fit() returns (scikit-learn's Pipeline.fit returns the pipeline itself).
train_texts = data_train['training_data']
train_labels = data_train['label']
model = pipeline.fit(train_texts, train_labels)

In [23]:
# Pull the fitted stages out of the pipeline by the names given at construction,
# so each one can be inspected on its own.
vectorizer, chi, clf = (model.named_steps[step] for step in ('vect', 'chi', 'clf'))

In [24]:
# Store the model's prediction next to each held-out row so predicted and true
# labels can be compared side by side.
test_predictions = model.predict(data_test['testing_data'])
data_test['predicted'] = test_predictions

In [25]:
# Show the held-out rows with the true `label` and the model's `predicted` label.
data_test

Unnamed: 0,index,testing_data,label,predicted
0,68,activ probe process probe status processcr pro...,Opening Chrome,Opening Chrome
1,83,activ probe process probe status processcr pro...,Opening Chrome,Opening Chrome
2,72,activ probe process probe status processcr pro...,Opening Eclipse,Opening Eclipse
3,3,activ probe fileresourc probe status creat pat...,Opening Eclipse,Opening Eclipse
4,34,activ probe process probe status processcr pro...,Opening Chrome,Opening Chrome
5,17,activ probe process probe status processcr pro...,Opening Chrome,Opening Chrome
6,23,activ probe fileresourc probe status chang pat...,Opening Eclipse,Opening Eclipse
7,85,activ probe process probe status processcr pro...,Opening Chrome,Opening Chrome
8,7,activ probe process probe status processcr pro...,Opening Chrome,Opening Chrome
9,22,activ probe fileresourc probe status creat pat...,Opening Eclipse,Opening Eclipse


In [26]:
# Score the fitted pipeline on the held-out split (the `score` result is
# printed under the label "accuracy score").
test_accuracy = model.score(data_test['testing_data'], data_test['label'])
print(f"accuracy score: {test_accuracy}")

accuracy score: 1.0
