In [3]:
#Begin by importing the necessary libraries and specifying the paths of the
#samples we will be using to train and test:

import os
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
javascript_path = "JavascriptSamples/"
python_path = "PythonSamples/"
powershell_path = "PowerShellSamples/"

Next, we read in all of the file types. We also create an array of labels with -1, 0,
and 1 representing the JavaScript, Python, and PowerShell scripts, respectively:


In [16]:
corpus = []
labels = []
file_types_and_labels = [(javascript_path, -1), (python_path, 0),
(powershell_path, 1)]
for files_path, label in file_types_and_labels:
    files = os.listdir(files_path)
    for file in files:
        file_path = files_path + "/" + file
        try:
            with open(file_path, "r") as myfile:
                data = myfile.read().replace("\n", "")
        except:
            pass
        data = str(data)
        corpus.append(data)
        labels.append(label)

JavascriptSamples/
PythonSamples/
PowerShellSamples/


We go on to create a train-test split and a pipeline that will perform basic NLP on
the files, followed by a random forest classifier:

In [17]:
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.33, random_state=11)
text_clf = Pipeline(
    [
    ("vect", HashingVectorizer(input="content", ngram_range=(1,3))),
    ("tfidf", TfidfTransformer(use_idf=True,)),
    ("rf", RandomForestClassifier(class_weight="balanced")),
    ]
)

text_clf.fit(X_train, y_train)
y_test_pred = text_clf.predict(X_test)
print(accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

0.9914432401597262
[[1221    1    0]
 [  14  516    0]
 [   0    0    1]]
