In [1]:
import os
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

In [2]:
# Directories holding the sample files for each language (relative to the working directory).
javascript_path, python_path = "JavascriptSamples/", "PythonSamples/"

In [3]:
# Maps the numeric class id used as a label to a human-readable language name.
file_types = {-1: "JavaScript", 0: "Python"}

In [4]:
# Parallel lists: corpus[i] holds the text of one sample, labels[i] its class id.
corpus, labels = [], []

# (directory, class id) pairs that drive the loading loop.
file_types_and_labels = [
    (javascript_path, -1),
    (python_path, 0),
]

In [5]:
# Read every sample file and record its flattened text together with its class id.
for files_path, label in file_types_and_labels:
    for file in os.listdir(files_path):
        file_path = os.path.join(files_path, file)
        try:
            # errors="ignore" drops undecodable bytes, the same way
            # predict_file_types reads files at prediction time.
            with open(file_path, "r", errors="ignore") as myfile:
                # Newlines are stripped so each sample is a single-line string.
                data = myfile.read().replace("\n", "")
        except OSError as exc:
            # Skip unreadable entries (sub-directories, permission errors, ...)
            # rather than appending the previous file's text under this label.
            print("Skipping", file_path, "-", exc)
            continue
        corpus.append(data)
        labels.append(label)

In [6]:
# Sanity check: number of labels collected by the loading loop.
len(labels)

5306

In [7]:
# Sanity check: number of documents collected; should equal len(labels).
len(corpus)

5306

In [8]:
# Peek at the first two samples instead of dumping the whole corpus into the output.
corpus[:2]

["module.exports = {  plugins: [    require.resolve('babel-plugin-transform-function-bind'),    require.resolve('@babel/plugin-proposal-class-properties'),    require.resolve('@babel/plugin-proposal-object-rest-spread'),  ],  presets: [    [      require.resolve('@babel/preset-env'),      {        targets: {          node: '10',        },      },    ],  ],};",
 '\'use strict\';const Module = require(\'module\');const path = require(\'path\');const NodePlugin = require(\'./tools/node_modules/eslint-plugin-node-core\');NodePlugin.RULES_DIR = path.resolve(__dirname, \'tools\', \'eslint-rules\');const ModuleFindPath = Module._findPath;const hacks = [  \'eslint-plugin-node-core\',  \'eslint-plugin-markdown\',  \'babel-eslint\',];Module._findPath = (request, paths, isMain) => {  const r = ModuleFindPath(request, paths, isMain);  if (!r && hacks.includes(request)) {    try {      return require.resolve(`./tools/node_modules/${request}`);    // Keep the variable in place to ensure that ESLint 

In [9]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    labels,
    test_size=0.33,
    random_state=11,
)

In [10]:
# Classification pipeline: hashed word n-grams -> TF-IDF weighting -> random forest.
model = Pipeline(
    [
        # Hash unigrams and bigrams of the raw text into a fixed-size feature space.
        ("vect", HashingVectorizer(input="content", ngram_range=(1, 2))),
        ("tfidf", TfidfTransformer(use_idf=True)),
        # class_weight="balanced" reweights the classes by their frequency;
        # random_state (same seed as the train/test split) makes the forest,
        # and therefore the reported metrics, reproducible across runs.
        ("rf", RandomForestClassifier(class_weight="balanced", random_state=11)),
    ]
)

In [11]:
# Fit the whole pipeline (vectorizer, TF-IDF, forest) on the training split.
model.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 HashingVectorizer(alternate_sign=True, analyzer='word',
                                   binary=False, decode_error='strict',
                                   dtype=<class 'numpy.float64'>,
                                   encoding='utf-8', input='content',
                                   lowercase=True, n_features=1048576,
                                   ngram_range=(1, 2), norm='l2',
                                   preprocessor=None, stop_words=None,
                                   strip_accents=None,
                                   token_pattern='(?u)\\b\\w\\w+\\b',
                                   tokenizer=None...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight='balanced',
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
               

In [12]:
# Predict class ids for the held-out documents.
y_test_pred = model.predict(X_test)

In [13]:
# First six predicted class ids, for eyeballing against the true labels.
y_test_pred[0:6]

array([-1, -1, -1, -1, -1,  0])

In [14]:
# First six true class ids of the test split, for comparison with the predictions.
y_test[0:6]

[-1, -1, -1, -1, -1, 0]

In [15]:
# Accuracy of the predictions on the held-out test split.
print(accuracy_score(y_test, y_test_pred))

0.9914334665905197


In [16]:
# Per-class breakdown of correct and incorrect predictions on the test split.
print(confusion_matrix(y_test, y_test_pred))

[[1210    0]
 [  15  526]]


In [17]:
def predict_file_types(f):
    """Classify the file at path ``f`` and print the predicted language.

    The file is read as text (undecodable bytes are ignored), its newlines
    are removed, and the resulting single string is passed to the
    module-level ``model``. The predicted class id is translated to a name
    through the module-level ``file_types`` mapping and printed.

    Args:
        f: Path of the file to classify.

    Returns:
        None. The result is only printed.
    """
    with open(f, "r", errors="ignore") as source_file:
        flattened = source_file.read().replace("\n", "")
    # The model expects an iterable of documents, so wrap the single sample.
    predicted = model.predict([flattened])
    print("File Type = ", file_types.get(predicted[0]))

In [18]:
# Try the classifier on a local file named "a".
predict_file_types("a")

File Type =  Python


In [19]:
# Try the classifier on a local file named "b".
predict_file_types("b")

File Type =  JavaScript
