In [3]:
from modules.layout import layoutAnalysis
from modules.lexical import lexicalAnalysis
from modules.vectorizer import vectorizeFileInLines, vectorizeWholeFile
from modules.misc import getFilesPaths, printTestsPredicts
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import glob
import os
import sys

# Ask for the dataset location and collect every training file with its label.
datasetPath = input("Type the path to the training files: ")
filesPath, filesLabels = getFilesPaths(datasetPath)

# Fail early with a readable message instead of letting an empty dataset
# blow up somewhere inside feature extraction or the train/test split.
if len(filesPath) == 0:
    raise ValueError(f"No training files found in: {datasetPath}")


# vectorizeFileInLines represents each file as a list of its lines, so the
# whole set is a list[list[str]] (one inner list per file).
vectorizedFilesInLines = [vectorizeFileInLines(path) for path in filesPath]
# vectorizeWholeFile represents each file as a single string, so the whole
# set is a list[str] (one string per file).
vectorizedFilesWhole = [vectorizeWholeFile(path) for path in filesPath]

# Build the feature sets for each kind of analysis.
filesLayoutData = layoutAnalysis(vectorizedFilesInLines)
filesLexicalData = lexicalAnalysis(vectorizedFilesWhole)

# Join both feature sets column-wise so each file is described by a single
# row holding its layout features followed by its lexical features.
filesData = np.concatenate((filesLayoutData, filesLexicalData), axis=1)


# Hold out 30% of the files for evaluation.
# NOTE(review): no random_state is set, so the split (and therefore the
# reported accuracy) changes from run to run — pass one if reproducible
# numbers are needed.
x_train, x_test, y_train, y_test = train_test_split(filesData, filesLabels, test_size=0.3)

# Train the model on the files whose label is known. With 5000 trees,
# n_jobs=-1 lets scikit-learn build and query them on all available cores.
model = RandomForestClassifier(n_estimators=5000, n_jobs=-1)
model.fit(x_train, y_train)

# Predict the label of every held-out test file, plus the per-class
# probabilities behind each prediction.
predictedAuthor = model.predict(x_test)
predictedAuthorProb = model.predict_proba(x_test)

# Verbose print
if "-v" in sys.argv:
    printTestsPredicts(y_test, predictedAuthor)

# Accuracy of the model on the held-out test set.
print(f'Accuracy: {accuracy_score(y_test, predictedAuthor)}')

# One row per test file; columns are ordered as in model.classes_.
print(predictedAuthorProb)

Type the path to the training files: C:\Users\andre\Documents\codeStylometryResearch\data\dataset-v3
Accuracy: 0.9333333333333333
[[7.800e-02 8.000e-02 4.580e-02 1.180e-02 6.520e-02 0.000e+00 5.540e-02
  6.560e-01 0.000e+00 7.800e-03]
 [0.000e+00 2.000e-04 2.000e-04 9.152e-01 2.600e-03 2.000e-03 8.200e-03
  4.000e-03 2.920e-02 3.840e-02]
 [0.000e+00 1.520e-02 5.740e-02 2.920e-02 8.600e-03 2.386e-01 7.400e-03
  7.600e-03 2.406e-01 3.954e-01]
 [2.400e-03 7.584e-01 7.800e-03 0.000e+00 1.026e-01 0.000e+00 7.600e-02
  5.240e-02 0.000e+00 4.000e-04]
 [2.000e-04 8.200e-02 2.000e-03 6.000e-04 8.450e-01 0.000e+00 1.260e-02
  4.860e-02 0.000e+00 9.000e-03]
 [3.600e-02 2.000e-04 0.000e+00 7.590e-01 0.000e+00 4.100e-02 1.600e-03
  1.400e-03 1.430e-01 1.780e-02]
 [5.000e-03 4.680e-02 5.200e-03 1.000e-03 8.400e-03 3.000e-03 8.366e-01
  8.920e-02 4.600e-03 2.000e-04]
 [9.796e-01 1.200e-03 2.000e-04 4.800e-03 6.000e-04 4.000e-04 5.800e-03
  2.600e-03 1.600e-03 3.200e-03]
 [6.000e-03 1.288e-01 6.456e-0