# **Import libraries**

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import *
from sklearn import svm
from sklearn.metrics import classification_report, mean_squared_error, r2_score


print('Libraries imported.')

Libraries imported.


# **Read the CSV from Google drive**

Store the dataset in a Google Drive folder to avoid re-uploading it every time Colab is opened.

1. Mount Google Drive and grant the requested permission;
2. Set the files' path and read the data.


In [None]:
# Colab-specific step: make Google Drive available on the local filesystem.
from google.colab import drive
# '/content/drive' is the prefix of the dataset path used by the loading cell below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset path: folder on the mounted Google Drive that holds the tab-separated CSV files.
path = "/content/drive/MyDrive/data/midterm_1/"

# Blind test set (the rows for which predictions are written at the end of the notebook).
blind_dt = pd.read_csv(path+'blind_test.csv', sep='\t')
print('File %s loaded: %d samples.' %( 'blind_test', len(blind_dt.instructions) ))

# Dataset to train model
mapping_dt = pd.read_csv(path+'mapping_traces_O0.csv', sep='\t')
# Count the rows of mapping_dt itself (not blind_dt) so the reported size matches the loaded file.
print('File %s loaded: %d samples.' %( 'mapping_traces_O0', len(mapping_dt.instructions) ))

File blind_test loaded: 10000 samples.
File mapping_traces_O0 loaded: 10000 samples.


**Join data about instructions and source_line**

Create the two datasets used in the training process:
- X and Y.

X is the concatenation of **instructions** and **source_line**, and Y is the expected output (the **bug** column).

In [None]:
# X: the 'instructions' column added (+) to the 'source_line' column, row by row.
# NOTE(review): presumably both columns hold strings, in which case the two texts are
# concatenated with no separator between them - confirm this is intended.
mapping_dt_x = mapping_dt.loc[:, 'instructions'] + mapping_dt.loc[:, 'source_line']

# Y: the 'bug' column, used as the training target.
mapping_dt_y = mapping_dt.loc[:, 'bug']

print("Series created.")

Series created.


# **Defining Vectorizer for text**


In [None]:
# Selects the text vectorizer: "count" builds a CountVectorizer, "tfid" a TfidfVectorizer.
vectorizer_type = "tfid" # "count" or "tfid"

if vectorizer_type == "count":
  vectorizer = CountVectorizer() # multinomial
elif vectorizer_type == "tfid":
  vectorizer = TfidfVectorizer()
else:
  # Without this branch a typo would leave `vectorizer` undefined, or silently reuse
  # one left in the kernel by an earlier run of this cell.
  raise ValueError(
      'Unknown vectorizer_type %r: expected "count" or "tfid".' % vectorizer_type)

# Learn the vocabulary from the joined training text and turn it into a feature matrix.
X_all = vectorizer.fit_transform(mapping_dt_x)
y_all = mapping_dt_y

# Both shapes must agree on the number of samples (first dimension).
print(X_all.shape)
print(y_all.shape)



(100000, 1235)
(100000,)


# Split data


In [None]:
# Hold out 20% of the vectorized samples for evaluation; the fixed random_state
# makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.20,
    random_state=16,
)

print("Train: %d - Test: %d" % (X_train.shape[0], X_test.shape[0]))


Train: 80000 - Test: 20000


# Create and fit model

In [None]:
from datetime import datetime


def _fit_timed(label, estimator):
  """Fit `estimator` on the training split and report how long the fit took.

  Args:
    label: Model name used in the printed '<label> Model created' message.
    estimator: Unfitted estimator exposing `fit(X, y)`.

  Returns:
    Whatever `estimator.fit(X_train, y_train)` returns (the fitted model).
  """
  start_time = datetime.now()
  fitted = estimator.fit(X_train, y_train)
  # Stop the clock before printing so only the fit itself is measured.
  end_time = datetime.now()
  print('%s Model created' % label)
  print('Duration to fit the model: {}\n'.format(end_time - start_time))
  return fitted


model_type_1 = _fit_timed('Bernoulli', BernoulliNB())
model_type_2 = _fit_timed('Multinomial', MultinomialNB())
# LinearSVC accepts a random_state; seed it (same seed as the train/test split)
# so repeated runs of the notebook produce the same model.
model_type_3 = _fit_timed('Support Vector Machine', svm.LinearSVC(random_state=16))



Bernoulli Model created
Duration to fit the model: 0:00:00.039199

Multinomial Model created
Duration to fit the model: 0:00:00.024033

Support Vector Machine Model created
Duration to fit the model: 0:00:01.212852



# **Evaluation**

In [None]:
def _evaluate(title, model):
  """Predict on the held-out split and print timing, per-class report and accuracy.

  Every model is reported the same way, so the three results are directly comparable.

  Args:
    title: Heading printed before the results.
    model: Fitted classifier exposing `predict(X)`.

  Returns:
    Tuple `(y_pred, accuracy)`: the predicted labels for `X_test` and the
    fraction of them equal to `y_test`.
  """
  print(title)
  start_time = datetime.now()
  y_pred = model.predict(X_test)
  end_time = datetime.now()
  print('Duration to predict: {}'.format(end_time - start_time))

  print(classification_report(y_test, y_pred))
  # Accuracy computed from the predictions already made, instead of calling
  # model.score(), which would predict the whole test set a second time.
  accuracy = np.mean(np.asarray(y_pred) == np.asarray(y_test))
  print("Accuracy %.3f" %accuracy)
  return y_pred, accuracy


y_pred_1, acc_1 = _evaluate('Bernoulli model: ', model_type_1)
print('#####################################################\n')
# -------------------
y_pred_2, acc_2 = _evaluate('\nMultinomial model: ', model_type_2)
print('#####################################################\n')
# -------------------
y_pred_3, acc_3 = _evaluate('Support Vector Machine model: ', model_type_3)

Bernoulli model: 
Duration to predict: 0:00:00.017670
              precision    recall  f1-score   support

           0       0.65      0.87      0.75      9969
           1       0.81      0.54      0.65     10031

    accuracy                           0.71     20000
   macro avg       0.73      0.71      0.70     20000
weighted avg       0.73      0.71      0.70     20000

Accuracy 0.706
#####################################################


Multinomial model: 
              precision    recall  f1-score   support

           0       0.53      0.72      0.61      9969
           1       0.57      0.37      0.45     10031

    accuracy                           0.54     20000
   macro avg       0.55      0.55      0.53     20000
weighted avg       0.55      0.54      0.53     20000

Accuracy 0.544
#####################################################

Support Vector Machine model: 
Accuracy 0.797


# **Prediction and writing txt file**

In [None]:
# Delete useless columns
blind_dt_x = blind_dt.drop(columns=['line_number', 'function_name', 'program'])

with open('1805370.txt', 'w') as f:
  start_time = datetime.now()
  for i in range(blind_dt_x.shape[0]):
    xnew1 = vectorizer.transform(blind_dt_x.iloc[i])
    ynew1 = model_type_3.predict(xnew1)
    f.write(str(ynew1[0]))
    f.write('\n')

  end_time = datetime.now()
  print('Duration to predict the value and write on file: {}'.format(end_time - start_time))
  f.close
  print('File .txt wrote.')


Duration to predict the value and write on file: 0:00:10.243975
File .txt wrote.
