In [1]:
!pip install tensorflow scikit-learn pandas numpy




In [2]:
import pandas as pd

# Read the labelled snippets from the Colab content directory
data = pd.read_csv(filepath_or_buffer='/content/sample_bug_dataset.csv')

# Preview the leading five rows as plain text
print(data.head(n=5))

                       code_snippet          bug_type
0      for i in range(10): print(i)            No Bug
1                        x = 10 / 0  Division By Zero
2         if x = 5: print('x is 5')      Syntax Error
3  list = [1, 2, 3]\nprint(list[5])       Index Error
4    def func()\n    print('Hello')      Syntax Error


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Learn a vocabulary over every snippet, encode each snippet as a list of
# integer ids, and zero-pad on the right so all rows share one length.
# NOTE(review): Tokenizer() is used with its default filtering/lowercasing;
# confirm that is the intended treatment for source-code text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['code_snippet'])
X = pad_sequences(tokenizer.texts_to_sequences(data['code_snippet']), padding='post')

# Number the distinct bug types in order of first appearance, then turn the
# per-row ids into one-hot target vectors.
labels = dict((name, idx) for idx, name in enumerate(data['bug_type'].unique()))
y = to_categorical(
    np.array([labels[name] for name in data['bug_type']]),
    num_classes=len(labels),
)

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the training-array dimensions
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


X_train shape: (4, 7)
y_train shape: (4, 4)


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

# Seed Python, NumPy and the Keras backend before any weights are created so
# that repeated runs of this cell produce comparable training curves.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Define model: token ids -> embeddings -> LSTM -> dense classifier head
model = Sequential()
# input_dim is the vocabulary size plus one extra row for the 0 padding id.
# The deprecated `input_length` argument is intentionally not passed; the
# sequence length is taken from the data the first time the model is called.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128))  # Word embeddings
model.add(LSTM(128, return_sequences=False))  # LSTM layer; only the final state is kept
model.add(Dense(64, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))  # Output layer: one probability per class

# Compile model (categorical cross-entropy pairs with one-hot targets)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model. Binding the result to `history` keeps the per-epoch metrics
# available and stops the cell from ending with a bare History repr.
# NOTE(review): the held-out split doubles as validation data here, so the
# later test score is not measured on data unseen during training monitoring.
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 1.3887 - val_accuracy: 0.0000e+00 - val_loss: 1.3908
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - accuracy: 0.7500 - loss: 1.3712 - val_accuracy: 0.0000e+00 - val_loss: 1.3996
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 1.0000 - loss: 1.3547 - val_accuracy: 0.0000e+00 - val_loss: 1.4121
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 1.0000 - loss: 1.3376 - val_accuracy: 0.0000e+00 - val_loss: 1.4294
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 1.0000 - loss: 1.3189 - val_accuracy: 0.0000e+00 - val_loss: 1.4488
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 1.0000 - loss: 1.2977 - val_accuracy: 0.0000e+00 - val_loss: 1.4700
Epoch 7/10
[1m1/1[0m [3

<keras.src.callbacks.history.History at 0x7c68cdb19510>

In [5]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled accuracy metric.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0000e+00 - loss: 1.5976
Test Accuracy: 0.0000


In [6]:
def predict_bug_type(code_snippet):
    """Return the bug-type label the trained model assigns to a snippet.

    The snippet is encoded with the notebook-level ``tokenizer``, post-padded
    to the training sequence length (``X_train.shape[1]``) and scored by
    ``model``. The position of the highest score is mapped back to a label
    name through the key order of ``labels``.

    Args:
        code_snippet: Source code to classify, as a single string.

    Returns:
        The key of ``labels`` at the highest-scoring position.
    """
    encoded = tokenizer.texts_to_sequences([code_snippet])
    padded = pad_sequences(encoded, maxlen=X_train.shape[1], padding='post')
    scores = model.predict(padded)
    # Keys were inserted in class-id order, so position i names class i
    label_names = list(labels)
    return label_names[np.argmax(scores)]

# Try the predictor on a snippet that uses `=` where a comparison is needed
new_code = "x = 5\nif x = 10: print('x is 10')"
predicted_bug = predict_bug_type(code_snippet=new_code)
print(f"Predicted Bug Type: {predicted_bug}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
Predicted Bug Type: Syntax Error


In [7]:
import pickle

# Write the trained network to disk (the loader cell expects this filename)
model.save("bug_detector_model.h5")

# Serialise the fitted tokenizer so inference can reuse the same vocabulary
with open("tokenizer.pkl", "wb") as f:
    f.write(pickle.dumps(tokenizer))

# Serialise the label -> class-id mapping alongside it
with open("labels.pkl", "wb") as f:
    f.write(pickle.dumps(labels))

print("Model and tokenizer saved successfully!")




Model and tokenizer saved successfully!


In [8]:
from tensorflow.keras.models import load_model

# Restore the network written by the save cell
loaded_model = load_model("bug_detector_model.h5")

# Restore the tokenizer and the label mapping.
# NOTE: unpickling executes arbitrary code; only load files this notebook wrote.
with open("tokenizer.pkl", "rb") as f:
    loaded_tokenizer = pickle.loads(f.read())
with open("labels.pkl", "rb") as f:
    loaded_labels = pickle.loads(f.read())

print("Model loaded successfully!")




Model loaded successfully!


In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pickle

# Read the labelled snippets; the path must point at the uploaded CSV
data = pd.read_csv(filepath_or_buffer='/content/sample_bug_dataset.csv')

# Fit a vocabulary on the snippets, encode them as integer ids, and zero-pad
# on the right so every row has the same length.
# NOTE(review): Tokenizer() is used with its default filtering/lowercasing;
# confirm that is the intended treatment for source-code text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['code_snippet'])
X = pad_sequences(tokenizer.texts_to_sequences(data['code_snippet']), padding='post')

# Number the distinct bug types in order of first appearance and one-hot
# encode the per-row class ids.
labels = dict((name, idx) for idx, name in enumerate(data['bug_type'].unique()))
y = to_categorical(
    np.array([labels[name] for name in data['bug_type']]),
    num_classes=len(labels),
)

# Hold out 20% of the rows with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seed Python, NumPy and the Keras backend before any weights are created so
# that repeated runs produce comparable training curves.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Define model: token ids -> embeddings -> LSTM -> dense classifier head
model = Sequential()
# input_dim is the vocabulary size plus one extra row for the 0 padding id.
# The deprecated `input_length` argument is intentionally not passed; the
# sequence length is taken from the data the first time the model is called.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128))  # Word embeddings
model.add(LSTM(128, return_sequences=False))  # LSTM layer; only the final state is kept
model.add(Dense(64, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))  # Output layer: one probability per class

# Compile model (categorical cross-entropy pairs with one-hot targets)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, keeping the per-epoch metrics in `history` for inspection.
# NOTE(review): the held-out split doubles as validation data here, so the
# test score below is not measured on data unseen during training monitoring.
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))

# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled accuracy metric.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Persist everything inference needs: the network, the fitted tokenizer and
# the label -> class-id mapping.
model.save("bug_detector_model.h5")
with open("tokenizer.pkl", "wb") as f:
    f.write(pickle.dumps(tokenizer))
with open("labels.pkl", "wb") as f:
    f.write(pickle.dumps(labels))

print("Model and tokenizer saved successfully!")

# Function to predict bug type for new code snippet
def predict_bug_type(code_snippet):
    """Classify one code snippet and return the name of its predicted bug type.

    Uses the notebook-level ``tokenizer`` to encode the text, post-pads the
    ids to ``X_train.shape[1]`` positions, runs ``model.predict`` on the
    result, and looks the winning position up in the keys of ``labels``.

    Args:
        code_snippet: Source code to classify, as a single string.

    Returns:
        The key of ``labels`` at the highest-scoring position.
    """
    token_ids = tokenizer.texts_to_sequences([code_snippet])
    model_input = pad_sequences(token_ids, maxlen=X_train.shape[1], padding='post')
    best_position = np.argmax(model.predict(model_input))
    # Keys were inserted in class-id order, so position i names class i
    return list(labels)[best_position]

# Try the predictor on a snippet that uses `=` where a comparison is needed
new_code = "x = 5\nif x = 10: print('x is 10')"
predicted_bug = predict_bug_type(code_snippet=new_code)
print(f"Predicted Bug Type: {predicted_bug}")


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.2500 - loss: 1.3857 - val_accuracy: 0.0000e+00 - val_loss: 1.3950
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 1.0000 - loss: 1.3669 - val_accuracy: 0.0000e+00 - val_loss: 1.4033
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 1.0000 - loss: 1.3512 - val_accuracy: 0.0000e+00 - val_loss: 1.4120
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 1.0000 - loss: 1.3345 - val_accuracy: 0.0000e+00 - val_loss: 1.4234
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 1.0000 - loss: 1.3164 - val_accuracy: 0.0000e+00 - val_loss: 1.4353
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 1.0000 - loss: 1.2961 - val_accuracy: 0.0000e+00 - val_loss: 1.4482
Epoch 7/10
[1m1/1[0m [32m━━



Test Accuracy: 0.0000
Model and tokenizer saved successfully!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
Predicted Bug Type: Syntax Error
