<a href="https://colab.research.google.com/github/NHleza/Week--4/blob/main/Week4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Week 4 Assignment: AI in Software Engineering
# Theme: "Building Intelligent Software Solutions" 💻🤖

# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
import matplotlib.pyplot as plt

# -----------------------------
# Part 1: Automated Bug Detection (Classification)
# -----------------------------

# Synthetic dataset simulating code metrics and bug presence.
# Each position across the lists below describes one code sample.
# Features: lines_of_code, cyclomatic_complexity, num_functions, num_comments
# Target: bug_present (0 = no bug, 1 = bug)
data = {
    'lines_of_code': [100, 250, 150, 300, 400, 120, 500, 350, 200, 450],
    'cyclomatic_complexity': [10, 25, 15, 30, 40, 12, 50, 35, 20, 45],
    'num_functions': [5, 12, 7, 15, 20, 6, 25, 18, 10, 22],
    'num_comments': [20, 30, 25, 35, 40, 22, 45, 38, 28, 42],
    'bug_present': [0, 1, 0, 1, 1, 0, 1, 1, 0, 1],
}

df = pd.DataFrame(data)

# Separate the feature columns from the label column
X = df.drop(columns='bug_present')
y = df['bug_present']

# Hold out 30% of the rows (3 of 10) for evaluation.
# With so few samples an unstratified shuffle can leave a class out of the
# test fold, which makes the per-class report meaningless; stratifying on the
# label keeps both classes present in the train and test folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train Random Forest Classifier (fixed seed so reruns give the same model)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out rows and print per-class precision/recall/F1
y_pred = clf.predict(X_test)
print("Bug Detection Classification Report:\n")
print(classification_report(y_test, y_pred))

# -----------------------------
# Part 2: Code Optimization Suggestion (Regression)
# -----------------------------

# Synthetic dataset pairing code metrics with execution time in milliseconds
data_opt = {
    'lines_of_code': [100, 250, 150, 300, 400, 120, 500, 350, 200, 450],
    'cyclomatic_complexity': [10, 25, 15, 30, 40, 12, 50, 35, 20, 45],
    'num_functions': [5, 12, 7, 15, 20, 6, 25, 18, 10, 22],
    'execution_time_ms': [120, 350, 180, 400, 500, 130, 600, 450, 200, 550],
}

df_opt = pd.DataFrame(data_opt)

# The execution-time column is the target; every other column is a feature
y_opt = df_opt['execution_time_ms']
X_opt = df_opt.drop(columns='execution_time_ms')

# 70/30 train/test split with a fixed seed
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(
    X_opt,
    y_opt,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree random forest regressor on the training rows
reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_opt, y_train_opt
)

# Score the held-out rows with mean squared error
y_pred_opt = reg.predict(X_test_opt)
mse = mean_squared_error(y_test_opt, y_pred_opt)
print(f"\nCode Execution Time Prediction MSE: {mse:.2f}")

# Scatter of actual vs predicted times; the dashed red diagonal is the
# y = x line on which perfect predictions would fall
fig, ax = plt.subplots()
ax.scatter(y_test_opt, y_pred_opt)
ax.set_xlabel("Actual Execution Time (ms)")
ax.set_ylabel("Predicted Execution Time (ms)")
ax.set_title("Code Execution Time Prediction")
diag = [y_test_opt.min(), y_test_opt.max()]
ax.plot(diag, diag, 'r--')
plt.show()

# -----------------------------
# Part 3: NLP for Code Comment Summarization using spaCy
# -----------------------------

# Load spaCy English model (the en_core_web_sm package must already be
# installed in the environment for this call to succeed)
nlp = spacy.load("en_core_web_sm")

# Sample code comment to summarize
comment = """
This function calculates the factorial of a number recursively.
It multiplies the number by the factorial of the number minus one until it reaches one.
"""

# Process comment
doc = nlp(comment)

# Extract noun chunks and verbs as a simple summary heuristic.
# A noun chunk is dropped only when its whole text is a stop word (e.g. a
# lone pronoun); multi-word chunks are always kept.
nouns = [chunk.text for chunk in doc.noun_chunks if chunk.text.lower() not in STOP_WORDS]
verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]

# De-duplicate the verb lemmas while keeping first-seen order. Iterating a
# set of strings yields a different order from one interpreter run to the
# next (string hashes are randomized), which made the summary text
# non-reproducible; dict keys preserve insertion order.
unique_verbs = list(dict.fromkeys(verbs))

summary = "Summary: " + ", ".join(nouns) + ". Key actions: " + ", ".join(unique_verbs) + "."

print("\nCode Comment Summary:")
print(summary)

# Visualize named entities if any (for demonstration).
# NOTE(review): presumably intended to display inline in the notebook;
# confirm the behaviour wanted when this runs outside Jupyter.
displacy.render(doc, style="ent")

# -----------------------------
# Ethical Reflection (to include in report)
# -----------------------------

# NOTE: the triple-quoted text below is a bare string expression, not a
# docstring or comment: it is not assigned to any name and no code here
# reads it. It is kept in the script purely as report material.
# NOTE(review): as the last expression of a notebook cell it would likely be
# echoed as the cell's output; consider moving it to a Markdown cell.
"""
Ethical Reflection:

- Automated bug detection and code optimization models must be trained on diverse, representative datasets to avoid bias toward certain coding styles or languages.
- Transparency in AI decision-making is essential, especially when AI suggests code changes that impact software behavior.
- Privacy of proprietary codebases must be maintained when using AI tools, ensuring data is securely handled.
- AI should augment developers’ expertise, not replace human judgment, to maintain software quality and accountability.
- Continuous monitoring and validation of AI models are required to prevent performance degradation and unintended consequences.
"""

# End of notebook
