In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install mabwiser




In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Load the dataset
dataset_path = "/content/drive/MyDrive/RiskAssessment/spring.csv"
data = pd.read_csv(dataset_path)

# Preprocess the dataset
# Fill missing values in 'description' with empty strings
data['description'] = data['description'].fillna("")

# Combine 'title' and 'description' into a new column 'text'
data['text'] = data['title'] + " " + data['description']

# Drop unnecessary columns
data = data.drop(columns=['issuekey', 'title', 'description'])

# Handle rare classes in 'storypoint'
# Group rare story points into a single category labeled -1
storypoint_counts = data['storypoint'].value_counts()
rare_classes = storypoint_counts[storypoint_counts < 20].index
data['storypoint'] = data['storypoint'].apply(lambda x: -1 if x in rare_classes else x)

# Check the distribution of the updated 'storypoint'
print("Updated Storypoint Distribution:")
print(data['storypoint'].value_counts())

# Feature transformation using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X = tfidf.fit_transform(data['text'])

# Target variable
y = data['storypoint']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)

# RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_distributions,
    n_iter=20,  # Perform 20 random searches
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Extract the best hyperparameters and the best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Make predictions with the best model
y_pred_initial = best_model.predict(X_test)

# Calculate initial accuracy
initial_accuracy = accuracy_score(y_test, y_pred_initial)
print("Best Parameters:", best_params)
print(f"Initial Accuracy: {initial_accuracy * 100:.2f}%")


Updated Storypoint Distribution:
storypoint
 1     818
 3     755
 2     643
 5     565
 8     397
 4     211
-1      66
 10     44
 6      27
Name: count, dtype: int64
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}
Initial Accuracy: 32.72%


In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from mabwiser.mab import LearningPolicy, MAB

# Load the dataset
dataset_path = "/content/drive/MyDrive/RiskAssessment/spring.csv"
data = pd.read_csv(dataset_path)

# Fill missing values in 'description' by reassigning to the column instead of using inplace=True
data['description'] = data['description'].fillna("")

# Set 'storypoint' as the target variable
X = data.drop(columns=['storypoint'])
y = data['storypoint']

# Text Preprocessing using TF-IDF for 'title' and 'description' columns
column_transformer = ColumnTransformer([
    ('tfidf_title', TfidfVectorizer(), 'title'),
    ('tfidf_description', TfidfVectorizer(), 'description')
], remainder='drop')

X_transformed = column_transformer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Initial Predictions
initial_predictions = clf.predict(X_test)

# Evaluate initial model accuracy
initial_accuracy = accuracy_score(y_test, initial_predictions)
print(f"Initial Prediction Accuracy: {initial_accuracy * 100:.2f}%")

# Define possible actions (adjustments to story points)
actions = [-1, 0, 1]

# Convert actual outcomes to a numpy array for compatibility
actual_outcomes = y_test.to_numpy()

# Calculate rewards based on how close the initial prediction is to the actual outcome
# Reward of 1 if within 1 point of the actual outcome, otherwise -1
rewards = [1 if abs(pred - actual) <= 1 else -1 for pred, actual in zip(initial_predictions, actual_outcomes)]

# Set up Multi-Armed Bandit model with epsilon-greedy policy
mab = MAB(arms=actions, learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0.1))

# Fit the MAB model on the initial predictions and rewards
mab.fit(decisions=initial_predictions, rewards=rewards)

# Refine predictions using MAB - make adjustments based on the bandit's recommendations
refined_predictions = []
for pred in initial_predictions:
    # Get the adjustment recommendation from the MAB model (pass a 2D array as context)
    adjustment = mab.predict([[pred]])  
    # Apply the adjustment to the initial prediction
    refined_predictions.append(pred + adjustment)

# Convert refined predictions to numpy array for accuracy calculation
refined_predictions = np.array(refined_predictions)

# Evaluate the refined predictions
refined_accuracy = accuracy_score(actual_outcomes, refined_predictions)
print(f"Refined Prediction Accuracy: {refined_accuracy * 100:.2f}%")


Initial Prediction Accuracy: 33.00%
Refined Prediction Accuracy: 14.16%


In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Load the dataset
dataset_path = "/content/drive/MyDrive/RiskAssessment/spring.csv"
data = pd.read_csv(dataset_path)

# Preprocess the dataset
data['description'] = data['description'].fillna("")
data['text'] = data['title'] + " " + data['description']
data = data.drop(columns=['issuekey', 'title', 'description'])

# Handle rare classes in 'storypoint'
storypoint_counts = data['storypoint'].value_counts()
rare_classes = storypoint_counts[storypoint_counts < 20].index
data['storypoint'] = data['storypoint'].apply(lambda x: -1 if x in rare_classes else x)

# Map storypoint labels to consecutive integers
unique_storypoints = sorted(data['storypoint'].unique())
label_mapping = {label: idx for idx, label in enumerate(unique_storypoints)}
inverse_mapping = {idx: label for label, idx in label_mapping.items()}
data['storypoint'] = data['storypoint'].map(label_mapping)

# Feature transformation using TF-IDF with trigrams and increased features
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 3), stop_words='english')
X = tfidf.fit_transform(data['text'])

# Target variable
y = data['storypoint']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Hyperparameter tuning for Random Forest
rf_param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')
random_search_rf = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=rf_param_distributions,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
random_search_rf.fit(X_train, y_train)
best_rf_model = random_search_rf.best_estimator_

# Train XGBoost Classifier
xgb_clf = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
xgb_clf.fit(X_train, y_train)

# Ensemble Model: Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', best_rf_model),
    ('xgb', xgb_clf)
], voting='soft')
voting_clf.fit(X_train, y_train)

# Evaluate Models
models = {
    'Random Forest': best_rf_model,
    'XGBoost': xgb_clf,
    'Ensemble': voting_clf
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_original = [inverse_mapping[pred] for pred in y_pred]  
    y_test_original = [inverse_mapping[true] for true in y_test]  
    accuracy = accuracy_score(y_test_original, y_pred_original)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(classification_report(y_test_original, y_pred_original))


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest Accuracy: 75.02%
              precision    recall  f1-score   support

          -1       0.97      0.99      0.98       164
           1       0.44      0.60      0.51       163
           2       0.59      0.52      0.55       164
           3       0.47      0.32      0.38       164
           4       0.89      0.91      0.90       164
           5       0.71      0.55      0.62       163
           6       0.99      1.00      0.99       164
           8       0.69      0.86      0.77       163
          10       1.00      0.99      0.99       164

    accuracy                           0.75      1473
   macro avg       0.75      0.75      0.74      1473
weighted avg       0.75      0.75      0.74      1473

XGBoost Accuracy: 70.40%
              precision    recall  f1-score   support

          -1       0.99      0.95      0.97       164
           1       0.39      0.45      0.42       163
           2       0.52      0.48      0.50       164
           3       0.3

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

# Dictionary to store accuracy scores
accuracy_scores = {}

# Collect accuracy scores for each model
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_original = [inverse_mapping[pred] for pred in y_pred]
    y_test_original = [inverse_mapping[true] for true in y_test]
    accuracy = accuracy_score(y_test_original, y_pred_original)
    accuracy_scores[model_name] = accuracy * 100  # Store percentage value

# Visualization: Accuracy Scores
plt.figure(figsize=(8, 6))
sns.barplot(x=list(accuracy_scores.keys()), y=list(accuracy_scores.values()), palette="viridis")
plt.title("Accuracy Scores of Models")
plt.ylabel("Accuracy (%)")
plt.xlabel("Models")
plt.ylim(0, 100)
plt.show()


best_model_name = max(accuracy_scores, key=accuracy_scores.get) 
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test)
y_pred_original_best = [inverse_mapping[pred] for pred in y_pred_best]
y_test_original_best = [inverse_mapping[true] for true in y_test]

# Confusion Matrix Display
plt.figure(figsize=(10, 8))
ConfusionMatrixDisplay.from_predictions(
    y_test_original_best,
    y_pred_original_best,
    display_labels=sorted(data['storypoint'].unique()),
    cmap="Blues",
    xticks_rotation=45
)
plt.title(f"Confusion Matrix: {best_model_name}")
plt.show()

# Classification Report Metrics
from sklearn.metrics import precision_recall_fscore_support

# Collect metrics
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test_original_best, y_pred_original_best, average=None, zero_division=0
)
labels = sorted(data['storypoint'].unique())

# Plot Precision, Recall, and F1-score
metrics = pd.DataFrame({
    "Label": labels,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1
})

metrics_melted = metrics.melt(id_vars="Label", var_name="Metric", value_name="Score")

plt.figure(figsize=(12, 6))
sns.barplot(data=metrics_melted, x="Label", y="Score", hue="Metric", palette="coolwarm")
plt.title(f"Metrics by Label for {best_model_name}")
plt.ylabel("Score")
plt.xlabel("Labels")
plt.ylim(0, 1)
plt.legend(loc="upper right")
plt.show()



NameError: name 'models' is not defined

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from mabwiser.mab import LearningPolicy, NeighborhoodPolicy, MAB
import pandas as pd
import numpy as np

# Load the dataset
dataset_path = "/content/drive/MyDrive/RiskAssessment/spring.csv"
data = pd.read_csv(dataset_path)

# Preprocess the dataset
data['description'] = data['description'].fillna("")
data['text'] = data['title'] + " " + data['description']
data = data.drop(columns=['issuekey', 'title', 'description'])

# Handle rare classes in 'storypoint'
storypoint_counts = data['storypoint'].value_counts()
rare_classes = storypoint_counts[storypoint_counts < 20].index
data['storypoint'] = data['storypoint'].apply(lambda x: -1 if x in rare_classes else x)

# Feature transformation using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X = tfidf.fit_transform(data['text'])

# Map storypoint labels to consecutive integers
unique_storypoints = sorted(data['storypoint'].unique())
label_mapping = {label: idx for idx, label in enumerate(unique_storypoints)}
inverse_mapping = {idx: label for label, idx in label_mapping.items()}
data['storypoint'] = data['storypoint'].map(label_mapping)

# Target variable
y = data['storypoint']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Train a supervised model (Random Forest Classifier)
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=200)
rf_clf.fit(X_train, y_train)

# Make initial predictions using the supervised model
initial_predictions = rf_clf.predict(X_test)

# Map predictions back to original storypoint labels
initial_predictions_original = [inverse_mapping[pred] for pred in initial_predictions]
y_test_original = [inverse_mapping[true] for true in y_test]

# Evaluate initial predictions
initial_accuracy = accuracy_score(y_test_original, initial_predictions_original)
print(f"Initial Prediction Accuracy (Random Forest): {initial_accuracy * 100:.2f}%")

# Define MAB actions (adjustments to storypoints)
actions = [-1, 0, 1]

# Calculate rewards for initial predictions
rewards = [
    1 if abs(pred - true) <= 1 else -1
    for pred, true in zip(initial_predictions_original, y_test_original)
]

# Set up MAB with epsilon-greedy policy
mab = MAB(
    arms=actions,
    learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0.1)
)

# Train MAB on initial predictions and rewards
mab.fit(decisions=initial_predictions, rewards=rewards)

# Refine predictions using MAB
refined_predictions = []
for pred in initial_predictions:
    # Get adjustment recommendation from MAB
    # adjustment = mab.predict([[pred]])[0]
    adjustment = mab.predict([[pred]])

    refined_predictions.append(pred + adjustment)

# Map refined predictions back to original storypoint labels
refined_predictions_original = [inverse_mapping.get(pred, pred) for pred in refined_predictions]

# Evaluate refined predictions
refined_accuracy = accuracy_score(y_test_original, refined_predictions_original)
print(f"Refined Prediction Accuracy (Hybrid Model): {refined_accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report for Hybrid Model:")
print(classification_report(y_test_original, refined_predictions_original))


Initial Prediction Accuracy (Random Forest): 75.90%
Refined Prediction Accuracy (Hybrid Model): 71.15%

Classification Report for Hybrid Model:
              precision    recall  f1-score   support

          -1       0.94      0.95      0.95       164
           1       0.42      0.53      0.47       163
           2       0.51      0.54      0.53       164
           3       0.45      0.31      0.37       164
           4       0.85      0.87      0.86       164
           5       0.65      0.52      0.58       163
           6       0.92      0.98      0.95       164
           8       0.69      0.78      0.73       163
           9       0.00      0.00      0.00         0
          10       0.97      0.93      0.95       164

    accuracy                           0.71      1473
   macro avg       0.64      0.64      0.64      1473
weighted avg       0.71      0.71      0.71      1473



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np




inverse_mapping = {
    0: 1,
    1: 3,
    2: 2,
    3: 5,
    4: 8,
    5: 4,
    6: -1,
    7: 10,
    8: 6
}

# Define the function to predict storypoints
def predict_storypoint(title, description):
    """
    Predict the storypoint for a given task using the pre-trained Random Forest model.

    :param title: Title of the task
    :param description: Description of the task
    :return: Adjusted predicted storypoint
    """
    # Combine title and description into a single text field
    text = title + " " + description

    # Transform the input text using TF-IDF vectorizer
    input_tfidf = tfidf.transform([text])

    # Predict the initial storypoint using the Random Forest model
    initial_prediction = rf_clf.predict(input_tfidf)[0]

    # Map the initial prediction back to the original storypoint using the inverse mapping
    predicted_storypoint = inverse_mapping.get(initial_prediction, initial_prediction)

    return predicted_storypoint


title_input = "Design for deploying XD on EC2"
description_input = "Create enough of a design to develop additional stories."


predicted_storypoint = predict_storypoint(title_input, description_input)

# Output the result
print(f"Predicted Storypoint for '{title_input}' - '{description_input}': {predicted_storypoint}")

Predicted Storypoint for 'Design for deploying XD on EC2' - 'Create enough of a design to develop additional stories.': 5


In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from mabwiser.mab import LearningPolicy, MAB
import numpy as np



inverse_mapping = {
    0: 1,
    1: 3,
    2: 2,
    3: 5,
    4: 8,
    5: 4,
    6: -1,
    7: 10,
    8: 6
}

# Define the function to predict story points using refined prediction
def predict_storypoint_refined(title, description):
    """
    Predict the story point for a given task using a combination of a Random Forest classifier
    and Multi-Armed Bandit (MAB) adjustments.

    :param title: Title of the task
    :param description: Description of the task
    :return: Refined predicted story point
    """
    # Combine title and description into a single text field
    text = title + " " + description

    # Transform the input text using the TF-IDF vectorizer
    input_tfidf = tfidf.transform([text])

    # Predict the initial story point using the Random Forest model
    initial_prediction = rf_clf.predict(input_tfidf)[0]

    # Get the MAB adjustment for the initial prediction
    adjustment = mab.predict([[initial_prediction]])

    # Apply the adjustment to refine the prediction
    refined_prediction = initial_prediction + adjustment

    # Map the refined prediction back to the original story point using the inverse mapping
    predicted_storypoint = inverse_mapping.get(refined_prediction, refined_prediction)

    return predicted_storypoint


# Example Inputs
title_input = "Design for deploying XD on EC2"
description_input = "Create enough of a design to develop additional stories."

# Predict refined story point
predicted_storypoint = predict_storypoint_refined(title_input, description_input)

# Output the result
print(f"Predicted Storypoint for '{title_input}' - '{description_input}': {predicted_storypoint}")


Predicted Storypoint for 'Design for deploying XD on EC2' - 'Create enough of a design to develop additional stories.': 5
