<a href="https://colab.research.google.com/github/MalikaIT21277122/TimeSeriesAnalysis/blob/main/Defect_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Row cap applied to every CSV. None reads all rows. A per-file cap truncates
# the three files independently, so they end up covering different projects
# and the per-project totals below become partial; only set it for quick
# smoke tests.
nrows_limit = None

features = ['team_key', 'task_plan_time_minutes', 'size_added_and_modified', 'size_deleted', 'task_actual_time_minutes']
target = 'defect_fix_count'

# How each raw fact column is collapsed to a single value per project.
# NOTE(review): 'sum' treats the counts/times/sizes as additive per project,
# and 'first' assumes a project belongs to exactly one team -- confirm both
# against the CESAW data dictionary.
DEFECT_AGG = {'defect_fix_count': 'sum'}
TASK_AGG = {
    'team_key': 'first',
    'task_plan_time_minutes': 'sum',
    'task_actual_time_minutes': 'sum',
}
SIZE_AGG = {'size_added_and_modified': 'sum', 'size_deleted': 'sum'}

# Below this many projects an 80/20 split leaves fewer than two test rows,
# for which the R² score is not defined.
MIN_PROJECTS = 10


def aggregate_by_project(facts, agg_spec):
    """Collapse a fact table to exactly one row per ``project_key``.

    Args:
        facts: DataFrame with a ``project_key`` column plus every column
            named in ``agg_spec``.
        agg_spec: Mapping of column name to pandas aggregation name.

    Returns:
        DataFrame with ``project_key`` as a regular column, one row per
        distinct key (sorted by key), and one column per ``agg_spec`` entry.
    """
    return facts.groupby('project_key', as_index=False).agg(agg_spec)


def build_project_table(defect_facts, task_facts, size_facts):
    """Build a one-row-per-project modelling table from the three fact tables.

    Merging the raw tables directly on ``project_key`` is a many-to-many join:
    every defect row is paired with every task row and every size row of the
    same project, which multiplies the row count and duplicates samples.
    Each table is therefore aggregated to project level first.

    The join starts from the defect table (left joins), so every output row
    has a real target value; projects without task or size records keep NaN
    in those columns.

    Args:
        defect_facts: Raw defect rows (``project_key``, ``defect_fix_count``).
        task_facts: Raw task rows (``project_key``, ``team_key`` and the two
            task time columns).
        size_facts: Raw size rows (``project_key`` and the two size columns).

    Returns:
        DataFrame with one row per project present in ``defect_facts``.

    Raises:
        pandas.errors.MergeError: If a join is unexpectedly not one-to-one.
    """
    defects = aggregate_by_project(defect_facts, DEFECT_AGG)
    tasks = aggregate_by_project(task_facts, TASK_AGG)
    sizes = aggregate_by_project(size_facts, SIZE_AGG)

    # validate='one_to_one' makes pandas fail loudly if the keys are ever
    # duplicated again instead of silently exploding the row count.
    merged = pd.merge(defects, tasks, on='project_key', how='left', validate='one_to_one')
    merged = pd.merge(merged, sizes, on='project_key', how='left', validate='one_to_one')
    return merged


# The pipeline runs when the file is executed as a script or notebook cell
# (where __name__ is "__main__"); importing the file only defines the helpers.
if __name__ == "__main__":
    # Step 1: Load the datasets
    defect_facts = pd.read_csv("CESAW_defect_facts.csv", usecols=['project_key', 'defect_fix_count'], nrows=nrows_limit)
    task_facts = pd.read_csv("CESAW_task_fact.csv", usecols=['project_key', 'team_key', 'task_plan_time_minutes', 'task_actual_time_minutes'], nrows=nrows_limit)
    size_facts = pd.read_csv("CESAW_size_facts.csv", usecols=['project_key', 'size_added_and_modified', 'size_deleted'], nrows=nrows_limit)

    # Step 2: Aggregate to project level and merge
    merged_data = build_project_table(defect_facts, task_facts, size_facts)

    # Debug: Print merged dataset
    print(merged_data.head())
    print(f"Merged Data Shape: {merged_data.shape}")

    # Step 3: Prepare the data
    missing_columns = [col for col in features + [target] if col not in merged_data.columns]
    if missing_columns:
        raise ValueError(
            "One or more required columns are missing from the merged dataset: "
            + ", ".join(missing_columns)
        )

    X = merged_data[features]
    y = merged_data[target]

    # Debug: Check feature and target statistics
    print("Selected Features Columns:", features)
    print(f"Target Column: {target}")
    print("Number of Non-Null Records in X:", X.notnull().sum())
    print("Number of Non-Null Records in y:", y.notnull().sum())

    # Projects with no task/size records get 0 for those features. The target
    # needs no filling: every row originates from the defect table.
    X = X.fillna(0)

    # Ensure there is enough data for a meaningful train/test split
    if len(X) < MIN_PROJECTS:
        raise ValueError("The dataset has insufficient data after processing. Please check the input files.")

    # Step 4: Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 5: Train the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Step 6: Predict and evaluate the model
    y_pred = rf_model.predict(X_test)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print the metrics
    print("Model: Random Forest (Defect Prediction)")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R² Score: {r2}")


   project_key  defect_fix_count  team_key  task_actual_time_minutes  \
0           23               1.0       NaN                       NaN   
1           23               1.0       NaN                       NaN   
2           23               1.0       NaN                       NaN   
3           23               1.0       NaN                       NaN   
4           23               1.0       NaN                       NaN   

   task_plan_time_minutes  size_added_and_modified  size_deleted  
0                     NaN                      1.0           0.0  
1                     NaN                      2.0           0.0  
2                     NaN                     10.0           0.0  
3                     NaN                     16.0          10.0  
4                     NaN                      5.0           0.0  
Merged Data Shape: (37179505, 7)
Selected Features Columns: ['team_key', 'task_plan_time_minutes', 'size_added_and_modified', 'size_deleted', 'task_actual_time_minut