<a href="https://colab.research.google.com/github/MalikaIT21277122/TimeSeriesAnalysis/blob/main/TimeSeriesAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load datasets
defects = pd.read_csv("CESAW_defect_facts.csv")
components = pd.read_csv("CESAW_Component_Data.csv")
time_logs = pd.read_csv("CESAW_time_fact.csv")
tasks = pd.read_csv("CESAW_task_fact.csv")
size_info = pd.read_csv("CESAW_size_facts.csv")
project_summary = pd.read_csv("CESAW_project_summary.csv")
project_facts = pd.read_csv("CESAW_Project_Data.csv")

print("Project Summary Columns:", project_summary.columns)
print("Time Logs Columns:", time_logs.columns)


# Rename the column in project_summary to match time_logs
project_summary.rename(columns={"project key": "project_key"}, inplace=True)

# Merge the datasets using the corrected column
merged_data = pd.merge(project_summary, time_logs, on="project_key", how="inner")

# Preview merged dataset
print(merged_data.head())



Project Summary Columns: Index(['Org', 'project key', 'team size', 'Start Date', 'End Date',
       'A&M [LoC]', 'Effort [Hours]', 'duration[days]'],
      dtype='object')
Time Logs Columns: Index(['time_log_fact_key', 'organization', 'project_key', 'person_key',
       'team_key', 'wbs_element_key', 'plan_item_key', 'time_log_start_date',
       'time_log_end_date', 'time_log_delta_minutes',
       'time_log_interrupt_minutes', 'phase_key', 'phase_short_name',
       'phase.process_key', 'process_name'],
      dtype='object')
  Org  project_key  team size Start Date End Date  A&M [LoC]  Effort [Hours]  \
0   A          615         48      8-Sep   14-Oct     796887         35091.5   
1   A          615         48      8-Sep   14-Oct     796887         35091.5   
2   A          615         48      8-Sep   14-Oct     796887         35091.5   
3   A          615         48      8-Sep   14-Oct     796887         35091.5   
4   A          615         48      8-Sep   14-Oct     796887       

In [15]:
# Drop unnecessary columns and prepare the dataset
data = merged_data.copy()

# Encode categorical columns
label_encoders = {}
for col in ['phase_short_name', 'process_name']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [16]:
# Generate confidence level based on some rules (example heuristic)
def assign_confidence(row):
    if row['duration[days]'] < 500 and row['Effort [Hours]'] < 10000:
        return "High"
    elif row['duration[days]'] < 1000:
        return "Medium"
    else:
        return "Low"

data['confidence_level'] = data.apply(assign_confidence, axis=1)

In [17]:
# Features and target
X = data[['team size', 'Effort [Hours]', 'duration[days]', 'time_log_delta_minutes', 'phase_short_name', 'process_name']]
y = data['confidence_level']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [18]:
# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

        High       1.00      1.00      1.00     17319
         Low       1.00      1.00      1.00      9429
      Medium       1.00      1.00      1.00     10061

    accuracy                           1.00     36809
   macro avg       1.00      1.00      1.00     36809
weighted avg       1.00      1.00      1.00     36809

