<a href="https://colab.research.google.com/github/JustARandomDude4/Anomaly_detect_Algos/blob/master/Anomaly_2/anomaly_detection_in_repos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# --- Load and preprocess ----------------------------------------------------
# Read the commit log, parse the timestamp, then derive the calendar features
# and the "external contributor" flag used as model inputs further down.
df = pd.read_csv('exampleproject_commits.csv').assign(
    commit_date=lambda frame: pd.to_datetime(frame['commit_date']),
    week_day=lambda frame: frame['commit_date'].dt.weekday,
    hour=lambda frame: frame['commit_date'].dt.hour,
    is_external=lambda frame: frame['contributor_org'].ne('ExampleProject'),
)

# --- Feature engineering ----------------------------------------------------
# Count commits per contributor in weekly buckets (pandas 'W-MON' frequency),
# then average those weekly counts for each contributor.
weekly_commits = (
    df.groupby(['contributor_id', pd.Grouper(key='commit_date', freq='W-MON')])
    .size()
    .reset_index(name='commits')
)
avg_weekly_commits = (
    weekly_commits.groupby('contributor_id')['commits']
    .mean()
    .rename('avg_commits')
    .reset_index()
)

# Attach each contributor's weekly average to every one of their commit rows
# (default inner join on contributor_id).
df = df.merge(avg_weekly_commits, on='contributor_id')

# NOTE(review): 'commits' is not created on `df` anywhere in this cell;
# presumably it is a column already present in the CSV -- confirm.
df['commits_diff'] = df['commits'].sub(df['avg_commits'])

# --- Model training and prediction -----------------------------------------
feature_columns = ['week_day', 'hour', 'is_external', 'commits_diff']
X = df[feature_columns]

# Fixed random_state keeps the forest reproducible across re-runs.
model = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
model.fit(X)

# The model is scored on the same rows it was fitted on.
y_pred = model.predict(X)

# --- Results ----------------------------------------------------------------
# Rows labelled -1 by the model are the ones reported as anomalies.
df['anomaly'] = y_pred
anomalies = df.loc[df['anomaly'].eq(-1)]
num_anomalies = anomalies.shape[0]

print(f'Number of anomalies detected: {num_anomalies}')
if num_anomalies:
    print('Details of anomalies:')
    print(anomalies[['contributor_id', 'commit_id', 'commit_date', 'anomaly']])


Number of anomalies detected: 10
Details of anomalies:
     contributor_id  commit_id         commit_date  anomaly
6                 2          7 2022-01-04 18:00:00       -1
51               11         52 2022-01-04 18:00:00       -1
66               14         67 2022-01-04 18:00:00       -1
91               19         92 2022-01-04 18:00:00       -1
102              21        103 2022-01-08 12:00:00       -1
128              26        129 2022-01-12 06:00:00       -1
144              29        145 2022-01-16 00:00:00       -1
179              36        180 2022-01-16 00:00:00       -1
184              37        185 2022-01-16 00:00:00       -1
191              39        192 2022-01-04 18:00:00       -1


