<a href="https://colab.research.google.com/github/JustARandomDude4/30DaysJavaScript/blob/master/Anomaly_1/anomoly_detection_in_repos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Load the contributor activity log and derive, per user, the change between
# consecutive records.
data = pd.read_csv("contributor_activity.csv")
data['timestamp'] = pd.to_datetime(data['timestamp'])
# Sort by user and time so that diff() inside each user group compares
# chronologically adjacent records.
data = data.sort_values(by=['user_id', 'timestamp'])
# Seconds elapsed since the same user's previous record (0 for a user's first
# record). total_seconds() is used instead of .dt.seconds because the latter
# yields only the seconds component of the timedelta (0-86399) and drops whole
# days, so a gap of 1 day + 5 minutes would have been reported as 300 seconds.
data['time_diff'] = data.groupby('user_id')['timestamp'].diff().dt.total_seconds().fillna(0)
# Change in commit_count relative to the same user's previous record
# (0 for a user's first record).
data['commit_diff'] = data.groupby('user_id')['commit_count'].diff().fillna(0)

# Tunable limits used by the rule-based checks further down in this cell.
# The first two are applied as multipliers on a user's own standard deviation
# (|value - user mean| > user std * threshold); the third is compared directly
# against the 'added_code_diff' column.
time_diff_threshold = 60 * 60   # 3600 (labelled "1 hour" by the original author)
commit_diff_threshold = 10      # labelled "10 commits" by the original author
added_code_threshold = 0.10     # labelled "10% added code difference"

# Standardize the three features and fit an Isolation Forest on them.
# NOTE(review): 'added_code_diff' is not derived anywhere in this cell, so it is
# presumably a column of the input CSV — confirm.
scaler = StandardScaler()
X = scaler.fit_transform(data[['time_diff', 'commit_diff', 'added_code_diff']])
# random_state pins the forest's random sub-sampling and split selection so
# that Restart & Run All reproduces the same anomaly flags.
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=42)
model.fit(X)

# predict() labels outliers as -1 and inliers as +1; store the result as
# 1 = anomaly, 0 = normal.
data['anomaly'] = (model.predict(X) == -1).astype(int)

# Rule-based checks layered on top of the model's flags. A record is also
# marked anomalous when it deviates strongly from its own user's typical
# behaviour, or when its 'added_code_diff' exceeds the global threshold.
# Computed for all rows at once with groupby.transform instead of a
# per-user / per-row Python loop.
per_user = data.groupby('user_id')

# |value - user mean| > user std * threshold.
# A user with a single record has std = NaN; the comparison is then False, so
# that record is not flagged by the two deviation checks.
# NOTE(review): the multipliers are 10 and 3600 standard deviations
# respectively, so these two checks will rarely if ever fire — confirm whether
# the thresholds were meant as absolute limits rather than std multipliers.
commit_outlier = (
    (data['commit_count'] - per_user['commit_count'].transform('mean')).abs()
    > per_user['commit_count'].transform('std') * commit_diff_threshold
)
time_outlier = (
    (data['time_diff'] - per_user['time_diff'].transform('mean')).abs()
    > per_user['time_diff'].transform('std') * time_diff_threshold
)
added_code_outlier = data['added_code_diff'] > added_code_threshold

# Only rows that belong to a user are eligible for the extra checks.
has_user = data['user_id'].notna()
data.loc[has_user & (commit_outlier | time_outlier | added_code_outlier), 'anomaly'] = 1

# Summarize the flagged records as an absolute count and as a percentage of
# all activity records.
total_count = len(data)
anomaly_count = int((data['anomaly'] == 1).sum())
anomaly_rate = round(anomaly_count / total_count * 100, 2)

print(f"Anomalies detected: {anomaly_count}")
print(f"Total contributor activity records: {total_count}")
print(f"Anomaly rate: {anomaly_rate}%")


Anomalies detected: 5
Total contributor activity records: 10
Anomaly rate: 50.0%


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Read the commit log and parse the commit timestamps.
df = pd.read_csv('exampleproject_commits.csv')
df['commit_date'] = pd.to_datetime(df['commit_date'])

# Derive per-commit features: day of week and hour of the commit, and whether
# the contributor's organisation is anything other than 'ExampleProject'.
commit_ts = df['commit_date'].dt
df = df.assign(
    week_day=commit_ts.weekday,
    hour=commit_ts.hour,
    is_external=df['contributor_org'].ne('ExampleProject'),
)

# Feature engineering: how far a commit's activity level is from its
# contributor's average weekly commit count.
# Commits per contributor per week (weekly bins anchored on Monday).
weekly_commits = (
    df.groupby(['contributor_id', pd.Grouper(key='commit_date', freq='W-MON')])
    .size()
    .reset_index(name='commits')
)
# Mean of those weekly counts for each contributor.
avg_weekly_commits = (
    weekly_commits.groupby('contributor_id')['commits']
    .mean()
    .reset_index(name='avg_commits')
)
df = pd.merge(df, avg_weekly_commits, on='contributor_id')

# 'commits' is read below but was never derived in this cell, so the cell only
# worked when the CSV itself supplied that column. If it is absent, fall back
# to the number of commits the contributor made in the same weekly bin as the
# commit; an existing 'commits' column is left untouched.
if 'commits' not in df.columns:
    df['commits'] = (
        df.groupby(['contributor_id', pd.Grouper(key='commit_date', freq='W-MON')])['commit_date']
        .transform('size')
    )
df['commits_diff'] = df['commits'] - df['avg_commits']

# Fit an Isolation Forest on the engineered commit features and score the same
# rows it was trained on. contamination=0.05 asks the model to label about 5%
# of the rows as outliers; random_state keeps the result repeatable.
feature_columns = ['week_day', 'hour', 'is_external', 'commits_diff']
X = df[feature_columns]

model = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
model.fit(X)

# One label per row of X; the next section treats -1 as "anomaly".
y_pred = model.predict(X)

# Attach the model's labels to the commits and report the rows labelled -1.
df['anomaly'] = y_pred
anomalies = df.loc[df['anomaly'].eq(-1)]
num_anomalies = anomalies.shape[0]

print(f'Number of anomalies detected: {num_anomalies}')
if num_anomalies:
    # Show only the identifying columns of each flagged commit.
    report_columns = ['contributor_id', 'commit_id', 'commit_date', 'anomaly']
    print('Details of anomalies:')
    print(anomalies[report_columns])


Number of anomalies detected: 10
Details of anomalies:
     contributor_id  commit_id         commit_date  anomaly
6                 2          7 2022-01-04 18:00:00       -1
51               11         52 2022-01-04 18:00:00       -1
66               14         67 2022-01-04 18:00:00       -1
91               19         92 2022-01-04 18:00:00       -1
102              21        103 2022-01-08 12:00:00       -1
128              26        129 2022-01-12 06:00:00       -1
144              29        145 2022-01-16 00:00:00       -1
179              36        180 2022-01-16 00:00:00       -1
184              37        185 2022-01-16 00:00:00       -1
191              39        192 2022-01-04 18:00:00       -1


