In [56]:
import pandas as pd

class MetricsCalculator:
    def __init__(self, raw_data):
        """Initializes with raw GitHub data."""
        self.data = raw_data

    def calculate_commit_frequency(self):
        """Calculates commit frequency (commits per day/week/month)."""
        # Assuming raw_data has a 'commit_date' column
        self.data['date'] = pd.to_datetime(self.data['date'])
        commit_freq = self.data.groupby(self.data['date'].dt.to_period('M')).size()
        return commit_freq

    def calculate_pr_merge_rate(self):
        """Calculates PR merge rate (percentage of merged PRs)."""
        # Assuming raw_data has a 'pr_state' column with 'merged' and 'open'
        total_prs = len(self.data[self.data['state'].isin(['open', 'closed'])])
        merged_prs = len(self.data[self.data['state'] == 'closed'])
        merge_rate = (merged_prs / total_prs) * 100
        return merge_rate

    def calculate_issue_resolution_time(self):
        """Calculates average issue resolution time (difference between opened and closed times)."""
        # Assuming raw_data has 'issue_opened' and 'issue_closed' columns
        self.data['issue_opened'] = pd.to_datetime(self.data['issue_opened'])
        self.data['issue_closed'] = pd.to_datetime(self.data['issue_closed'])
        self.data['resolution_time'] = (self.data['issue_closed'] - self.data['issue_opened']).dt.days
        avg_resolution_time = self.data['resolution_time'].mean()
        return avg_resolution_time

In [36]:
if __name__ == "__main__":
    # Load raw data from a CSV for example
    commit_df = pd.read_csv("../data_collection/commits.csv")
    code_reviews_df=pd.read_csv("../data_collection/code_reviews.csv")
    issues_df=pd.read_csv("../data_collection/issues.csv")
    pull_requests_df=pd.read_csv("../data_collection/pull_requests.csv")

In [55]:
close_no = len(pull_requests_df[pull_requests_df["state"] == "closed"])
total_no = 

In [53]:
pull_requests_df[pull_requests_df["state"] == "closed"]

Unnamed: 0,id,state,created_at
0,2034089635,closed,2024-08-23 12:26:18+00:00
1,2034054370,closed,2024-08-23 12:04:02+00:00
2,1985234590,closed,2024-07-24 11:30:53+00:00
3,1966590201,closed,2024-07-12 12:55:49+00:00
4,1948725220,closed,2024-07-02 06:48:59+00:00
5,1842219079,closed,2024-04-26 06:43:07+00:00
7,1779417937,closed,2024-03-19 09:27:59+00:00
9,1661413543,closed,2024-01-02 05:06:35+00:00
10,1578812190,closed,2023-10-30 10:59:22+00:00
11,1560179139,closed,2023-10-17 09:35:05+00:00


In [47]:
df = pull_requests_df

In [48]:
df

Unnamed: 0,id,state,created_at
0,2034089635,closed,2024-08-23 12:26:18+00:00
1,2034054370,closed,2024-08-23 12:04:02+00:00
2,1985234590,closed,2024-07-24 11:30:53+00:00
3,1966590201,closed,2024-07-12 12:55:49+00:00
4,1948725220,closed,2024-07-02 06:48:59+00:00
5,1842219079,closed,2024-04-26 06:43:07+00:00
6,1835999257,open,2024-04-23 06:49:48+00:00
7,1779417937,closed,2024-03-19 09:27:59+00:00
8,1673955022,open,2024-01-11 08:46:24+00:00
9,1661413543,closed,2024-01-02 05:06:35+00:00


In [37]:
calculator = MetricsCalculator(commit_df)
commit_freq = calculator.calculate_commit_frequency()
print("Commit Frequency (monthly):")
print(commit_freq)

Commit Frequency (monthly):
date
2020-01    11
2020-02    31
2020-05     3
2020-06    11
2020-08     3
2020-10     1
2020-11     4
2021-04     9
2022-06     4
2022-07     5
2023-07     1
2023-09     1
2023-10     4
2023-12     1
2024-01     2
2024-03     6
2024-04     2
2024-05     3
2024-06    12
2024-07    18
Freq: M, dtype: int64


  commit_freq = self.data.groupby(self.data['date'].dt.to_period('M')).size()


In [57]:
calculator = MetricsCalculator(pull_requests_df)
pr_merge_rate = calculator.calculate_pr_merge_rate()
print(f"PR Merge Rate: {pr_merge_rate}%")

PR Merge Rate: 90.0%


In [58]:
df

Unnamed: 0,id,state,created_at
0,2034089635,closed,2024-08-23 12:26:18+00:00
1,2034054370,closed,2024-08-23 12:04:02+00:00
2,1985234590,closed,2024-07-24 11:30:53+00:00
3,1966590201,closed,2024-07-12 12:55:49+00:00
4,1948725220,closed,2024-07-02 06:48:59+00:00
5,1842219079,closed,2024-04-26 06:43:07+00:00
6,1835999257,open,2024-04-23 06:49:48+00:00
7,1779417937,closed,2024-03-19 09:27:59+00:00
8,1673955022,open,2024-01-11 08:46:24+00:00
9,1661413543,closed,2024-01-02 05:06:35+00:00


In [None]:
'../data_collection/pull_requests.csv'

In [62]:
import pandas as pd

# Load the CSV file (replace 'pullrequest_data.csv' with your actual file)
df = pd.read_csv('../data_collection/pull_requests.csv')

# Convert 'created_at' column to datetime
df['created_at'] = pd.to_datetime(df['created_at'])

# Define closed_at as current time for closed PRs or still-open PRs
# We'll use the current timestamp for "open" PRs, and for "closed" ones, we'll assume that they are closed now.
df['closed_at'] = df.apply(
    lambda row: pd.Timestamp.now() if row['state'] == 'open' else pd.Timestamp.now(), axis=1
)

# Calculate the duration between 'created_at' and 'closed_at'
df['duration'] = df['closed_at'] - df['created_at']

# Optionally, convert the duration to total days or hours
df['duration_in_days'] = df['duration'].dt.total_seconds() / (24 * 3600)

# Display the DataFrame with the new 'duration' column
print(df[['id', 'state', 'created_at', 'duration', 'duration_in_days']])

# Save the modified data with durations to a new CSV
df.to_csv('pullrequest_data_with_durations.csv', index=False)


TypeError: Cannot subtract tz-naive and tz-aware datetime-like objects.

In [None]:
calculator = MetricsCalculator(issues_df)
issue_resolution_time = calculator.calculate_issue_resolution_time()
print(f"Average Issue Resolution Time: {issue_resolution_time} days")