# **Exploratory Data Analysis (EDA)**

## Imports

In [248]:
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd
import numpy as np

## Import data

In [249]:
# Locate the cleaned CSV exports. glob() yields entries in whatever order the
# filesystem returns them, so sort the result to keep this listing (and every
# later loop over data_files) reproducible across machines and re-runs.
data_path = Path('../data/cleaned')
data_files = sorted(data_path.glob('*.csv'))
print(f'{len(data_files)} files found:\n')
for file in data_files:
    print(f'\t-{file.name}')

3 files found:

	-cleaned_commits.csv
	-cleaned_info.csv
	-cleaned_issues.csv


In [250]:
# Load each cleaned CSV into its own DataFrame, parsing its timestamp columns.
# Files are matched by a substring of their name; the dict order preserves the
# commits -> issues -> info matching priority.
date_columns_by_dataset = {
    'commits': ['author_date', 'commit_date'],
    'issues': ['created_at', 'closed_at', 'updated_at'],
    'info': ['created_at', 'updated_at', 'pushed_at'],
}

loaded_frames = {}
for file in data_files:
    for dataset, date_columns in date_columns_by_dataset.items():
        if dataset in file.name:
            # `file` was produced by data_path.glob(), so it already includes
            # the directory and can be passed to read_csv directly.
            loaded_frames[dataset] = pd.read_csv(file, parse_dates=date_columns)
            break

# Fail here with a clear message instead of hitting a NameError (or silently
# reusing a stale kernel variable) in a later cell when a file is absent.
missing_datasets = sorted(set(date_columns_by_dataset) - set(loaded_frames))
if missing_datasets:
    raise FileNotFoundError(
        f'No CSV found in {data_path} for: {", ".join(missing_datasets)}'
    )

commits_df = loaded_frames['commits']
issues_df = loaded_frames['issues']
info_df = loaded_frames['info']


In [251]:
# Print the column names, dtypes and non-null counts of the commits table.
commits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   repo_name       500 non-null    object             
 1   sha             500 non-null    object             
 2   author_name     500 non-null    object             
 3   author_email    500 non-null    object             
 4   author_date     500 non-null    datetime64[ns, UTC]
 5   committer_name  500 non-null    object             
 6   commit_date     500 non-null    datetime64[ns, UTC]
 7   message         500 non-null    object             
 8   message_length  500 non-null    int64              
dtypes: datetime64[ns, UTC](2), int64(1), object(6)
memory usage: 35.3+ KB


In [252]:
# Print the column names, dtypes and non-null counts of the issues table.
issues_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   repo_name            500 non-null    object             
 1   issue_id             500 non-null    int64              
 2   issue_number         500 non-null    int64              
 3   title                500 non-null    object             
 4   state                500 non-null    object             
 5   user_login           500 non-null    object             
 6   created_at           500 non-null    datetime64[ns, UTC]
 7   updated_at           500 non-null    datetime64[ns, UTC]
 8   closed_at            250 non-null    datetime64[ns, UTC]
 9   comments             500 non-null    int64              
 10  author_association   500 non-null    object             
 11  labels_count         500 non-null    int64              
 12  issue_age_days       5

In [253]:
# Print the column names, dtypes and non-null counts of the repo info table.
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   repo_name    5 non-null      object             
 1   stars        5 non-null      int64              
 2   forks        5 non-null      int64              
 3   open_issues  5 non-null      int64              
 4   language     5 non-null      object             
 5   description  5 non-null      object             
 6   created_at   5 non-null      datetime64[ns, UTC]
 7   updated_at   5 non-null      datetime64[ns, UTC]
 8   pushed_at    5 non-null      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](3), int64(3), object(3)
memory usage: 492.0+ bytes


## Repo summary

In [284]:
# Per-repository issue statistics: the number of issues and the mean / median
# of issue_age_days.
issues_summary = (
    issues_df
    .groupby('repo_name')
    .agg(
        issue_id_count=('issue_id', 'count'),
        issue_age_days_mean=('issue_age_days', 'mean'),
        issue_age_days_median=('issue_age_days', 'median'),
    )
    .reset_index()
    # The key column carries a trailing underscore ('repo_name_') because the
    # later merge cell joins on that exact name.
    .rename(columns={'repo_name': 'repo_name_'})
)
issues_summary

Unnamed: 0,repo_name_,issue_id_count,issue_age_days_mean,issue_age_days_median
0,apache/airflow,100,6.46,6.0
1,mlflow/mlflow,100,11.23,5.0
2,numpy/numpy,100,12.0,4.5
3,pandas-dev/pandas,100,11.55,10.0
4,scikit-learn/scikit-learn,100,14.9,6.0


In [287]:
# Per-repository commit count (number of sha values per repo_name).
commits_summary = (
    commits_df
    .groupby('repo_name')
    .agg(commit_sha_count=('sha', 'count'))
    .reset_index()
    # Key column is named 'repo_name_' so it lines up with issues_summary.
    .rename(columns={'repo_name': 'repo_name_'})
)
commits_summary


Unnamed: 0,repo_name_,commit_sha_count
0,apache/airflow,100
1,mlflow/mlflow,100
2,numpy/numpy,100
3,pandas-dev/pandas,100
4,scikit-learn/scikit-learn,100


In [275]:
# Per-repository star total (sum of the stars column per repo_name).
info_summary = (
    info_df
    .groupby('repo_name')
    .agg(starsgazers_sum=('stars', 'sum'))
    .reset_index()
)
info_summary

Unnamed: 0,repo_name,starsgazers_sum
0,apache/airflow,43145
1,mlflow/mlflow,22892
2,numpy/numpy,30782
3,pandas-dev/pandas,47063
4,scikit-learn/scikit-learn,63980


In [291]:
# Join the three per-repository summaries into one table, one row per repo.
# Inner joins keep only repositories that appear in every summary.
# validate='one_to_one' makes pandas raise MergeError if a join key is
# duplicated on either side, instead of silently multiplying rows.
summary_df = (
    issues_summary
    .merge(commits_summary, on='repo_name_', how='inner', validate='one_to_one')
    .rename(columns={'repo_name_': 'repo_name'})
    .merge(info_summary, on='repo_name', how='inner', validate='one_to_one')
)
summary_df

Unnamed: 0,repo_name,issue_id_count,issue_age_days_mean,issue_age_days_median,commit_sha_count,starsgazers_sum
0,apache/airflow,100,6.46,6.0,100,43145
1,mlflow/mlflow,100,11.23,5.0,100,22892
2,numpy/numpy,100,12.0,4.5,100,30782
3,pandas-dev/pandas,100,11.55,10.0,100,47063
4,scikit-learn/scikit-learn,100,14.9,6.0,100,63980


In [326]:
# The number of unique contributors, counted as distinct author_name values.
# NOTE(review): one person may commit under several names (the top-5 output
# below lists both 'jorenham' and 'Joren Hammudoglu'), which would inflate
# this count — confirm, and consider deduplicating (e.g. via author_email).
commits_df['author_name'].nunique()

153

In [330]:
# Top 5 contributors: the five author_name values with the most commits.
commits_df['author_name'].value_counts().head(5)

author_name
jorenham            26
Copilot             24
jbrockmendel        22
Daniel Lok          16
Joren Hammudoglu    15
Name: count, dtype: int64