INITIALIZATION

In [None]:
# Tải các thư viện
%pip install pandas

In [88]:
# Nhập các thư viện
import pandas as pd
import ast

LOADING RAW DATA

In [74]:
# Lấy dữ liệu thô đã lưu trước
commits_df = pd.read_csv('data_raw/commits.csv')
contributors_df = pd.read_csv('data_raw/contributors.csv')
issues_df = pd.read_csv('data_raw/issues.csv', parse_dates=['created_at', 'closed_at'])
pull_requests_df = pd.read_csv('data_raw/pull_requests.csv', parse_dates=['created_at', 'merged_at', 'closed_at'])
languages_df = pd.read_csv('data_raw/languages.csv')
files_df = pd.read_csv('data_raw/files.csv')

DATA CLEANING & PROCESSING

CLEANSE OF COMMITS

In [75]:
# Kiểm tra chung
commits_df

Unnamed: 0,sha,date,author,message,files_changed,files_commited
0,9b6d95c507ce1dbd2c9a368040a2fe5aede08c31,2023-12-30 02:14:39+00:00,Anuken,Bigger stream packet size,1,['core/src/mindustry/net/ArcNetProvider.java']
1,33178c16357ec74094c018ea43fd7e24598419e5,2023-12-29 14:48:05+00:00,Anuken,Added launch hint for Erekir,2,"['core/assets/bundles/bundle.properties', 'cor..."
2,d1d1454a5527cdf68cd9f78f147d57b0338c65bd,2023-12-28 21:36:03+00:00,Anuken,Merge remote-tracking branch 'origin/master',2,"['.github/workflows/push.yml', 'servers_v7.json']"
3,288ab37e6f330b456f6db79fa527dfac1d034079,2023-12-28 21:35:54+00:00,Anuken,Fixed derelict break hint persisting forever,1,['core/src/mindustry/ui/fragments/HintsFragmen...
4,6ce59b74bd46afb78cf543d89e7fd664a6f42e22,2023-12-25 13:56:59+00:00,a-big-fish-fish,Update servers_v7.json (#9408),1,['servers_v7.json']
...,...,...,...,...,...,...
1344,b57cde71482dc2797d473710f5c4ab4cdfb69a22,2023-01-07 16:29:46+00:00,Garen7,Fixes polies helping the enemy team in pvp (#8...,1,['core/src/mindustry/ai/types/BuilderAI.java']
1345,a5fd29e56d03171c5b959a0e1b5820bde72c4f26,2023-01-07 16:28:01+00:00,buthed010203,Fix crash when toString returns null value (#8...,2,['annotations/src/main/java/mindustry/annotati...
1346,c0b819ec9b53dde521ea84080ec0d264ce6e5f01,2023-01-07 16:27:15+00:00,Anuken,Fixed #7952,1,['core/src/mindustry/entities/comp/UnitComp.ja...
1347,30e36d5640030e56c5f002f2274ce57ca19b360b,2023-01-07 16:24:02+00:00,Anuken,Merge remote-tracking branch 'origin/master',2,"['core/assets/maps/stronghold.msav', 'servers_..."


In [76]:
print('Raw data before cleaning: ')
print(commits_df.isnull().sum())
print(commits_df.dtypes)

Raw data before cleaning: 
sha               0
date              0
author            0
message           0
files_changed     0
files_commited    0
dtype: int64
sha               object
date              object
author            object
message           object
files_changed      int64
files_commited    object
dtype: object


In [77]:
# Làm sạch dữ liệu
commits_df = commits_df.dropna(subset=['sha', 'author'])
commits_df['message'] = commits_df['message'].fillna('')
commits_df = commits_df.drop_duplicates(subset='sha')

commits_df['date'] = pd.to_datetime(commits_df['date'])
commits_df.sort_values('date', inplace= True)
commits_df = commits_df.reset_index(drop= True)

In [96]:
# Xử lí dữ liệu

# Tính số commits theo contributors
commits_per_contributor_df = (commits_df['author'].value_counts()).sort_values(ascending=False)

# Tính số commits theo số files thay đổi
files_changed_per_commit_df = commits_df.explode('files_changed')['files_changed'].value_counts().reset_index()
files_changed_per_commit_df.columns = ['file', 'count']

# Tính số commits và contributors cộng dồn theo tháng
commits_df['month'] = commits_df['date'].dt.to_period('M')
commits_per_month = commits_df.groupby('month').size()
contributors_per_month = commits_df.groupby('month')['author'].nunique()
growth_df = pd.DataFrame({
    'Total Commits': commits_per_month.cumsum(),
    'Total Contributors': contributors_per_month.cumsum()
})

# Tạo các edges dựa trên các contributors commit vào cùng một file
selected_commits_df = commits_df.head(400)
selected_commits_df['files_commited'] = selected_commits_df['files_commited'].apply(ast.literal_eval)

file_contributors = {}
for _, row in selected_commits_df.iterrows():
    for file in row['files_commited']:
        if (file not in file_contributors):
            file_contributors[file] = []
        file_contributors[file].append(row['author'])

edges = []
for contributors in file_contributors.values():
    for i in range(len(contributors)):
        for j in range(i + 1, len(contributors)):
            if contributors[i] != contributors[j] and contributors[i] != 'unknown' and contributors[j] != 'unknown':
                edges.append((contributors[i], contributors[j]))

edges_df = pd.DataFrame(edges, columns=['source', 'target'])
edges_df = edges_df.groupby(['source', 'target']).size().reset_index(name='weight')

  commits_per_day_df = commits_df['date'].dt.to_period('D').value_counts().sort_index()
  commits_df['month'] = commits_df['date'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_commits_df['files_commited'] = selected_commits_df['files_commited'].apply(ast.literal_eval)


In [126]:
# Lưu dữ liệu đã xử lí
commits_df.to_csv('data_cleaned/commits.csv', index=False)
commits_per_contributor_df.to_csv('data_cleaned/commits_per_contributor.csv')
files_changed_per_commit_df.to_csv('data_cleaned/files_changed_per_commit.csv', index=False)
growth_df.to_csv('data_cleaned/growth.csv')
edges_df.to_csv('data_cleaned/network_edges.csv', index=False)

CLEANSE OF CONTRIBUTORS

In [21]:
# Kiểm tra chung
contributors_df

Unnamed: 0,login,commits,location,avatar_url
0,Anuken,12032,,https://avatars.githubusercontent.com/u/101000...
1,Quezler,416,Netherlands,https://avatars.githubusercontent.com/u/317927...
2,Epowerj,353,,https://avatars.githubusercontent.com/u/294501...
3,MEEPofFaith,307,Rapidly approaching your location. This is a t...,https://avatars.githubusercontent.com/u/543014...
4,Prosta4okua,267,Ukraine,https://avatars.githubusercontent.com/u/314853...
...,...,...,...,...
376,hexagonrecursion,1,Milky Way,https://avatars.githubusercontent.com/u/526218...
377,0Nera,1,Moscow,https://avatars.githubusercontent.com/u/715364...
378,ARYDESTROYER,1,,https://avatars.githubusercontent.com/u/317764...
379,Arzxq,1,,https://avatars.githubusercontent.com/u/960669...


In [46]:
print("Contributors data before cleaning:")
print(contributors_df.isnull().sum())
print(contributors_df.dtypes)

Contributors data before cleaning:
login                   0
commits                 0
location                0
avatar_url              0
percent_contribution    0
dtype: int64
login                    object
commits                   int64
location                 object
avatar_url               object
percent_contribution    float64
dtype: object


In [None]:
# Làm sạch dữ liệu
contributors_df = contributors_df.dropna(subset=['login', 'avatar_url'])
contributors_df = contributors_df.drop_duplicates(subset='login')

contributors_df['location'] = contributors_df['location'].str.lower()
contributors_df['location'] = contributors_df['location'].fillna('unknown')

In [122]:
# Xử lí dữ liệu

# Tính % đóng góp của mỗi contributor
total_commits = contributors_df['commits'].sum()
contributors_df['percent_contribution'] = (contributors_df['commits'] / total_commits) * 100

# Tính thời gian hoạt động của mỗi contributor
activity_periods_df = commits_df.groupby('author')['date'].agg(['min', 'max'])
activity_periods_df['activity_duration'] = (activity_periods_df['max'] - activity_periods_df['min']).dt.total_seconds() / 3600
# Nhóm các contributors có cùng địa điểm
location_counts_df = contributors_df['location'].value_counts().reset_index()
location_counts_df.columns = ['location', 'contributors_count']

In [123]:
activity_periods_df.dtypes

min                  datetime64[ns, UTC]
max                  datetime64[ns, UTC]
activity_duration                float64
dtype: object

In [124]:
# Lưu dữ liệu đã xử lí
contributors_df.to_csv('data_cleaned/contributors.csv', index= False)
activity_periods_df.to_csv('data_cleaned/activity_periods.csv')
location_counts_df.to_csv('data_cleaned/location_counts.csv', index = False)

CLEANSE OF ISSUES

In [31]:
# Kiểm tra chung
issues_df

Unnamed: 0,number,state,contributor,created_at,closed_at,title,labels
0,9945,open,Galahadagent,2024-06-13 19:11:29+00:00,NaT,While using pixelated graphics messages aren't...,['bug']
1,9944,closed,Vato8090,2024-06-13 12:38:21+00:00,2024-06-13 17:03:44+00:00,Problem,['bug']
2,9942,closed,Redo11,2024-06-13 02:30:48+00:00,2024-06-13 17:18:33+00:00,"Processors get enabled when changing ""map area...",['bug']
3,9941,closed,Makaim20,2024-06-13 00:49:44+00:00,2024-06-13 01:59:54+00:00,Проблема с сохранениями,['bug']
4,9939,closed,Gordost33,2024-06-12 14:04:22+00:00,2024-06-12 21:39:52+00:00,Spawn sector bug,['bug']
...,...,...,...,...,...,...,...
4326,5,closed,skybldev,2017-12-08 04:12:07+00:00,2018-01-18 23:23:26+00:00,Enemy AI stuck in wall AND going wrong way,['unlabeled']
4327,4,closed,skybldev,2017-12-07 02:29:42+00:00,2017-12-30 00:41:01+00:00,[Implemented][Suggestion] A few suggestions th...,['unlabeled']
4328,3,closed,erikbsap,2017-10-07 20:27:05+00:00,2017-10-16 08:13:35+00:00,Doesn't load,['unlabeled']
4329,2,closed,LegusX,2017-05-10 01:57:20+00:00,2017-05-10 12:26:16+00:00,Can't change weapons?,['unlabeled']


In [32]:
print(issues_df.isnull().sum())
print(issues_df.dtypes)

number         0
state          0
contributor    0
created_at     0
closed_at      1
title          0
labels         0
dtype: int64
number                       int64
state                       object
contributor                 object
created_at     datetime64[ns, UTC]
closed_at      datetime64[ns, UTC]
title                       object
labels                      object
dtype: object


In [34]:
# Làm sạch dữ liệu

issues_df = issues_df.drop_duplicates(subset='number')
issues_df['title'] = issues_df['title'].str.lower()

In [47]:
# Xử lí dữ liệu

# Tính số issues đã đóng
closed_issues_df = issues_df[issues_df['closed_at'].notna()]
closed_issues_df['resolution_time'] = (closed_issues_df['closed_at'] - closed_issues_df['created_at']).dt.total_seconds() / 3600

# Tính số issues theo nhãn
label_counts_df = issues_df.explode('labels')['labels'].value_counts().reset_index()
label_counts_df.columns = ['label', 'count']

# Tính số issues theo người đóng góp
issues_per_contributor_df = issues_df['contributor'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  closed_issues_df['resolution_time'] = (closed_issues_df['closed_at'] - closed_issues_df['created_at']).dt.total_seconds() / 3600


In [101]:
# Lưu dữ liệu đã xử lí

issues_df.to_csv('data_cleaned/issues.csv', index=False)
closed_issues_df.to_csv('data_cleaned/closed_issues.csv', index=False)
label_counts_df.to_csv('data_cleaned/label_counts.csv', index=False)
issues_per_contributor_df.to_csv('data_cleaned/issues_per_contributor.csv')

CLEANSE OF PULL REQUESTS

In [33]:
# Kiểm tra chung
pull_requests_df

Unnamed: 0,number,state,created_at,merged_at,closed_at,title
0,9948,open,2024-06-14 15:01:37+00:00,NaT,NaT,Not privileged setrate
1,9947,open,2024-06-13 20:36:52+00:00,NaT,NaT,ConsumeItemList
2,9946,open,2024-06-13 19:12:32+00:00,NaT,NaT,Update servers_v7.json
3,9943,open,2024-06-13 03:15:51+00:00,NaT,NaT,"Sensors, SetProp, and SetRule"
4,9940,open,2024-06-12 15:43:09+00:00,NaT,NaT,Logic Refactor
...,...,...,...,...,...,...
5514,13,closed,2017-12-12 01:22:25+00:00,2017-12-12 04:48:59+00:00,2017-12-12 04:48:59+00:00,Grammatical correction for remaining enemies l...
5515,12,closed,2017-12-11 19:28:55+00:00,NaT,2017-12-11 23:09:07+00:00,Update Tile.java
5516,11,closed,2017-12-11 19:06:21+00:00,NaT,2017-12-11 19:23:42+00:00,Update Block.java
5517,8,closed,2017-12-09 18:46:30+00:00,2017-12-09 20:08:12+00:00,2017-12-09 20:08:12+00:00,TODO list to keep track of some needed things


In [34]:
print(pull_requests_df.isnull().sum())
print(pull_requests_df.dtypes)

number           0
state            0
created_at       0
merged_at     2064
closed_at      180
title            0
dtype: int64
number                      int64
state                      object
created_at    datetime64[ns, UTC]
merged_at     datetime64[ns, UTC]
closed_at     datetime64[ns, UTC]
title                      object
dtype: object


In [None]:
# Làm sạch dữ liệu

pull_requests_df = pull_requests_df.dropna(subset=['number'])
pull_requests_df = pull_requests_df.drop_duplicates(subset='number')

pull_requests_df['title'] = pull_requests_df['title'].str.lower()

In [61]:
# Xử lí dữ liệu

merged_pull_requests_df = pull_requests_df[pull_requests_df['merged_at'].notna()]
merged_pull_requests_df['time_to_merge'] = (pd.to_datetime(merged_pull_requests_df['merged_at']) - pd.to_datetime(merged_pull_requests_df['created_at'])).dt.total_seconds() / 3600

unmerged_pull_requests_df = pull_requests_df[pull_requests_df['merged_at'].isna() & pull_requests_df['closed_at'].notna()]
unmerged_pull_requests_df = unmerged_pull_requests_df.drop('merged_at', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_pull_requests_df['time_to_merge'] = (pd.to_datetime(merged_pull_requests_df['merged_at']) - pd.to_datetime(merged_pull_requests_df['created_at'])).dt.total_seconds() / 3600


In [63]:
# Lưu dữ liệu đã xử lí

pull_requests_df.to_csv('data_cleaned/pull_requests.csv', index=False)
merged_pull_requests_df.to_csv('data_cleaned/merged_pull_requests.csv', index=False)
unmerged_pull_requests_df.to_csv('data_cleaned/unmerged_pull_requests.csv', index=False)

CLEANSE OF PROGRAMMING LANGUAGES

In [39]:
# Kiểm tra chung
languages_df

Unnamed: 0,language,lines
0,Java,5322246
1,GLSL,23164
2,JavaScript,11850
3,Shell,1483
4,Ruby,1140
5,Batchfile,31


In [64]:
# Xử lí dữ liệu
languages_df['language'] = languages_df['language'].apply(lambda x: x if x == 'Java' else 'Other')
languages_df = languages_df.groupby('language', as_index=False).sum()

CLEANSE OF FILES

In [51]:
# Kiểm tra chung
files_df

Unnamed: 0,path,size
0,.gitignore,2563
1,CONTRIBUTING.md,5816
2,ISSUES.md,3289
3,LICENSE,32422
4,README.md,3760
...,...,...
3310,core/src/mindustry/world/blocks/defense/turret...,3894
3311,core/src/mindustry/world/blocks/defense/turret...,1438
3312,core/src/mindustry/world/blocks/defense/turret...,2290
3313,core/src/mindustry/world/blocks/defense/turret...,5813


In [52]:
print("Data before cleaning:")
print(files_df.isnull().sum())
print(files_df.dtypes)

Data before cleaning:
path    0
size    0
dtype: int64
path    object
size     int64
dtype: object


In [66]:
# Làm sạch dữ liệu
files_df = files_df.dropna(subset=['path', 'size'])
files_df = files_df.drop_duplicates(subset='path')

files_df['path'] = files_df['path'].str.lower()
files_df['extension'] = files_df['path'].apply(lambda x: x.split('.')[-1] if '.' in x else 'no_extension')

In [69]:
# Xử lí dữ liệu

# Tính số files theo loại
file_extension_counts_df = files_df['extension'].value_counts()
count_threshold = 0.01 * files_df.shape[0]
file_extension_counts_revised_df = file_extension_counts_df.copy()
file_extension_counts_revised_df[file_extension_counts_df < count_threshold] = 0
file_extension_counts_revised_df['other'] = file_extension_counts_df[file_extension_counts_df < count_threshold].sum()
file_extension_counts_revised_df = file_extension_counts_revised_df[file_extension_counts_revised_df > 0]

# Tính kích thước files theo loại
file_extension_sizes_df = files_df.groupby('extension')['size'].sum()
size_threshold = 0.01 * file_extension_sizes_df.sum()
file_extension_sizes_revised_df = file_extension_sizes_df.copy()
file_extension_sizes_revised_df[file_extension_sizes_df < size_threshold] = 0
file_extension_sizes_revised_df['other'] = file_extension_sizes_df[file_extension_sizes_df < size_threshold].sum()
file_extension_sizes_revised_df = file_extension_sizes_revised_df[file_extension_sizes_revised_df > 0]

In [71]:
# Lưu dữ liệu đã xử lí
files_df.to_csv('data_cleaned/files.csv', index= False)
file_extension_counts_revised_df.to_csv('data_cleaned/file_extension_counts_revised.csv')
file_extension_sizes_revised_df.to_csv('data_cleaned/file_extension_sizes_revised.csv')