In [2]:
import pandas as pd  # panda data handling
from sklearn.model_selection import KFold  # k-fold cross validation

In [4]:
## Accessing Data
# Tasks 1-4: open the file, list the column header names, count the columns, count the rows.
github = pd.read_csv("github_teams.csv")  # one row per team, read from the working directory
# DataFrame.shape is (rows, columns); index 1 is the column count, index 0 the row count.
print("\nColumn names:\n", list(github.columns))
print("\nNumber of columns:", github.shape[1])
print("\nNumber of rows:", github.shape[0])


Column names:
 ['name_h', 'Team_type', 'Team_size_class', 'human_members_count', 'bot_members_count', 'human_work', 'work_per_human', 'human_gini', 'human_Push', 'human_IssueComments', 'human_PRReviewComment', 'human_MergedPR', 'bot_work', 'bot_Push', 'bot_IssueComments', 'bot_PRReviewComment', 'bot_MergedPR', 'eval_survival_day_median', 'issues_count']

Number of columns: 19

Number of rows: 608


In [20]:
# Task 5: determine which columns are categorical and convert them from *object* to *category*.
categorical_cols = ["Team_type", "Team_size_class"]
# One astype call with a column -> dtype mapping converts both columns together.
github = github.astype({col: "category" for col in categorical_cols})
print("\nDtype after conversion:\n", github[categorical_cols].dtypes)  # confirm both are now category


Dtype after conversion:
 Team_type          category
Team_size_class    category
dtype: object


In [8]:
# Tasks 6 & 7: how many unique values do `Team_type` and `Team_size_class` have?
for column in ("Team_type", "Team_size_class"):
    # nunique() counts the distinct values in the column
    print(f"\nUnique values in {column}:", github[column].nunique())


Unique values in Team_type: 2

Unique values in Team_size_class: 3


In [10]:
# Tasks 8 & 9: value at the 63rd row / 6th column, and all values of the 300th row.
# iloc positions are zero-based, so the Nth row or column sits at position N - 1.
cell_63_6 = github.iloc[63 - 1, 6 - 1]
row_300 = github.iloc[300 - 1]
print("\n63rd row, 6th column:", cell_63_6)
print("\n300th row values:\n", row_300)


63rd row, 6th column: 35

300th row values:
 name_h                      IyfocAGfAHLncCVJUujqTA/A_QZ6HlUb5sRQHhPa7SGzQ
Team_type                                                       human-bot
Team_size_class                                                    Medium
human_members_count                                                     4
bot_members_count                                                       1
human_work                                                           1049
work_per_human                                                     262.25
human_gini                                                       0.448761
human_Push                                                            739
human_IssueComments                                                   213
human_PRReviewComment                                                  91
human_MergedPR                                                          6
bot_work                                                          

In [12]:
# Task 10: using three different methods, select the row with index value 595
# together with the 1st, 2nd and 3rd columns.
first_three_names = github.columns[0:3]  # labels of the first three columns, for the .loc lookup
print("\nRow 595 (iloc):\n", github.iloc[595, :3])  # way 1: positional slice
print("\nRow 595 (loc with column names):\n", github.loc[595, first_three_names])  # way 2: label based
print("\nRow 595 (iloc list):\n", github.iloc[595, list(range(3))])  # way 3: explicit position list


Row 595 (iloc):
 name_h             zAh1NECRCquqUJ_-1d6hAw/DET3jTK8hokYfY_neJ1IVQ
Team_type                                              human-bot
Team_size_class                                            Small
Name: 595, dtype: object

Row 595 (loc with column names):
 name_h             zAh1NECRCquqUJ_-1d6hAw/DET3jTK8hokYfY_neJ1IVQ
Team_type                                              human-bot
Team_size_class                                            Small
Name: 595, dtype: object

Row 595 (iloc list):
 name_h             zAh1NECRCquqUJ_-1d6hAw/DET3jTK8hokYfY_neJ1IVQ
Team_type                                              human-bot
Team_size_class                                            Small
Name: 595, dtype: object


In [14]:
# Task 11: using two different methods, select the row with index value 46
# together with the 3rd and 7th columns.
wanted_positions = [2, 6]  # zero-based positions of the 3rd and 7th columns
wanted_names = [github.columns[pos] for pos in wanted_positions]  # the same columns, by label
print("\nRow 46 (iloc):\n", github.iloc[46, wanted_positions])  # way 1: positions
print("\nRow 46 (loc):\n", github.loc[46, wanted_names])  # way 2: labels


Row 46 (iloc):
 Team_size_class       Medium
work_per_human     31.833333
Name: 46, dtype: object

Row 46 (loc):
 Team_size_class       Medium
work_per_human     31.833333
Name: 46, dtype: object


In [22]:
# Task 12: create a new DataFrame for the column `bot_work` using two different methods.
bot_work_df1 = github[["bot_work"]]               # method 1: a list selector returns a DataFrame
bot_work_df2 = pd.DataFrame(github["bot_work"])   # method 2: wrap the selected Series in a DataFrame
# Preview each result to confirm both hold the same single column.
for label, frame in (("bot_work_df1", bot_work_df1), ("bot_work_df2", bot_work_df2)):
    print(f"\nFirst 5 from {label}:\n", frame.head())


First 5 from bot_work_df1:
    bot_work
0        43
1         0
2         0
3      1972
4       302

First 5 from bot_work_df2:
    bot_work
0        43
1         0
2         0
3      1972
4       302


In [26]:
## Sorting and Ordering data
# 1. Select `human-bot` teams that have a `bot_members_count` value greater than or equal to 2.
is_human_bot = github["Team_type"] == "human-bot"
has_two_plus_bots = github["bot_members_count"] >= 2
filter1 = github[is_human_bot & has_two_plus_bots]  # rows meeting both conditions
print("\nHuman-bot teams with 2+ bots:\n", len(filter1))  # number of matching teams


Human-bot teams with 2+ bots:
 24


In [28]:
# 2. Find the `human` teams that are `Large` and have a `human_gini` value greater than or equal to 0.75.
filter2 = github.loc[
    github["Team_type"].eq("human")            # team type condition
    & github["Team_size_class"].eq("Large")    # size class condition
    & github["human_gini"].ge(0.75)            # gini threshold (inclusive)
]
print("\nLarge human teams with gini >= 0.75:\n", filter2.shape[0])  # number of matching teams


Large human teams with gini >= 0.75:
 4


In [30]:
# 3. How many teams are in the `Small` or `Large` category?
size_class = github["Team_size_class"]
filter3 = github[(size_class == "Small") | (size_class == "Large")]  # either size class
print("\nTeams in Small or Large category:", len(filter3))  # number of matching teams


Teams in Small or Large category: 428


In [32]:
# 4. How many teams are in the `Small` or `Large` category with a `human_gini` value less than or equal to 0.20?
small_or_large = github["Team_size_class"].isin(["Small", "Large"])
low_gini = github["human_gini"] <= 0.20
filter4 = github.loc[small_or_large & low_gini]  # both conditions must hold
print("\nTeams in Small or Large with gini <= 0.20:", len(filter4))  # number of matching teams


Teams in Small or Large with gini <= 0.20: 66


In [34]:
# 5. How many `human-bot` teams are in the `Medium` category?
filter5 = github.loc[
    github["Team_type"].eq("human-bot") & github["Team_size_class"].eq("Medium")
]
print("\nHuman-bot Medium teams:", filter5.shape[0])  # number of matching teams


Human-bot Medium teams: 84


In [40]:
# 6. Create a subsample of 50% of your data.
sample_fraction = 0.5  # share of rows to draw
sample_seed = 1        # fixed seed so the same rows are drawn on every run
sample_50 = github.sample(frac=sample_fraction, random_state=sample_seed)
print("50% Sample Preview:", sample_50.head())  # first rows of the subsample

50% Sample Preview:                                             name_h  Team_type Team_size_class  \
65   3VFbLRx-am2PA7KH0P_qQQ/JhvGB8Nzuc1DqysPsJ_2EA  human-bot           Small   
237  FJmB0zbVT0ileOMUPtWRIQ/v2hyhTxDNjcQKAdrTbpb-g  human-bot           Small   
147  bi5TY2Z4OSQq3PMs6JnKYA/9b9IqkDK14ketwn88f3hKA  human-bot           Small   
465  spL8LX3lBfeOPK4bBuqFSA/bkk-seJ9inTX7FeK-3cKzw      human          Medium   
187  dONQnlabJ76NyiesZDhgaw/5Z26n_sfY_gu_ELgILfddQ  human-bot           Small   

     human_members_count  bot_members_count  human_work  work_per_human  \
65                     2                  1          79       39.500000   
237                    3                  1           8        2.666667   
147                    3                  2         189       63.000000   
465                    6                  0         322       53.666667   
187                    3                  1         195       65.000000   

     human_gini  human_Push  human_IssueCo

In [50]:
# 7. Create samples for an 8-fold cross validation test.
# The splitter only yields row positions, so each train/test pair is materialised
# as DataFrames and kept in `cv_folds`; otherwise every split is discarded once
# the loop moves on and no samples actually exist afterwards.
kf = KFold(n_splits=8)  # no shuffling requested: test folds are consecutive row blocks
cv_folds = []           # one (train_df, test_df) tuple per fold
print("\nK-Fold Splits:")
for train_index, test_index in kf.split(github):
    # positions from the splitter are zero-based, hence iloc
    cv_folds.append((github.iloc[train_index], github.iloc[test_index]))
    print("Train:", train_index[:5], "Test:", test_index[:5])  # preview first five positions

# Sanity check: the eight test folds together cover every row.
assert sum(len(test_df) for _, test_df in cv_folds) == len(github)



K-Fold Splits:
Train: [76 77 78 79 80] Test: [0 1 2 3 4]
Train: [0 1 2 3 4] Test: [76 77 78 79 80]
Train: [0 1 2 3 4] Test: [152 153 154 155 156]
Train: [0 1 2 3 4] Test: [228 229 230 231 232]
Train: [0 1 2 3 4] Test: [304 305 306 307 308]
Train: [0 1 2 3 4] Test: [380 381 382 383 384]
Train: [0 1 2 3 4] Test: [456 457 458 459 460]
Train: [0 1 2 3 4] Test: [532 533 534 535 536]


In [60]:
# 8. Select columns that are numeric and save them as a new DataFrame.
numeric_kinds = ["number"]  # dtype selector passed to select_dtypes
github_numeric = github.select_dtypes(include=numeric_kinds)
print("\nNumeric columns preview:\n", github_numeric.head())  # first rows of the numeric-only frame


Numeric columns preview:
    human_members_count  bot_members_count  human_work  work_per_human  \
0                    2                  1          66       33.000000   
1                    2                  0          62       31.000000   
2                    7                  0         211       30.142857   
3                  234                 12       14579       62.303419   
4                   38                  8        1625       42.763158   

   human_gini  human_Push  human_IssueComments  human_PRReviewComment  \
0    0.287879          29                   33                      4   
1    0.467742          62                    0                      0   
2    0.499661         194                   16                      1   
3    0.738342        1942                11430                   1170   
4    0.666607         203                 1270                    152   

   human_MergedPR  bot_work  bot_Push  bot_IssueComments  bot_PRReviewComment  \
0             

In [64]:
# 9. Remove the columns `bot_PRReviewComment` and `bot_MergedPR` from the DataFrame.
columns_to_remove = ["bot_PRReviewComment", "bot_MergedPR"]
# drop returns a new frame, so `github_numeric` itself keeps both columns
github_numeric_clean = github_numeric.drop(columns_to_remove, axis="columns")
print("Numeric columns after drop:", github_numeric_clean.head())  # confirm the two columns are gone

Numeric columns after drop:    human_members_count  bot_members_count  human_work  work_per_human  \
0                    2                  1          66       33.000000   
1                    2                  0          62       31.000000   
2                    7                  0         211       30.142857   
3                  234                 12       14579       62.303419   
4                   38                  8        1625       42.763158   

   human_gini  human_Push  human_IssueComments  human_PRReviewComment  \
0    0.287879          29                   33                      4   
1    0.467742          62                    0                      0   
2    0.499661         194                   16                      1   
3    0.738342        1942                11430                   1170   
4    0.666607         203                 1270                    152   

   human_MergedPR  bot_work  bot_Push  bot_IssueComments  \
0               0        43       

In [66]:
# 10. Save the columns `Team_size_class` and `human_members_count` as a new DataFrame.
team_columns = ["Team_size_class", "human_members_count"]
team_info = github.loc[:, team_columns]  # all rows, just these two columns
print("Team info preview:", team_info.head())  # first rows of the new frame

Team info preview:   Team_size_class  human_members_count
0           Small                    2
1           Small                    2
2           Large                    7
3           Large                  234
4           Large                   38


In [70]:
# 11. Rename these two columns in the new DataFrame.
renamed_columns = {
    "Team_size_class": "TeamSizeClass",
    "human_members_count": "HumanMembers",
}
# rename returns a new frame, which is bound back to `team_info`
team_info = team_info.rename(renamed_columns, axis="columns")
print("\nRenamed team info preview:\n", team_info.head())  # confirm the new headers


Renamed team info preview:
   TeamSizeClass  HumanMembers
0         Small             2
1         Small             2
2         Large             7
3         Large           234
4         Large            38
