In [1]:
import requests
import csv
import pandas
import json

In [2]:
# GitLab API endpoint to fetch public projects
import os

url = 'https://gitlab.com/api/v4/projects'

# Credentials must never be stored in the notebook itself. The personal access
# token is read from the GITLAB_TOKEN environment variable instead; any token
# that was previously committed in this cell should be treated as leaked and
# revoked. Listing public projects also works anonymously, so when the variable
# is unset the requests are simply sent without authentication.
_gitlab_token = os.environ.get('GITLAB_TOKEN')

# GitLab expects personal access tokens in the PRIVATE-TOKEN header
# (or as "Authorization: Bearer <token>"), not as a bare Authorization value.
headers = {'PRIVATE-TOKEN': _gitlab_token} if _gitlab_token else {}

params = {
    'visibility': 'public',
    'per_page': 100,  # Fetches 100 projects per page
    'order_by': 'last_activity_at',  # Sort by projects that are recently updated or active
    'sort': 'desc',  # Descending order
}

In [3]:
import requests

# Page through the public-project listing and collect the raw project records.
MAX_PAGES = 100       # 100 pages x 100 projects per page = up to 10,000 projects
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

projects = []
for page in range(1, MAX_PAGES + 1):
    # Build the query for each request instead of writing 'page' into the shared
    # `params` dict, so re-running other cells never sees a stale page number.
    response = requests.get(
        url,
        headers=headers,
        params={**params, 'page': page},
        timeout=REQUEST_TIMEOUT,
    )
    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        break

    page_projects = response.json()
    if not page_projects:
        # An empty page means the listing is exhausted; stop requesting more.
        break
    projects.extend(page_projects)

# After fetching, sort projects by 'star_count' in descending order to approximate popularity
projects_sorted_by_stars = sorted(projects, key=lambda x: x.get('star_count', 0), reverse=True)

In [6]:
import csv
# Export one CSV row per fetched project to `csv_filename`.
csv_filename = 'repository_features.csv'

feature_names = ['id','contributorsName','contributorsAccount','contributorsPhoto','contributorsRole','repositoryName','repositoryFullName', 'repoUrl','stars', 'forks', 'lastUpdate', 'size', 'topics', 'readmeURL','descriptions','language']

# An explicit UTF-8 encoding keeps the export independent of the platform's
# default locale (project names and descriptions routinely contain non-ASCII text).
with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(feature_names)

    for repo in projects:
        namespace = repo['namespace']
        record = {
            'id': repo['id'],
            'contributorsName': namespace['name'],
            'contributorsAccount': namespace['web_url'],
            'contributorsPhoto': namespace['avatar_url'],
            'contributorsRole': 'owner',
            'repositoryName': repo['name'],
            'repositoryFullName': repo['name_with_namespace'],
            'repoUrl': repo['web_url'],
            'stars': repo['star_count'],
            'forks': repo.get('forks_count', 0),
            'lastUpdate': repo['last_activity_at'],
            'size': 0,  # placeholder: size is not taken from the project record
            'topics': ', '.join(repo['topics']) if 'topics' in repo else '',
            # Default to an empty string when the key is absent. Previously the
            # variable was only assigned when 'readme_url' existed, which raised
            # NameError on the first such project or silently reused the value
            # of the previous project.
            'readmeURL': repo.get('readme_url', ''),
            'descriptions': repo['description'],
            'language': 'none',  # placeholder: language is not taken from the project record
        }
        # Emit the values in header order so columns can never drift from `feature_names`.
        writer.writerow([record[column] for column in feature_names])

print(f"Data for {len(projects)} repositories saved to {csv_filename}")


Data for 10000 repositories saved to repository_features.csv


In [8]:
import requests

def fetch_unique_commit_authors_gitlab(repo_id_or_path, per_page=100, max_pages=10, timeout=30):
    """
    Fetches unique commit authors (name and email) from a given GitLab repository.
    If the author's email is a no-reply GitLab email (or is missing), it saves
    "null" as the email instead.

    The commits endpoint is paginated, so the function walks the pages until a
    short or empty page is returned, or until `max_pages` pages have been read.
    Requests are sent with the module-level `headers` dict.

    Parameters:
    - repo_id_or_path: str or int - The ID or URL-encoded path of the repository on GitLab.
    - per_page: int - Number of commits requested per page.
    - max_pages: int - Upper bound on the number of pages fetched per repository,
      so that very large histories cannot trigger an unbounded number of requests.
    - timeout: float - Seconds to wait for each HTTP response.

    Returns:
    A set of tuples, each containing the name and email (or "null" for no-reply emails) of a commit author.
    If a request fails, the status code is printed and the authors collected so
    far (an empty set when the first request fails) are returned.
    """
    api_url = f'https://gitlab.com/api/v4/projects/{repo_id_or_path}/repository/commits'
    authors = set()  # To store unique (name, email) pairs

    for page in range(1, max_pages + 1):
        response = requests.get(
            api_url,
            headers=headers,
            params={'per_page': per_page, 'page': page},
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")
            break

        commits = response.json()
        if not commits:
            # Past the last page of the commit history.
            break

        for commit in commits:
            name = commit['author_name']
            email = commit['author_email']
            # Treat a missing email like a no-reply address so that callers can
            # always handle the email as a string.
            if email is None or email.endswith('.noreply.gitlab.com'):
                email = "null"
            authors.add((name, email))

        if len(commits) < per_page:
            # A short page is the last one; skip the extra empty request.
            break

    return authors


In [9]:
# Example: look up the commit authors of a single project, addressed here by its
# numeric GitLab project ID. The returned set of (name, email) tuples is the
# value of the cell, so the notebook displays it as the cell output.
fetch_unique_commit_authors_gitlab(56143151)

{('Gmail aja', 'ajalahya738@gmail.com')}

In [None]:
# Re-read the exported project CSV and write a copy with two extra columns that
# list the commit authors ('contributors') and their emails ('emails').
csv_filename = 'repository_features.csv'
output_csv_filename = 'repository_features_with_contributors_and_commits.csv'

# newline='' lets the csv module handle line endings inside quoted fields, and
# the explicit UTF-8 encoding avoids depending on the platform's default locale.
with open(csv_filename, 'r', newline='', encoding='utf-8') as input_csv_file:
    csv_reader = csv.DictReader(input_csv_file)
    fieldnames = csv_reader.fieldnames + ['contributors', 'emails']

    with open(output_csv_filename, 'w', newline='', encoding='utf-8') as output_csv_file:
        csv_writer = csv.DictWriter(output_csv_file, fieldnames=fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            project_id = row['id']
            # Sets have no stable iteration order across kernel sessions; sorting
            # makes the output reproducible while keeping each name aligned with
            # its email at the same position in both columns.
            authors = sorted(fetch_unique_commit_authors_gitlab(project_id))

            # Separating names and emails
            row['contributors'] = '; '.join(name for name, _ in authors)
            row['emails'] = '; '.join(email for _, email in authors)

            csv_writer.writerow(row)