In [5]:
# Load variables from a local .env file before the client is created.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
# Shared OpenAI client used by the embedding and chat-completion helpers below.
client = OpenAI()


In [6]:
# Embedding configuration.
# NOTE(review): in the cells visible here these three names are only defined,
# never read — get_embedding() carries its own hard-coded default model name.
# Confirm whether they are meant to be wired in (e.g. for token truncation).
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

def get_embedding(text, model="text-embedding-3-small", **kwargs):
    """Return the embedding vector for a single piece of text.

    Args:
        text: String to embed. Newline characters are turned into spaces
            before the request is sent.
        model: Name of the embedding model passed to the API.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.embeddings.create``.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Newlines can negatively affect embedding quality, so flatten them.
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(model=model, input=[single_line], **kwargs)
    first_item = result.data[0]
    return first_item.embedding

In [23]:
def get_gpt_overview(project_name):
    """Ask the chat model for an overview of one open-source project.

    Sends a two-message chat completion request to ``gpt-4o-mini``:
    a ``developer`` message holding a categorisation framework (function or
    domain, licensing, governance, maturity, community size, technical stack)
    and a ``user`` message that names the project.

    Args:
        project_name: Value interpolated into the user message. The caller in
            this notebook passes entries of ``projects``.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            # Instruction message: the framework the overview should follow.
            {
                "role": "developer", 
                "content": """
                    When giving an overview of open-source projects, it often helps to categorize them in a way that captures their purpose, governance, maturity, and ecosystem. Below are some common categories you might use:

    1. By Function or Domain
    Operating Systems: Linux distributions, BSD variants, etc.
    Developer Tools & Frameworks: Compilers, build tools, programming libraries, web frameworks, etc.
    Infrastructure & Cloud: Container runtimes (Docker, containerd), orchestration tools (Kubernetes), server management (Ansible, Terraform).
    Data & Analytics: Databases (PostgreSQL, MySQL), data processing frameworks (Apache Spark, Hadoop), machine learning libraries (TensorFlow, PyTorch).
    Desktop & End-User Applications: Web browsers (Firefox), office suites (LibreOffice), multimedia tools (VLC), etc.
    Security Tools: Encryption libraries (OpenSSL), vulnerability scanners (OpenVAS), intrusion detection systems (Snort).
    This helps you group projects by what they do and who might use them.

    2. By Licensing Model
    Permissive: MIT, Apache 2.0, BSD—allow broad usage and fewer redistribution restrictions.
    Copyleft: GPL, AGPL, LGPL—require derivative works to remain open-source under the same license.
    Mixed / Multi-License: Projects offering both open-source and commercial licenses (e.g., “open-core” model).
    License choice can affect community building, commercial adoption, and contribution models.

    3. By Governance & Sponsorship
    Community-Driven: Maintained by volunteers and community contributors (e.g., GIMP, Inkscape).
    Vendor-Driven: Backed or originated by a single company (e.g., Elastic, HashiCorp tools).
    Foundation-Governed: Operated under a formal open-source foundation (e.g., Apache, Eclipse, Linux Foundation).
    Governance type impacts how decisions are made, how contributions are processed, and how stable the project’s future is.

    4. By Project Maturity & Lifecycle
    Early-Stage: Experimental or alpha-stage projects with limited functionality or unstable APIs.
    Growth-Stage: Active community, regular releases, expanding feature set.
    Mature / Established: Widely used, stable APIs, well-documented processes.
    Maintenance / Legacy: Low feature development, mainly security patches or minimal updates.
    Archived / Dormant: No longer actively maintained or supported.
    This categorization shows how actively a project is developed and whether it’s ready for production usage.

    5. By Community Size & Activity
    Small / Niche: One or a few core maintainers, specialized focus.
    Medium: Moderate contributor base, stable user community, some commercial interest.
    Large / Widely Adopted: Hundreds (or thousands) of contributors, global user base, possibly enterprise-level support.
    Understanding community size helps gauge sustainability and support prospects.

    6. By Technical Stack or Ecosystem
    Language-Based Ecosystems: Python libraries vs. Java libraries vs. Node.js packages, etc.
    Platform-Focused: Android, iOS, or cross-platform.
    Cloud & DevOps Toolchains: AWS ecosystem, Kubernetes ecosystem, CI/CD pipelines, etc.
    Projects often cluster around particular languages or platforms—important for developers choosing compatible solutions.

    Putting It All Together
    When giving an overview, you can mix and match these categories to provide a multi-dimensional perspective. For example, you might focus on function/domain first (say, “big data” vs. “developer tooling”), then look at maturity (“active and well-supported vs. brand new beta”), and discuss governance (“foundation-backed vs. vendor-driven”). This framework helps audiences quickly grasp a project’s purpose, community structure, and viability in one cohesive overview.

    based on the above contents, give an overview of projects asked by the user.
                """
            },
            # The actual request, naming the project to describe.
            {
                "role": "user", 
                "content": f"give an overview of '{project_name}'"
            }
        ],
        
    )

    # Only the first choice is used.
    return response.choices[0].message.content



In [28]:
def get_project_overview(projects):
    """Build one overview record per project.

    Args:
        projects: Iterable of project identifiers; each is passed to
            ``get_gpt_overview``.

    Returns:
        A list of dicts with the keys ``'project_name'`` and ``'overview'``,
        in the same order as ``projects``.
    """
    return [
        {'project_name': name, 'overview': get_gpt_overview(name)}
        for name in projects
    ]

In [15]:
import pandas as pd
from pathlib import Path

# Base directory for this part's input CSVs and processed outputs (relative to the working directory).
part_path = Path("part-2")

In [17]:
# Load the train and test splits from the data directory.
df_train = pd.read_csv(part_path.joinpath("train.csv"))
df_test = pd.read_csv(part_path.joinpath("test.csv"))

In [19]:
# Every distinct project identifier that appears on either side of a pair,
# in train or test, in first-seen order (train a, train b, test a, test b).
projects = pd.concat(
    [
        frame[column]
        for frame in (df_train, df_test)
        for column in ('project_a', 'project_b')
    ]
).unique().tolist()

In [None]:
# Display the full de-duplicated project list (not truncated).
projects

In [32]:
# Generate an overview for every project: one chat-completion request per entry in `projects`.
overviews = get_project_overview(projects)

In [34]:
import json
# Persist the generated overviews (project name + overview text) as
# pretty-printed JSON in the working directory.
with open('overviews.json', 'w') as overviews_file:
    overviews_file.write(json.dumps(overviews, indent=2))

In [47]:
def get_overview_embeddings(projects):
    """Attach an embedding of each record's overview text.

    Each record in ``projects`` is modified in place: the result of
    ``get_embedding(record['overview'])`` is stored under the key
    ``'embedding'``.

    Args:
        projects: List of dict records that each have an ``'overview'`` key.

    Returns:
        The very same ``projects`` object that was passed in.
    """
    for record in projects:
        vector = get_embedding(record['overview'])
        record['embedding'] = vector
    return projects

In [51]:
# get_overview_embeddings mutates and returns the list it is given, so `emb`
# and `overviews` refer to the same list of records after this call.
emb = get_overview_embeddings(overviews)

In [54]:
# One row per project record; columns come from the record keys
# (project_name, overview, embedding).
df_embeddings = pd.DataFrame(emb)

In [60]:
# Spot-check the last few rows of the embeddings table.
df_embeddings.tail()

Unnamed: 0,project_name,overview,embedding
112,https://github.com/grandinetech/grandine,Based on the GitHub repository for Grandine (h...,"[-0.029312385246157646, 0.04660499095916748, 0..."
113,https://github.com/quic-go/quic-go,Certainly! Here's an overview of the `quic-go`...,"[-0.010085484944283962, 0.019289527088403702, ..."
114,https://github.com/ethereumjs/ethereumjs-monorepo,The EthereumJS Monorepo is a collection of Jav...,"[-0.01789320446550846, 0.011705568060278893, 0..."
115,https://github.com/alexeyraspopov/picocolors,**Overview of picocolors**\n\n1. **By Function...,"[-0.008400245569646358, -0.010350983589887619,..."
116,https://github.com/ethereum/solc-js,Certainly! Here's an overview of the `solc-js`...,"[-0.01505060214549303, -0.006951489020138979, ..."


In [57]:
# Write the per-project overview/embedding table to
# <part_path>/processed/project-embeddings.csv.
# Nothing earlier in the notebook creates the `processed` directory, and
# DataFrame.to_csv does not create missing parent directories, so make sure
# it exists first (no-op when it is already there).
project_embeddings_csv = part_path / 'processed/project-embeddings.csv'
project_embeddings_csv.parent.mkdir(parents=True, exist_ok=True)
df_embeddings.to_csv(project_embeddings_csv, index=False)

In [58]:
# Re-read the raw train and test splits so the merge cells below start from
# unmodified frames.
df_train = pd.read_csv(part_path.joinpath('train.csv'))
df_test = pd.read_csv(part_path.joinpath('test.csv'))

In [62]:
# Attach the overview/embedding columns for both sides of every training pair.
# Left joins keep every training row; the join key copied in from
# df_embeddings ('project_name') is dropped after each merge.
# Gotcha: during the first merge no column names collide, so the '_a' suffix
# is never applied — project_a's columns end up as plain 'overview' /
# 'embedding', while project_b's become 'overview_b' / 'embedding_b'.
# The cell rebinds df_train, so run it once per fresh read of train.csv.
df_train = (
    df_train
    .merge(
        df_embeddings,
        how='left',
        left_on='project_a',
        right_on='project_name',
        suffixes=('', '_a'),
    )
    .drop(columns=['project_name'])
    .merge(
        df_embeddings,
        how='left',
        left_on='project_b',
        right_on='project_name',
        suffixes=('', '_b'),
    )
    .drop(columns=['project_name'])
)


In [63]:
# Attach the overview/embedding columns for both sides of every test pair.
# Left joins keep every test row; the join key copied in from df_embeddings
# ('project_name') is dropped after each merge.
# Gotcha: during the first merge no column names collide, so the '_a' suffix
# is never applied — project_a's columns end up as plain 'overview' /
# 'embedding', while project_b's become 'overview_b' / 'embedding_b'.
# The cell rebinds df_test, so run it once per fresh read of test.csv.
df_test = (
    df_test
    .merge(
        df_embeddings,
        how='left',
        left_on='project_a',
        right_on='project_name',
        suffixes=('', '_a'),
    )
    .drop(columns=['project_name'])
    .merge(
        df_embeddings,
        how='left',
        left_on='project_b',
        right_on='project_name',
        suffixes=('', '_b'),
    )
    .drop(columns=['project_name'])
)

In [64]:
# Spot-check the merged training frame: the last row should carry the
# overview/embedding columns for both projects of the pair.
df_train.tail(1)

Unnamed: 0,id,project_a,project_b,weight_a,weight_b,total_amount_usd,funder,quarter,overview,embedding,overview_b,embedding_b
20957,20883,https://github.com/import-js/eslint-plugin-import,https://github.com/webreflection/flatted,0.387597,0.612403,1161,opencollective,2023-10,### Overview of `eslint-plugin-import`\n\n**1....,"[-0.008927255868911743, 0.021994858980178833, ...",**Overview of the Project: Flatted**\n\n### 1....,"[-0.02596249058842659, 0.006722309160977602, -..."


In [65]:
# Write the merged train/test frames to <part_path>/processed/.
# Nothing earlier in the notebook creates the `processed` directory, and
# DataFrame.to_csv does not create missing parent directories, so make sure
# it exists first (no-op when it is already there).
processed_dir = part_path / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(processed_dir / 'train-embeddings-only.csv', index=False)
df_test.to_csv(processed_dir / 'test-embeddings-only.csv', index=False)
