In [54]:
import json
import math
import os
import shutil
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from scipy.optimize import curve_fit

In [56]:
# List the per-repository directories of each group, leaving out the
# ".DS_Store" entry when it is present.
f = "../outputs/llm"
llm_dirs = [entry for entry in os.listdir(f) if entry != ".DS_Store"]
f = "../outputs/control"
control_dirs = [entry for entry in os.listdir(f) if entry != ".DS_Store"]


In [57]:
# Directory names that are present in both the control and the llm groups.
overlap = set(control_dirs).intersection(llm_dirs)

In [58]:
# Display the directory names shared by both groups (an empty set means no overlap).
overlap

set()

In [67]:
# Classify directories that exist under both llm and control. Most of the
# overlap exists because the term "data-science" was initially used when
# querying for LLM repositories.
# NOTE: ParseRepo is defined in a later cell; that cell must be run first.

control = []
llm = []
for repo in overlap:
    repoData = ParseRepo(repo)
    repo_name = repoData.name.lower()
    # Both words must be present in the name. The earlier form
    # `"data" and "science" in name` only tested for "science", because the
    # non-empty literal "data" is always truthy.
    if "data" in repo_name and "science" in repo_name:
        control.append(repo)
    # Any occurrence of a tracked keyword in the README marks the repo as LLM.
    elif sum(repoData.parseReadMe().values()) > 0:
        llm.append(repo)
    else:
        control.append(repo)


# Destructive step: remove each overlapping repo from the group it does NOT
# belong to, so it remains in exactly one output tree.
for repo in control:
    shutil.rmtree("../outputs/llm/"+repo)

for repo in llm:
    shutil.rmtree("../outputs/control/"+repo)

In [81]:
class ParseRepo():
    """Read and analyse the files stored for a single repository.

    The repository directory is ``../outputs/<outputDir>/<repo>``; this class
    reads ``README.md``, ``info.json`` and ``forks.csv`` from it.

    Attributes set by ``__init__``:
        outputDir: ``"../outputs/" + outputDir``.
        name: ``repo`` with every ``"*"`` replaced by ``"/"``.
        path: directory the repository files are read from.
        info: parsed content of ``info.json``.
        forks: DataFrame loaded from ``forks.csv`` (see ``getForksData``).
    """

    # README keywords counted by parseReadMe(). The original list named "jax"
    # twice; the duplicate never changed the counts, so it is listed once.
    KEYWORDS = ("llm", "nlp", "jax", "stable diffusion", "unsupervised learning",
                "mxnet", "artificial intelligence")

    def __init__(self,repo, outputDir="llm"):
        """Load ``info.json`` and ``forks.csv`` for ``repo``.

        Args:
            repo: directory name of the repository inside the output folder.
            outputDir: sub-folder of ``../outputs`` to read from.

        Raises:
            FileNotFoundError: if ``info.json`` or ``forks.csv`` is missing.
        """
        self.outputDir = "../outputs/"+outputDir
        self.name = repo.replace("*", "/")
        self.path = os.path.join(self.outputDir, repo)
        self.info = self.readGithubInfo()
        self.forks = self.getForksData()

    def getReadMe(self):
        """Return the lower-cased text of the repository's ``README.md``."""
        # Undecodable bytes are replaced rather than raised so that a README
        # with a stray non-UTF-8 byte can still be keyword-counted.
        with open(f"{self.path}/README.md", "r", encoding="utf-8", errors="replace") as file:
            return file.read().lower()

    def parseReadMe(self):
        """Count how often each tracked keyword occurs in the README.

        Returns:
            dict mapping keyword -> number of non-overlapping occurrences in
            the lower-cased README text. Also stores the keyword list on
            ``self.keywords``.
        """
        self.keywords = list(self.KEYWORDS)
        readme = self.getReadMe()
        return {keyword: readme.count(keyword) for keyword in self.keywords}

    def readGithubInfo(self):
        """Return the parsed JSON content of ``info.json``."""
        with open(f"{self.path}/info.json", 'r', encoding="utf-8") as f:
            return json.load(f)

    def getGithubTopics(self):
        """Return ``info["data"]["topics"]`` from the already loaded info."""
        # self.info was read in __init__, so the file is not read again.
        return self.info["data"]["topics"]

    def getForksData(self):
        """Load ``forks.csv`` sorted chronologically by ``created_at``.

        Returns:
            DataFrame whose ``created_at`` column is a UTC datetime column.
        """
        df = pd.read_csv(self.path+"/forks.csv", low_memory=False)
        # Parse before sorting so the order is chronological, not lexical.
        # utc=True keeps the column comparable with the UTC cutoff used in
        # getForksAfterDate().
        df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
        return df.sort_values("created_at")

    def getCumData(self, df):
        """Return the cumulative fork count per calendar month.

        Months without any fork between the first and the last observed month
        are included and carry the running total forward. The input frame is
        not modified.

        Args:
            df: frame with a datetime ``created_at`` column and an ``id`` column.

        Returns:
            DataFrame with columns ``"year-mo"`` (monthly periods) and
            ``"count"`` (cumulative number of forks up to and including that
            month), on a 0..n-1 index.
        """
        months = df["created_at"].dt.to_period("M")
        monthly = df.groupby(months)["id"].count()
        if monthly.empty:
            return pd.DataFrame({"year-mo": pd.PeriodIndex([], freq="M"),
                                 "count": np.array([], dtype="int64")})
        monthly.index = pd.PeriodIndex(monthly.index, freq="M")
        all_months = pd.period_range(monthly.index.min(), monthly.index.max(), freq="M")
        # Fill gaps with zero NEW forks and accumulate exactly once. The
        # previous version accumulated an already cumulative column a second
        # time, which inflated the totals.
        cumulative = monthly.reindex(all_months, fill_value=0).cumsum()
        return pd.DataFrame({"year-mo": all_months, "count": cumulative.to_numpy()})

    def graphForksOverTime(self, forksCum):
        """Plot cumulative forks per month together with the fitted curve.

        Args:
            forksCum: frame with ``"year-mo"`` and ``"count"`` columns, e.g.
                the result of ``getCumData``. It is not modified.
        """
        coefs, y_fit = self.getCurveFit(forksCum)
        months = forksCum["year-mo"]
        # Convert to timestamps for plotting; periods (as returned by
        # getCumData) need to_timestamp(), anything else goes through
        # to_datetime().
        if isinstance(months.dtype, pd.PeriodDtype):
            months = months.dt.to_timestamp()
        else:
            months = pd.to_datetime(months)
        plot_data = forksCum.assign(**{"year-mo": months})
        sns.lineplot(data=plot_data, x="year-mo", y="count")
        plt.plot(plot_data["year-mo"],y_fit)
        plt.show()

    def getForksAfterDate(self,df, cutoff="2020-06-11"):
        """Return the rows of ``df`` created on or after ``cutoff``.

        Args:
            df: frame with a timezone-aware ``created_at`` column.
            cutoff: anything ``pd.Timestamp`` accepts; naive values are
                interpreted as UTC.
        """
        cutoff_date = pd.Timestamp(cutoff)
        if cutoff_date.tzinfo is None:
            cutoff_date = cutoff_date.tz_localize("UTC")
        else:
            cutoff_date = cutoff_date.tz_convert("UTC")
        return df[df["created_at"] >= cutoff_date]

    def getPercentChange(self, df):
        """Return the period-over-period growth ratio of ``count``.

        Despite its name the ``"Percent Change"`` column holds the ratio
        ``count[i] / count[i-1]`` (1.0 means no change); the first row is NaN.
        A zero previous count yields ``inf``/NaN instead of raising.

        Side effect: the ``"Percent Change"`` column is also added to ``df``.

        Returns:
            DataFrame with columns ``"year-mo"`` and ``"Percent Change"``.
        """
        counts = df["count"]
        df["Percent Change"] = counts / counts.shift(1)
        return df.loc[:,["year-mo","Percent Change"]]

    def curve_func(self,x, a, b,c):
        """Quadratic model ``a*x + b*x**2 + c`` used for the curve fit."""
        return a * x + b*x**2 + c

    def getCurveFit(self, df):
        """Fit ``curve_func`` to the ``count`` column against the row index.

        Args:
            df: frame with a numeric index (used as x) and a ``count`` column.

        Returns:
            Tuple ``(coefs, y_fit)``: the fitted ``(a, b, c)`` parameters and
            the model evaluated at every x.
        """
        x = df.index.values
        y = df['count'].to_numpy(dtype=float)
        coefs, _ = curve_fit(self.curve_func, x, y)
        y_fit = self.curve_func(x, *coefs)
        return coefs, y_fit
        
    

In [None]:
class Analysis:
    """Aggregate fork timestamps of the LLM and control groups and compare them.

    ``compileData`` builds ``llm.csv`` / ``control.csv`` from the per-repository
    data; ``compare`` reads those files back, buckets the forks by week and
    plots both groups.
    """

    def getCumData(self, df):
        """Return weekly fork counts and their running total.

        The first and the last weekly bucket are dropped from the result; the
        running total is computed before dropping, so it still includes the
        first week. The input frame is not modified.

        Args:
            df: frame with a datetime ``created_at`` column.

        Returns:
            DataFrame with columns ``"date"`` (start of the week), ``"forks"``
            (forks created in that week) and ``"count"`` (cumulative forks).
            Empty when fewer than three weekly buckets exist.
        """
        weeks = df["created_at"].dt.to_period('W')
        weekly = df.groupby(weeks).size()
        data = pd.DataFrame({
            "date": pd.PeriodIndex(weekly.index, freq="W").start_time,
            "forks": weekly.to_numpy(),
        })
        data["count"] = data["forks"].cumsum()
        # Positional slicing drops the first and last bucket and also works
        # when there are fewer than two buckets.
        return data.iloc[1:-1].copy()

    def getPercentChange(self, df):
        """Return the week-over-week growth ratio of ``count``.

        Despite its name the ``"Percent Change"`` column holds the ratio
        ``count[i] / count[i-1]`` (1.0 means no change); the first row is NaN.
        A zero previous count yields ``inf``/NaN instead of raising.

        Side effect: the ``"Percent Change"`` column is also added to ``df``.

        Returns:
            DataFrame with columns ``"date"`` and ``"Percent Change"``.
        """
        counts = df["count"]
        df["Percent Change"] = counts / counts.shift(1)
        return df.loc[:,["date","Percent Change"]]

    def graphForksOverTime(self, llmForks, controlForks, y):
        """Plot column ``y`` over ``date`` for both groups on one figure."""
        sns.lineplot(data=llmForks, x="date", y=y, label="LLM")
        sns.lineplot(data=controlForks, x="date", y=y, label="control")
        plt.show()

    def _collectForkDates(self, repos, outputDir):
        """Return the ``created_at`` column of every repo in ``repos`` as one frame."""
        frames = []
        for count, repo in enumerate(repos, start=1):
            if count % 100==0: print(count)
            frames.append(ParseRepo(repo, outputDir).forks[["created_at"]])
        if not frames:
            return pd.DataFrame(columns=["created_at"])
        # Concatenate once; concatenating inside the loop copies the growing
        # frame on every iteration.
        return pd.concat(frames, axis=0, ignore_index=True)

    def compileData(self):
        """Collect fork timestamps of every repo and cache them as CSV.

        Reads the notebook-level ``llm_dirs`` and ``control_dirs`` lists, sets
        ``self.LLMForks`` / ``self.ControlForks`` and writes ``llm.csv`` and
        ``control.csv`` to the working directory.
        """
        self.LLMForks = self._collectForkDates(llm_dirs, "llm")
        self.LLMForks[["created_at"]].to_csv("llm.csv")
        print("finished LLM")

        self.ControlForks = self._collectForkDates(control_dirs, "control")
        self.ControlForks[["created_at"]].to_csv("control.csv")
        print("finished Control")

    @staticmethod
    def _asUtc(value):
        """Return ``value`` as a UTC ``pd.Timestamp`` (naive input is taken as UTC)."""
        stamp = pd.Timestamp(value)
        return stamp.tz_localize("UTC") if stamp.tzinfo is None else stamp.tz_convert("UTC")

    def getForksAfterDate(self,df, cutoff="2010-01-01", stop="2023-02-20"):
        """Return the rows with ``cutoff <= created_at < stop``.

        Args:
            df: frame with a timezone-aware ``created_at`` column.
            cutoff: inclusive lower bound; naive values are taken as UTC.
            stop: exclusive upper bound; naive values are taken as UTC.
        """
        cutoff_date = self._asUtc(cutoff)
        stop_date = self._asUtc(stop)
        return df[(df["created_at"] >= cutoff_date) & (df["created_at"] < stop_date)]

    def compare(self):
        """Load the cached CSVs, aggregate per week and plot both groups.

        Sets ``LLMForks``, ``ControlForks``, ``LLMForksCum``, ``ControlForksCum``,
        ``percentChangeLLM`` and ``percentChangeControl`` on the instance.

        Raises:
            FileNotFoundError: if ``llm.csv`` or ``control.csv`` does not exist
                (they are written by ``compileData``).
        """
        self.LLMForks = pd.read_csv("llm.csv",index_col=0)
        self.ControlForks = pd.read_csv("control.csv",index_col=0)
        # utc=True keeps the column comparable with the UTC bounds used in
        # getForksAfterDate().
        self.LLMForks["created_at"] = pd.to_datetime(self.LLMForks["created_at"], utc=True)
        self.ControlForks["created_at"] = pd.to_datetime(self.ControlForks["created_at"], utc=True)

        self.LLMForks = self.getForksAfterDate(self.LLMForks)
        self.ControlForks = self.getForksAfterDate(self.ControlForks)

        self.LLMForksCum = self.getCumData(self.LLMForks)
        self.ControlForksCum = self.getCumData(self.ControlForks)
        self.graphForksOverTime(self.LLMForksCum,self.ControlForksCum, "forks")
        self.graphForksOverTime(self.LLMForksCum,self.ControlForksCum, "count")

        self.percentChangeLLM = self.getPercentChange(self.LLMForksCum)
        self.percentChangeControl = self.getPercentChange(self.ControlForksCum)
        self.graphForksOverTime(self.percentChangeLLM,self.percentChangeControl, "Percent Change")
        



        
analysis = Analysis()
# compileData() is expensive, so it only runs when the cached CSVs that
# compare() reads are missing; otherwise the cached files are reused.
if not (os.path.exists("llm.csv") and os.path.exists("control.csv")):
    analysis.compileData()
analysis.compare()
analysis.LLMForksCum

In [None]:
# Reuse the `analysis` instance that compare() populated in the previous cell.
# A freshly constructed Analysis() has no LLMForks / ControlForks attributes,
# so re-instantiating it here raised AttributeError.

LLMForks = analysis.LLMForks
ControlForks = analysis.ControlForks

In [128]:
# Write only the "created_at" column (plus the index) of the LLM forks to disk.
LLMForks.to_csv("llm-200.csv", columns=["created_at"])

In [142]:
# Show the combined LLM fork table.
# NOTE(review): the saved output below lists full fork metadata columns
# (id, node_id, ...), whereas compileData()/compare() keep only "created_at" —
# it looks like it came from an earlier kernel state; confirm before relying on it.
LLMForks

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,allow_forking,is_template,web_commit_signoff_required,topics,visibility,forks,open_issues,watchers,default_branch,permissions
0,258778400,MDEwOlJlcG9zaXRvcnkyNTg3Nzg0MDA=,pytorch-YOLOv4,GZQ0723/pytorch-YOLOv4,False,[object Object],https://github.com/GZQ0723/pytorch-YOLOv4,,True,https://api.github.com/repos/GZQ0723/pytorch-Y...,...,True,False,False,,public,0,0,0,master,[object Object]
1,259333317,MDEwOlJlcG9zaXRvcnkyNTkzMzMzMTc=,pytorch-YOLOv4,QiaoSiBo/pytorch-YOLOv4,False,[object Object],https://github.com/QiaoSiBo/pytorch-YOLOv4,Minimal PyTorch implementation of YOLOv4,True,https://api.github.com/repos/QiaoSiBo/pytorch-...,...,True,False,False,,public,0,0,0,master,[object Object]
2,259335055,MDEwOlJlcG9zaXRvcnkyNTkzMzUwNTU=,pytorch-YOLOv4,yuhonghong95721/pytorch-YOLOv4,False,[object Object],https://github.com/yuhonghong95721/pytorch-YOLOv4,Minimal PyTorch implementation of YOLOv4,True,https://api.github.com/repos/yuhonghong95721/p...,...,True,False,False,,public,0,0,0,master,[object Object]
3,259352989,MDEwOlJlcG9zaXRvcnkyNTkzNTI5ODk=,pytorch-YOLOv4,Riwaly/pytorch-YOLOv4,False,[object Object],https://github.com/Riwaly/pytorch-YOLOv4,Minimal PyTorch implementation of YOLOv4,True,https://api.github.com/repos/Riwaly/pytorch-YO...,...,True,False,False,,public,0,0,0,master,[object Object]
4,259353365,MDEwOlJlcG9zaXRvcnkyNTkzNTMzNjU=,pytorch-YOLOv4,123wk45678/pytorch-YOLOv4,False,[object Object],https://github.com/123wk45678/pytorch-YOLOv4,Minimal PyTorch implementation of YOLOv4,True,https://api.github.com/repos/123wk45678/pytorc...,...,True,False,False,,public,0,0,0,master,[object Object]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582471,619119497,R_kgDOJOcDiQ,500-AI-Machine-learning-Deep-learning-Computer...,Collins-Kiptoo/500-AI-Machine-learning-Deep-le...,False,[object Object],https://github.com/Collins-Kiptoo/500-AI-Machi...,500 AI Machine learning Deep learning Computer...,True,https://api.github.com/repos/Collins-Kiptoo/50...,...,True,False,False,,public,0,0,0,main,[object Object]
582472,619298603,R_kgDOJOm_Kw,500-AI-Machine-learning-Deep-learning-Computer...,euhurias/500-AI-Machine-learning-Deep-learning...,False,[object Object],https://github.com/euhurias/500-AI-Machine-lea...,500 AI Machine learning Deep learning Computer...,True,https://api.github.com/repos/euhurias/500-AI-M...,...,True,False,False,,public,0,0,0,main,[object Object]
582473,619341868,R_kgDOJOpoLA,500-AI-Machine-learning-Deep-learning-Computer...,TheWrightDev916/500-AI-Machine-learning-Deep-l...,False,[object Object],https://github.com/TheWrightDev916/500-AI-Mach...,500 AI Machine learning Deep learning Computer...,True,https://api.github.com/repos/TheWrightDev916/5...,...,True,False,False,,public,0,0,0,main,[object Object]
582474,619802170,R_kgDOJPFuOg,500-AI-Machine-learning-Deep-learning-Computer...,moaztayea/500-AI-Machine-learning-Deep-learnin...,False,[object Object],https://github.com/moaztayea/500-AI-Machine-le...,500 AI Machine learning Deep learning Computer...,True,https://api.github.com/repos/moaztayea/500-AI-...,...,True,False,False,,public,0,0,0,main,[object Object]
