In [1]:
import modelbit
# Log in to Modelbit; the returned client `mb` is what the later cells call
# `mb.deploy(...)` on.
mb = modelbit.login()

In [2]:
import os
from datasets import load_dataset

def load_ds_and_idx(idx, ds_folder, index_folder):
    """Load the ``idx``-th CSV dataset and attach the ``idx``-th FAISS index.

    Both folders are listed and sorted by file name, and the entry at position
    ``idx`` is taken from each, so the two folders are expected to sort in a
    matching order.

    Args:
        idx: Position (after sorting) of the dataset/index pair to load.
        ds_folder: Folder containing the CSV dataset files.
        index_folder: Folder containing the FAISS index files.

    Returns:
        The loaded dataset with the index attached to its 'embedding' column.
    """
    dataset_name = sorted(os.listdir(ds_folder))[idx]
    dataset = load_dataset('csv', data_files=f"{ds_folder}/{dataset_name}", split='train')

    index_name = sorted(os.listdir(index_folder))[idx]
    dataset.load_faiss_index('embedding', f"{index_folder}/{index_name}")
    return dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sentence_transformers import SentenceTransformer

# Module-level sentence-embedding model shared by the functions in this
# notebook (they all call `embedder.encode(...)`).
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def embed_documents(examples):
    """Encode ``examples['text']`` with the shared ``embedder``.

    Args:
        examples: Mapping with a 'text' entry to embed.

    Returns:
        A dict with a single key, 'embedding', holding the encoder output.
    """
    return {'embedding': embedder.encode(examples['text'])}

In [4]:
import numpy as np

def get_relevant_documents(query: str, knn: int, vectorDB):
    """Look up the ``knn`` nearest examples to ``query`` in ``vectorDB``.

    The query is embedded with the shared ``embedder``, converted to a float32
    array and searched against the 'embedding' index.

    Args:
        query: Text to search for.
        knn: Number of neighbours to request.
        vectorDB: Object exposing ``get_nearest_examples(index_name, vector, k=...)``.

    Returns:
        A one-element list wrapping the retrieved examples (scores are discarded).
    """
    query_vector = np.array(embedder.encode(query), dtype=np.float32)
    _scores, examples = vectorDB.get_nearest_examples("embedding", query_vector, k=knn)
    return [examples]

In [5]:
def _get_git(user_token: str):
    """List the repositories visible to the owner of ``user_token``.

    Args:
        user_token: GitHub access token used to authenticate.

    Returns:
        A list with one ``[clone_url, visibility, full_name]`` entry per
        repository, where ``visibility`` is 'private' or 'public'. Note that
        ``clone_url`` embeds the token itself, so the result must be treated
        as a secret.
    """
    from github import Github, Auth

    client = Github(auth=Auth.Token(user_token))
    return [
        [
            f"https://{user_token}@github.com/{repo.full_name}.git",
            'private' if repo.private else 'public',
            f"{repo.full_name}",
        ]
        for repo in client.get_user().get_repos()
    ]

In [6]:
def _get_slack_diff(user_id: str = None, slack_token: str = None):
    """Fetch the token owner's Slack DMs and merge them with saved messages.

    All direct-message channels visible to ``slack_token`` are read, and only
    messages written by the most frequent author are kept (that author is
    assumed to be the token owner). If a CSV whose name contains ``user_id``
    exists under ``./scores/user_slack_data/``, its messages are appended and
    rows are de-duplicated on 'timestamp' (freshly fetched rows win).

    Args:
        user_id: Identifier used to find the previously saved CSV. When empty
            or ``None`` no saved file is consulted.
        slack_token: Slack token used to build the ``WebClient``.

    Returns:
        A pandas DataFrame with the columns 'timestamp', 'user' and 'text'.
        Loading the saved file is best effort: on any failure the error is
        printed and only the freshly fetched messages are returned.
    """
    import slack_sdk
    import pandas as pd

    columns = ['timestamp', 'user', 'text']
    data_dir = "./scores/user_slack_data/"

    client = slack_sdk.WebClient(token=slack_token)
    dm_channels_response = client.conversations_list(types="im")

    rows = []
    for channel in dm_channels_response["channels"]:
        history_response = client.conversations_history(channel=channel["id"])
        for message in history_response["messages"]:
            try:
                rows.append([message["ts"], message["user"], message["text"]])
            except (KeyError, TypeError):
                # Entries lacking a text, user or ts field cannot be scored; skip them.
                continue

    # Passing `columns=` keeps the frame well-formed even when nothing was fetched
    # (assigning `.columns` on an empty frame raises a length-mismatch error).
    new_df = pd.DataFrame(rows, columns=columns)
    if not new_df.empty:
        self_user = new_df['user'].value_counts().idxmax()
        new_df = new_df[new_df.user == self_user]

    if not user_id:
        # Without an id there is no saved file to merge with ("" would match every file).
        return new_df

    try:
        matches = [name for name in os.listdir(data_dir) if user_id in name]
        if not matches:
            return new_df
        # Read timestamps as text: Slack's `ts` values are strings, and parsing them
        # as floats would stop drop_duplicates from matching old rows to new ones.
        old_df = pd.read_csv(f"{data_dir}{matches[0]}", dtype={'timestamp': str})
        # Keep old_df a DataFrame (pd.concat rejects plain lists) and restrict it to
        # the message columns so saved index/score columns do not leak into the result.
        messages = pd.concat([new_df, old_df[columns]], ignore_index=True).drop_duplicates('timestamp')
    except Exception as e:
        print(e)
        messages = new_df
    return messages

In [7]:
from langchain import LlamaCpp, PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import json

# Stream generated tokens to stdout. CallbackManager takes handler *instances*,
# so the handler class has to be instantiated here.
callbacks = CallbackManager([StreamingStdOutCallbackHandler()])

# Shared llama.cpp-backed model used by the scoring functions below. It is loaded
# once, when this cell runs, from a local GGUF file.
llm = LlamaCpp(
        model_path="./test_model/mistral-7b-instruct-v0.1.Q4_0.gguf",
        temperature=0.75,
        max_tokens=100,
        top_p=1,
        callback_manager=callbacks,
        verbose=True,
        n_ctx=32000,
        n_gpu_layers=100,
        n_batch=512,
        n_threads=1,
        seed=8855,
    )

def archetype_score(user_token: str, repo_name: str, verbose: bool = True):
        """Ask the LLM which "Jedi class" a user's activity in one repository matches.

        The function collects activity numbers for ``repo_name`` from GitHub,
        serialises them as JSON and runs them through the shared ``llm`` with a
        fixed prompt.

        Collected fields:
            repo_name: The repository's name.
            branch_name: Name of the last branch returned by GitHub, or ``None``
                when the repository has no branches.
            commit_count: Commits whose GitHub author login equals the token
                owner's login (only counted when a branch exists).
            pull_count: Number of pull requests returned by ``repo.get_pulls()``.
            pull_file_count: Sum of ``changed_files`` over those pull requests.

        Args:
            user_token: GitHub access token used to authenticate.
            repo_name: Full repository name passed to ``Github.get_repo``.
            verbose: When true, progress messages are printed.

        Returns:
            The text generated by the LLM, or the string ``"<repo_name> not found"``
            when the repository cannot be fetched.
        """
        from github import Github, Auth

        template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
        ###Instruction:
        You are an expert programming assistant who is also an expert in the Star Wars universe and all the main heroes and villains from all the movies and shows. You will tell the truth, even if the truth is that you don't know.
        Given a person's GitHub activity data in JSON format, you must determine what Jedi class does the user fit based on their GitHub activity data (the four classes in order from best to worst are Jedi Master, Jedi Knight, Jedi Apprentice, and Jedi Padawan).
        The data includes the following fields:
        -repo_name: this has the name of the GitHub repository that the user has worked on.
        -branch_name: this has the name of each branch in the GitHub repository that the user has worked on.
        -commit_count: this shows the number of commits the user has made in the respective branch.
        -pull_count: this shows the number of pulls the user has made in the respective branch.
        -pull_file_count: this shows the total number of files affected by the user's pulls made in the respective branch.

        ###Input:
        GitHub Activity Data:
        {data}

        ###Example Response:
        Jedi Knight, Anakin Skywalker, because you get a lot done as evidenced by the number of your commits.
        Jedi Master, Jocasta Nu, because you have large quantity of pull requests.
        Jedi Padawan, Qui Gon Jinn, because you have very few repositories in your activity history, but you have great potential to grow.

        ###Response:
        The Star Wars class and character that personifies this person is"""

        if verbose:
            print("Authenticating...")
        g = Github(auth=Auth.Token(user_token))
        user_login = g.get_user().login

        try:
            repo = g.get_repo(repo_name)
        except Exception:
            # Any lookup failure is reported to the caller as "not found";
            # unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
            if verbose:
                print(f"{repo_name} not found")
            return f"{repo_name} not found"

        if verbose:
            print('Looking at data from repo ', repo.name)
            print(f'Retrieving data from {repo.name}')

        # Track the branch name explicitly so a repository without branches does
        # not fail on an unbound loop variable later on.
        branch_name = None
        has_branch = False
        for branch in repo.get_branches():
            if len(branch.name) > 0:
                has_branch = True
            branch_name = branch.name

        commit_count = 0
        if has_branch:
            if verbose:
                print('Retrieving your commits...')
            for commit in repo.get_commits():
                # commit.author can be None; compare logins exactly instead of
                # substring-matching the author's repr.
                if getattr(commit.author, "login", None) == user_login:
                    commit_count += 1

        pull_count = 0
        pull_file_count = 0
        for pull in repo.get_pulls():
            # `+=` (not `=+`): accumulate the number of pulls and the files they touch.
            pull_count += 1
            pull_file_count += pull.changed_files

        data = [{
            "repo_name": repo.name,
            "branch_name": branch_name,
            "commit_count": commit_count,
            "pull_count": pull_count,
            "pull_file_count": pull_file_count,
        }]

        gitData = json.dumps(data)
        arch_prompt_template = PromptTemplate(input_variables=['data'], template=template)

        if verbose:
            print("Running chain")
        arch_chain = LLMChain(llm=llm, prompt=arch_prompt_template)
        archetype = arch_chain.run(gitData)

        return archetype

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ./test_model/mistral-7b-instruct-v0.1.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q4_0     [ 14336,  

In [None]:
# Deploy archetype_score through the Modelbit client.
# NOTE(review): unlike the other deploy calls in this notebook, no python_packages
# or extra_files are passed, although archetype_score uses the module-level `llm`
# (loaded from ./test_model/...) and imports `github` — confirm the deployment
# environment provides them.
mb.deploy(archetype_score)

In [15]:
from collections import Counter
from shutil import rmtree
import subprocess

def designpatterns_local_score(repo_url: str = "", verbose: bool = True):
    """Score a repository's top-level files against the design-pattern index.

    The repository is shallow-cloned into ``/tmp/curr_repo``. Every regular,
    text-decodable file in the repository root (sub-directories are not
    visited) is embedded and matched against its single nearest neighbour in
    the design-pattern dataset. The clone is removed before returning, also
    when an error occurs.

    Args:
        repo_url: URL handed to ``git clone``.
        verbose: When true, a progress line and the final report are printed.

    Returns:
        A dict with the keys:
            design_pattern: ``True`` when the mean score is above 0.75.
            repo_url: The URL that was scored.
            overall_score: Mean nearest-neighbour score, as a string.
            top_3_patterns / bot_3_patterns: ``(pattern, count)`` pairs for the
                most and least frequent patterns, or ``"nothing"`` when no file
                was scored.
            resource: Most frequently suggested resource among files scoring
                above 1.0, or ``"No resource"``.
            files: One entry per scored file (name, score, language, pattern
                and, for scores above 1.0, resource).
            occurance: Pattern -> number of files matched to it.
            averages: Pattern -> mean score of the files matched to it.

    Raises:
        FileNotFoundError: If the clone did not produce ``/tmp/curr_repo``.
    """
    vectorDB = load_dataset('csv', data_files="2_design_patterns_embedded_dataset.csv", split='train')
    vectorDB.load_faiss_index('embedding', '2_designpattern_index.faiss')

    repo_path = "/tmp/curr_repo"
    rmtree(repo_path, ignore_errors=True)
    files = {}
    try:
        # "--" stops git from treating a caller-supplied URL that starts with "-"
        # as a command-line option.
        subprocess.run(["git", "clone", "--depth=1", "--", repo_url, repo_path])
        for entry in os.listdir(repo_path):
            path = os.path.join(repo_path, entry)
            if not os.path.isfile(path):
                continue
            try:
                with open(path, 'r') as f:
                    content = f.read()
            except UnicodeDecodeError:
                continue  # not decodable as text (e.g. a binary file)
            query = np.array(embedder.encode(content), dtype=np.float32)
            nearest_scores, samples = vectorDB.get_nearest_examples('embedding', query, k=1)
            if len(nearest_scores) == 0:
                continue  # the index returned no neighbour for this file
            files[path] = {
                # k=1, so keep the single score as a plain float rather than an array.
                'score': float(nearest_scores[0]),
                'samples': samples,
                'content': content,
            }
    finally:
        # Everything needed is in memory now; always remove the clone.
        rmtree(repo_path, ignore_errors=True)

    pp = []
    scores = []
    patterns = []
    resources = []
    pattern_scores = {}
    for path, info in files.items():
        file_score = info['score']
        samples = info['samples']
        pattern = samples['Design Pattern'][0]

        scores.append(file_score)
        patterns.append(samples['Design Pattern'])
        pattern_scores.setdefault(pattern, []).append(file_score)

        file_entry = {
            "name": path.split('/')[-1],
            "score": file_score,
            "language": samples['Language'][0],
            "pattern": pattern,
        }
        if file_score > 1.0:
            suggested = samples['Unnamed: 4'][0]
            file_entry["resource"] = suggested
            # Feed the overall "resource" pick below; previously this list was never
            # filled, so the report always said "No resource".
            if suggested is not None:
                resources.append(suggested)
        pp.append(file_entry)

    if verbose:
        print("Getting Average Score and Highest Pattern Likelihood")
    score = float(np.mean(scores)) if len(scores) > 0 else float(0)
    has_pattern = score > 0.75

    top_pattern = "nothing"
    bot_pattern = "nothing"
    # Defined up front so the report can be built even when no file was scored.
    occurence = Counter()
    if len(patterns) > 0:
        for matched in patterns:
            occurence.update(matched)
        top_pattern = occurence.most_common(3)
        bot_pattern = occurence.most_common()[-3:]

    if len(resources) > 0:
        resource = max(resources, key=resources.count)
    else:
        resource = "No resource"

    avgs = {
        pattern: float(sum(values) / len(values))
        for pattern, values in pattern_scores.items()
    }

    report = {
        "design_pattern": bool(has_pattern),
        "repo_url": repo_url,
        "overall_score": str(score),
        "top_3_patterns": top_pattern,
        "bot_3_patterns": bot_pattern,
        "resource": resource,
        "files": list(pp),
        "occurance": dict(occurence),
        "averages": avgs,
    }
    if verbose:
        print(report)
    return report

In [16]:
# Deploy designpatterns_local_score through the Modelbit client.
# `git` is requested because the function shells out to `git clone`, and the two
# extra files are the dataset and FAISS index the function loads by bare file name.
# NOTE(review): presumably extra_files maps local path -> path in the deployment;
# confirm against the Modelbit documentation.
mb.deploy(designpatterns_local_score,
          python_packages=['faiss-cpu==1.7.4'],
          system_packages=['git'],
         extra_files={
    './data/2_design_patterns_embedded_dataset.csv': '2_design_patterns_embedded_dataset.csv', 
    "./indexes/2_designpattern_index.faiss": '2_designpattern_index.faiss'
    })

In [None]:
def _fetch_own_dm_messages(slack_token: str):
    """Read all Slack DMs visible to ``slack_token`` and keep the owner's messages.

    The most frequent author is assumed to be the token owner.

    Returns:
        ``(frame, self_user)`` where ``frame`` has the columns 'timestamp',
        'user' and 'text' and contains only ``self_user``'s messages.

    Raises:
        ValueError: If no message with text, user and ts fields was found.
    """
    import slack_sdk
    import pandas as pd

    client = slack_sdk.WebClient(token=slack_token)
    dm_channels_response = client.conversations_list(types="im")

    rows = []
    for channel in dm_channels_response["channels"]:
        history_response = client.conversations_history(channel=channel["id"])
        for message in history_response["messages"]:
            try:
                rows.append([message["ts"], message["user"], message["text"]])
            except (KeyError, TypeError):
                # Entries lacking a text, user or ts field cannot be scored; skip them.
                continue
    if not rows:
        raise ValueError("No Slack direct messages with text, user and ts fields were found")

    frame = pd.DataFrame(rows, columns=['timestamp', 'user', 'text'])
    self_user = frame['user'].value_counts().idxmax()
    # .copy() so columns can be added later without touching a slice of `frame`.
    return frame[frame.user == self_user].copy(), self_user


def professionalism_score(slack_token: str = "", user_id: str = None, verbose: bool = True):
    """Rate the professionalism of a user's Slack messages with the shared LLM.

    Message source, depending on the arguments:
        * ``user_id`` is ``None``: messages are fetched from Slack with
          ``slack_token`` and saved under ``./scores/user_slack_data/``.
        * ``user_id`` and ``slack_token`` given: messages come from
          ``_get_slack_diff``.
        * ``user_id`` given without a token: messages are read from the saved
          CSV whose file name contains ``user_id``.

    Each message is scored in order. The prompt contains the three nearest
    rated examples from the professionalism index, the earlier messages
    (oldest ones are dropped while the prompt text exceeds 3800 tokens) and
    the current message.

    Args:
        slack_token: Slack token used to fetch messages.
        user_id: Identifier of previously saved data for the user, if any.
        verbose: When true, progress messages are printed.

    Returns:
        The scored messages as a JSON string (``DataFrame.to_json()``), with
        the added columns 'embedding' and 'scores'. The same frame is written
        to ``./scores/user_slack_data/<user>_messages.csv``.

    Raises:
        ValueError: If Slack returned no usable messages (``user_id`` is ``None``).
        IndexError: If no saved file matches ``user_id`` (file-only mode).
    """
    import pandas as pd
    import tiktoken
    tiktoker = tiktoken.encoding_for_model('gpt-3.5-turbo')

    vectorDB = load_dataset('csv', data_files='3_professionalism_embedded_dataset.csv', split='train')
    vectorDB.load_faiss_index('embedding', '3_professionalism_index.faiss')

    number_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ###Instruction:
    You are an expert witness specializing in empathy, toxicity, and professionalism.
    Given a person's message history, some already-rated examples as context, and a current message, rate the messages on a scale of 1-100 for how professional they are.
    Please respond with only an integer between 1 and 100 where 1 is super toxic, 100 is super professional, and 50 is completely neutral.
    Then give a short explanation of how the person could be more professional.

    ###Input:

    #Examples: {examples}
    
    #Message History: {message_history}
    
    Current Message:
    {current_message}


    ###Response:
    Your Professionalism rating from 1-100 is """

    self_user = None
    if user_id is None:
        df, self_user = _fetch_own_dm_messages(slack_token)
        df.to_csv(f"./scores/user_slack_data/{self_user}_messages.csv")
    elif user_id and slack_token:
        # _get_slack_diff returns a DataFrame: bind it to `df` (it is extended and
        # saved below) and take the message texts from its 'text' column.
        df = _get_slack_diff(user_id=user_id, slack_token=slack_token).copy()
    else:
        files = os.listdir("./scores/user_slack_data/")
        file = [i for i in files if user_id in i]
        df = pd.read_csv(f"./scores/user_slack_data/{file[0]}")

    messages = [str(message) for message in df['text'].values.tolist()]

    # Stored embedding per message; empty messages get a placeholder text.
    df['embedding'] = [
        embedder.encode(message if len(message) > 0 else "Likely an emoji")
        for message in messages
    ]

    # The prompt and chain do not depend on the message, so build them once.
    formatted_prompt_template = PromptTemplate(
        input_variables=['examples', 'message_history', 'current_message'],
        template=number_template
    )
    chain = LLMChain(llm=llm, prompt=formatted_prompt_template)

    def _count_tokens(examples_text, history, current):
        return len(tiktoker.encode(f"{examples_text},\n" + ",\n".join(history) + current))

    message_history = []
    scores = []
    for i, message in enumerate(messages, start=1):
        if verbose:
            print(f"Searching VectorDB for {message[:10]}...")
        db_query = embedder.encode(message if len(message) > 0 else "Emoji")
        db_query = np.array(db_query, dtype=np.float32)
        _, context_list = vectorDB.get_nearest_examples("embedding", db_query, k=3)

        if verbose:
            print("Gathering Context...")
        score_string = "".join(
            f"Example: {similar_message}, Rating: {rating}, Reasoning: {comment}\n"
            for similar_message, rating, comment in zip(
                context_list['text'],
                context_list['rating'],
                context_list['comment'],
            )
        )

        if verbose:
            print("Running Chain...")

        # Drop the oldest history entries until the prompt fits the budget; stop
        # when the history is exhausted instead of popping from an empty list.
        num_tokens = _count_tokens(score_string, message_history, message)
        while num_tokens > 3800 and message_history:
            message_history.pop(0)
            num_tokens = _count_tokens(score_string, message_history, message)

        if verbose:
            print(f"Message token count: {num_tokens}")
            print(f"examples_from_db: {score_string}\ncurr_message: {message}")
        obj = chain.run({
            "examples": score_string,
            # Decided after trimming, so the prompt reflects the history actually kept.
            "message_history": ",\n".join(message_history) if message_history else "No Message History",
            "current_message": message,
        })
        message_history.append(message)
        scores.append(obj)
        if verbose:
            print(f"Finished Message {i} of {len(messages)}")

    df['scores'] = scores
    df.to_csv(f"./scores/user_slack_data/{user_id if user_id else self_user}_messages.csv")
    return df.to_json()

In [None]:
# Deploy professionalism_score through the Modelbit client, shipping the
# professionalism dataset and FAISS index the function loads by bare file name.
# NOTE(review): the function also uses the module-level `llm` (loaded from
# ./test_model/...), imports tiktoken and slack_sdk, and reads/writes
# ./scores/user_slack_data/ — none of these are listed here; confirm they are
# available in the deployment.
mb.deploy(professionalism_score,
            system_packages=['cmake'],
            python_packages=['faiss-cpu==1.7.4','llama-cpp-python==0.2.11'],
            extra_files={
                './data/3_professionalism_embedded_dataset.csv': '3_professionalism_embedded_dataset.csv', 
                "./indexes/3_professionalism_index.faiss": '3_professionalism_index.faiss'
        }
        )