# Model Analysis

Below we analyze sample completions from our model after it was trained for 200,000 steps.

In [1]:
import re
from transformers import pipeline, set_seed
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Seed once up front; autocomplete() below also re-seeds on every call via its `seed` argument.
set_seed(42)

In [3]:
# Checkpoint identifier, kept in one place so model and tokenizer always match.
CHECKPOINT = "razhan/codeqmul"

# Load the weights and tokenizer once, then hand the already-loaded objects to the
# pipeline. Passing the checkpoint name to pipeline() instead would load a second
# copy of the model next to the one used for the parameter count below.
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# device=0 targets the first CUDA device, so this cell needs a GPU to run.
generation = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)

In [4]:
# Add up the element counts of every parameter tensor; dividing by 1000**2 reports the total in millions.
print(f'Model size: {sum(t.numel() for t in model.parameters())/1000**2:.1f}M parameters')

Model size: 111.2M parameters


In [5]:


def first_block(string):
    """Return the text before the first new top-level statement, right-stripped.

    The cut happens at the first newline that is immediately followed by
    ``class``, ``def``, ``#``, ``@``, ``print`` or ``if``. These are prefix
    matches at column 0, so indented occurrences do not trigger a cut.
    If no such line exists, the whole input is kept. Trailing whitespace is
    removed from the result in either case.
    """
    stop = re.search('\nclass|\ndef|\n#|\n@|\nprint|\nif', string)
    head = string if stop is None else string[:stop.start()]
    return head.rstrip()

def autocomplete(pipe, prompt, max_length=64, num_completions=4, seed=42):
    """Sample completions for ``prompt`` and print the first code block of each.

    Args:
        pipe: Text-generation callable, invoked as
            ``pipe(prompt, num_return_sequences=..., max_length=..., **kwargs)``.
            It must return an iterable of dicts with a ``'generated_text'`` key.
        prompt: Text to complete. The first ``len(prompt)`` characters of every
            generated text are dropped, i.e. the prompt is assumed to be echoed
            at the start of the output.
        max_length: Forwarded to ``pipe`` as ``max_length``.
        num_completions: Forwarded to ``pipe`` as ``num_return_sequences``.
        seed: Passed to ``set_seed`` before sampling.

    Returns:
        None. The trimmed completions are printed, separated by a line of
        80 ``#`` characters.
    """
    set_seed(seed)
    gen_kwargs = {"temperature": 0.4, "top_p": 0.95, "top_k": 0, "num_beams": 1, "do_sample": True}
    # Call the pipeline the caller passed in. The previous version ignored
    # `pipe` and always used the notebook-global `generation`.
    code_gens = pipe(prompt, num_return_sequences=num_completions,
                     max_length=max_length, **gen_kwargs)
    # Strip the echoed prompt, then keep only the first block of each completion.
    code_strings = [
        first_block(code_gen['generated_text'][len(prompt):])
        for code_gen in code_gens
    ]
    print(('\n' + '#' * 80 + '\n').join(code_strings))

In [9]:
# Prompt: a function signature followed by a one-line docstring.
# NOTE(review): the docstring line starts at column 0 instead of being indented
# under the `def`, so the prompt is not valid Python as written (the count_words
# and load_jsonl prompts indent theirs). The recorded completions were generated
# from this exact text; re-run the cell if the prompt is corrected.
prompt = '''def get_urls_from_html(html):
"""Get all embedded URLs in a HTML string."""'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



urls = []
for link in re.findall(r'<a href="(.+?)"', html):
  url = link[link.find('href') + len('href'):]
  urls.append(url)
return urls
################################################################################

urls = []
for match in re.finditer(r'<a href="(.*?)">', html):
  url = match.group(1)
  urls.append(url)
return urls
################################################################################

urls = []
for match in re.finditer(r'href\s*=\s*["\']([^"\']*)', html):
  url = match.group(1)
  urls.append(url)
################################################################################

urls = []
for link in re.findall(r'<a href="([^"]+)"', html):
  urls.append(link.replace(r'\/', '/'))
return urls


In [25]:
# Prompt: a bare signature with no docstring, so the function name is the only hint.
# max_length is raised from autocomplete()'s default of 64 to 96.
prompt = '''def get_file_size(filepath):'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



    """
    Return the size of a file.

    :param filepath: Path to the file.
    :type filepath: str
    :return: File size in bytes.
    :rtype: int
    """
    with open(filepath, 'rb') as f:
        return os.path.getsize(f.name)
################################################################################

    """
    Returns the size of a file in bytes.

    :param filepath: path to the file to get the size of
    :type filepath: str
    :return: size of the file in bytes
    :rtype: int
    """
    with open(filepath, 'rb') as f:
        return os.path.getsize(filepath)
################################################################################

    """
    Returns the size of a file, in bytes.

    :param filepath: The file to check.
    :type filepath: str
    :returns: The size of the file, in bytes.
    :rtype: int
    """
    with open(filepath, 'rb') as f:
        return os.path.getsize(filepath)
###################################################################

In [13]:
# Flask example
# NOTE(review): the prompt text is not a valid route decorator. The closing quote
# of "/register" sits after the list, the keyword is `method` rather than
# `methods`, and the line ends with `:` as if it were a `def`. Presumably the
# intended prompt was `@app.route("/register", methods=["POST"])` followed by a
# `def` line; confirm and re-run, since the recorded completion came from the
# text exactly as written here.
prompt = '''@app.route("/register, method=["POST"]"):'''
autocomplete(generation, prompt, max_length=256)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



    if request.method == "POST":
        username = request.form.get("username")
        password = request.form.get("password")
        email = request.form.get("email")
        first_name = request.form.get("first_name")
        last_name = request.form.get("last_name")
        email_address = request.form.get("email_address")
        password_confirmation = request.form.get("password_confirmation")
        password_confirmation = password_confirmation.strip()
        if password == password_confirmation:
            if username == email:
                flash("You have already registered")
                return redirect(url_for("login"))
            else:
                user = User(username, email, first_name, last_name, password)
                db.session.add(user)
                db.session.commit()
                flash("You have successfully registered")
                return redirect(url_for("index"))
        else:
            flash("Incorrect password")
            return

In [16]:
# Prompt: signature plus an indented one-line docstring that states the task.
prompt = '''def count_words(filename):
    """Count the number of occurrences of each word in the file"""'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



    word_count = {}
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line:
                word, count = line.split()
                word_count[word] = count
    return word_count
################################################################################

    counts = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line[0] == '#':
                continue
            parts = line.split()
            if len(parts)!= 2:
                continue
            counts[parts
################################################################################

    counter = Counter()
    with open(filename, 'r') as f:
        for line in f:
            counter.update(line.lower().split())
    return counter
################################################################################

    counts = {}
    with op

In [18]:
# Prompt: two data-setup lines and a trailing comment naming the next step.
# The prompt contains no imports, so `np` and the classifier are left implicit.
# NOTE(review): np.random.randint excludes its upper bound, so
# randint(0, 1, 100) produces only zeros. For a two-class target the prompt
# presumably wanted randint(0, 2, 100). This is prompt text only and is never
# executed in this notebook; re-run the cell if it is changed.
prompt = '''X = np.random.randn(100, 100)
y = np.random.randint(0, 1, 100)
# fit random forest classifier with 30 estimators'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



clf = RandomForestClassifier(n_estimators=30)
clf.fit(X, y)
################################################################################

clf = RandomForestClassifier(n_estimators=30)
clf.fit(X, y)
################################################################################

clf = RandomForestClassifier(n_estimators=30)
clf.fit(X, y)
################################################################################

clf = RandomForestClassifier(n_estimators=30, criterion="entropy", max_depth=None, random_state=0)
clf.fit(X, y)


In [19]:
# Prompt: signature plus a docstring that mentions both gzip and jsonl.
prompt = '''def load_jsonl(filename):
    """ Load the given gzip jsonl file. """'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



    with gzip.open(filename, 'rb') as f:
        return json.load(f)
################################################################################

    with gzip.open(filename, 'rb') as f:
        return json.load(f)
################################################################################

    with gzip.open(filename, 'rb') as f:
        return json.load(f)
################################################################################

    with gzip.open(filename, 'rb') as f:
        return json.load(f)


In [21]:
# Prompt: a class header and an __init__ signature; the completion starts inside __init__.
prompt = '''class Person:
    def __init__(self, name, age, gender):'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



        self.name = name
        self.age = age
        self.gender = gender

    def __str__(self):
        return f"Name: {self.name}, Age: {self.age}, Gender: {self.gender}"
################################################################################

        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return f"Person({self.name}, {self.age}, {self.gender})"
################################################################################

        self.name = name
        self.age = age
        self.gender = gender

    def __str__(self):
        return f"Name: {self.name}, Age: {self.age}, Gender: {self.gender}"
################################################################################

        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return "<Person {0}>".format(self.name)


In [30]:
# Prompt: a numpy import followed by a bare signature with no docstring.
# Presumably meant to show whether the import steers the completion toward numpy.
prompt = '''import numpy as np

def mean(a):'''
autocomplete(generation, prompt, max_length=96)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



    return sum(a)/len(a)
################################################################################

    """
    Compute the mean of an array.

    Parameters
    ----------
    a : array_like
        Input array.

    Returns
    -------
    out : ndarray
        The mean of the values in the input array.

    See Also
    --------
    var, std, nanmean, nanstd

    Notes
    -----
    The mean is computed for each column in the input array, then for each
    column the mean is computed for
################################################################################

    return np.mean(a)
################################################################################

    return sum(a)/len(a)


In [33]:
# Prompt: a longer script ending in a comment that asks for a train/test split.
# The prompt imports RandomForestClassifier but not train_test_split, which
# every recorded completion below calls.
# NOTE(review): np.random.randint excludes its upper bound, so
# randint(0, 1, 100) produces only zeros. For a two-class target the prompt
# presumably wanted randint(0, 2, 100). This is prompt text only and is never
# executed in this notebook; re-run the cell if it is changed.
prompt = '''import numpy as np
from sklearn.ensemble import RandomForestClassifier

# create training data
X = np.random.randn(100, 100)
y = np.random.randint(0, 1, 100)

# setup train test split with test_size of 0.3'''
autocomplete(generation, prompt, max_length=256)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
################################################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
################################################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
################################################################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
