# Imports

In [116]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [123]:
def language_count(garbage):
    """Count how many "top tier" languages appear in a language-set cell.

    Parameters
    ----------
    garbage : str | set | frozenset | list | tuple | None | float
        Normally the string form of a Python set as stored in the CSV, e.g.
        "{'Python', 'Ruby'}". An actual collection of language names is
        accepted as well. Any other value (None, or e.g. the float NaN that
        pandas typically yields for an empty CSV cell) counts as zero
        languages instead of raising AttributeError.

    Returns
    -------
    int
        Number of distinct names in the input that belong to the top-tier set.
    """
    top_tier_languages = {'Python', 'C', 'C++', 'JavaScript', 'TypeScript',
                          'Ruby', 'Rust', 'Lua', 'Assembly', 'Zig'}

    # Already a real collection: no string clean-up needed.
    if isinstance(garbage, (set, frozenset, list, tuple)):
        return len(set(garbage) & top_tier_languages)

    # Missing / non-text cell: nothing to count.
    if not isinstance(garbage, str):
        return 0

    # Strip spaces, braces and single quotes so that "{'A', 'B'}" becomes
    # "A,B", then split on commas to recover the individual names.
    cleaned = (
        garbage.replace(' ', '')
        .replace('{', '')
        .replace('}', '')
        .replace("'", '')
    )
    return len(set(cleaned.split(',')) & top_tier_languages)


# Dataset

In [117]:
# Load the small labeled dataset straight into a NumPy array and print the
# first row as a quick look at the column layout.
small_labeled_data = np.array(pd.read_csv('resources/req_dataset.csv'))
print(small_labeled_data[0])

['stiff' 'https://api.github.com/users/stiff' 'https://github.com/stiff'
 'https://api.github.com/users/stiff/repos' 14 84 21 84
 "{'Python', 'CoffeeScript', 'Ruby', 'JavaScript'}" 21 0]


# Unlabeled dataset


In [118]:
# Load the pool that is treated as unlabeled and keep at most the first
# 15000 rows as a NumPy array.
# NOTE(review): the file is named 'labeled_data.csv' but is used as the
# unlabeled set here - confirm this is the intended file.
unlabeled_data = np.array(pd.read_csv('resources/labeled_data.csv'))[:15000]
print(unlabeled_data.shape)

(15000, 12)


In [124]:
# Integer feature matrix for the unlabeled pool. The row count follows the
# loaded data instead of a hard-coded 15000, so a shorter CSV no longer breaks
# the column assignments below.
processed_unlabeled_data = np.zeros((len(unlabeled_data), 6), dtype=int)

# Columns 0-3: the numeric source columns 4, 6, 7 and 11.
processed_unlabeled_data[:, :4] = unlabeled_data[:, [4, 6, 7, 11]].astype(int)

# Column 4 (second to last): number of top-tier languages parsed from source
# column 9. The last column is left at zero by this cell.
processed_unlabeled_data[:, -2] = [language_count(langs) for langs in unlabeled_data[:, 9]]

processed_unlabeled_data[200]

array([172,  78,  19,  30,   4,   0])

# Data Processing
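- Get the relevant columns: for the labeled data these are the numeric columns (4, 5, 6, 9) plus the language set in column 8; the unlabeled data above uses columns (4, 6, 7, 11) plus the language set in column 9.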

In [120]:
# Empty integer feature matrix (5 features per row). The row count follows the
# loaded labeled data instead of a hard-coded 3000.
processed_labeled_data = np.zeros((len(small_labeled_data), 5), dtype=int)

# Class labels are taken from the last column of the labeled dataset.
y = small_labeled_data[:, -1].astype(np.int64)
y.dtype

dtype('int64')

In [121]:
# Fill feature columns 0-3 from the numeric source columns 4, 5, 6 and 9,
# then show one sample row as a sanity check.
numeric_columns = [4, 5, 6, 9]
processed_labeled_data[:, :4] = small_labeled_data[:, numeric_columns].astype(int)
processed_labeled_data[200]

array([172,  78,  19,  30,   0])

In [125]:
# Last feature column: number of top-tier languages parsed from the language
# set stored in source column 8.
language_counts = [language_count(langs) for langs in small_labeled_data[:, 8]]
processed_labeled_data[:, -1] = np.array(language_counts)
processed_labeled_data[200]


array([172,  78,  19,  30,   4])

In [133]:
# Fit the scaler on the labeled features, then replace them with their
# standardized version.
# NOTE: this cell rebinds `processed_labeled_data`, so running it a second time
# refits the scaler on data that has already been scaled.
scaler = StandardScaler().fit(processed_labeled_data)
processed_labeled_data = scaler.transform(processed_labeled_data)

# A dict is not an array, so np.save stores it as a pickled object; reading it
# back needs np.load(..., allow_pickle=True).item().
labeled_bundle = {'0' : processed_labeled_data, '1' : y}
np.save('resources/processed_labeled_data.npy', labeled_bundle)

In [8]:
# One-hot encode the integer labels: row i of the identity matrix is the
# one-hot vector for class i, so indexing it with `y` yields one row per sample.
y_label = np.eye(len(np.unique(y)), dtype=np.int64)[y]
y_label.shape


(3000, 2)

In [127]:
# First batch of unlabeled samples (first 8000 rows, 5 feature columns).
# Use transform(), not fit_transform(): the scaler was fitted on the labeled
# features, and refitting it here would both scale this batch with different
# statistics than the training data and overwrite the fitted scaler state.
first_batch = scaler.transform(processed_unlabeled_data[:8000, :5])
first_batch.shape

(8000, 5)

# Neural network


In [9]:
# @title Neural Network
class Layer_Dense:
  """Fully connected layer: output = inputs . weights + biases."""

  def __init__(self, n_inputs, n_neurons):
    """Create small random weights and zero biases.

    Weights have shape (n_inputs, n_neurons) and are drawn from a standard
    normal scaled by 0.01; biases are a (1, n_neurons) row of zeros.
    """
    self.weights = np.random.randn(n_inputs, n_neurons) * 0.01
    self.biases = np.zeros(shape=(1, n_neurons))

  def forward(self, inputs):
    """Compute the affine transform of `inputs` and keep it in `self.output`."""
    weighted_sum = np.dot(inputs, self.weights)
    self.output = weighted_sum + self.biases

  def parameters(self):
    """Return the weight matrix (the biases are not part of the result)."""
    return self.weights

class Activation_ReLU:
  """Rectified linear unit activation."""

  def forward(self, inputs):
    """Store the element-wise maximum of 0 and `inputs` in `self.output`."""
    rectified = np.maximum(0, inputs)
    self.output = rectified

class Activation_Softmax:
  """Row-wise softmax activation."""

  def forward(self, inputs):
    """Store the softmax of each row of `inputs` in `self.output`.

    The row maximum is subtracted before exponentiating so the largest
    exponent is exp(0); this does not change the result of the softmax.
    """
    row_max = np.max(inputs, axis=1, keepdims=True)
    exp_values = np.exp(inputs - row_max)
    row_totals = np.sum(exp_values, axis=1, keepdims=True)
    self.output = exp_values / row_totals


class Loss_Categoricalcrossentropy():
  """Mean categorical cross-entropy between predicted probabilities and labels."""

  def forward(self, probs, y_true):
    """Return the mean negative log-likelihood of the true classes.

    Parameters
    ----------
    probs : array-like, shape (n_samples, n_classes)
        Predicted class probabilities, one row per sample.
    y_true : array-like
        Either one-hot targets with the same shape as `probs`, or a 1-D
        array of integer class indices with one entry per sample.

    Returns
    -------
    float
        Mean of -log(probability assigned to the true class).
    """
    # Keep probabilities away from 0 and 1 so the logarithm stays finite.
    probs_clipped = np.clip(probs, 1e-7, 1 - 1e-7)
    y_true = np.asarray(y_true)

    if y_true.ndim == 1:
      # Integer labels: pick each row's probability at its class index.
      correct_confidences = probs_clipped[np.arange(len(probs_clipped)), y_true]
    else:
      # One-hot labels: the mask keeps only the true-class probability.
      correct_confidences = np.sum(probs_clipped * y_true, axis=1)

    return np.mean(-np.log(correct_confidences))


# Creating the layers and activations

In [10]:
# Network layout: 5 input features -> 32 hidden units (ReLU) -> 2 outputs (softmax).
layer1 = Layer_Dense(n_inputs=5, n_neurons=32)
activation1 = Activation_ReLU()
layer2 = Layer_Dense(n_inputs=32, n_neurons=2)
activation2 = Activation_Softmax()

# Forward pass

In [129]:
# Hidden layer: affine transform followed by ReLU.
layer1.forward(processed_labeled_data)
activation1.forward(layer1.output)
a1 = activation1.output
print(a1.shape)

# Output layer: affine transform followed by softmax.
layer2.forward(a1)
activation2.forward(layer2.output)

# NOTE: despite its name, `logits` holds the softmax output (probabilities),
# not the pre-softmax scores.
logits = activation2.output
print(logits.shape)

(3000, 32)
(3000, 2)


# Loss

In [130]:
# The loss evaluation below is disabled in this cell; only accuracy is reported.
# loss_function = Loss_Categoricalcrossentropy()
# loss = loss_function.forward(logits,y_label)
# print(f"Loss : {loss}")

# Predicted class = column index of the largest softmax output in each row.
y_pred = activation2.output.argmax(axis=1)
accuracy = (y_pred == y).mean()
print(f"Accuracy:{accuracy*100}")


Accuracy:40.400000000000006


# Backward pass

In [13]:
# Manual backpropagation through softmax -> dense -> ReLU -> dense.
# Gradients are summed over all samples (no division by the sample count),
# whereas the loss class reports a mean.
w1 = layer1.weights
w2 = layer2.weights

# Output layer: for softmax combined with cross-entropy, the gradient with
# respect to the pre-softmax scores is (probabilities - one-hot targets).
da2 = logits - y_label
dw2 = a1.T @ da2
db2 = da2.sum(axis=0)

# Hidden layer: push the gradient back through layer2's weights and block it
# wherever the ReLU output was zero.
da1 = da2 @ w2.T
dz1 = da1 * (a1 > 0)
dw1 = processed_labeled_data.T @ dz1
db1 = dz1.sum(axis=0)

# Gradient descent

In [14]:

learning_rate = 0.0001

# One plain gradient-descent step, applied in place, grouped by layer.
layer1.weights -= learning_rate * dw1
layer1.biases -= learning_rate * db1

layer2.weights -= learning_rate * dw2
layer2.biases -= learning_rate * db2


# Training loop

In [109]:
learning_rate = 0.000009
n_epochs = 100

# Build the loss object once; it was previously re-created on every iteration
# although nothing about it changes between epochs.
loss_function = Loss_Categoricalcrossentropy()

for _ in range(n_epochs):
    # ---- Forward pass over the whole labeled set ----
    layer1.forward(processed_labeled_data)
    activation1.forward(layer1.output)
    a1 = activation1.output

    layer2.forward(activation1.output)
    activation2.forward(layer2.output)
    # NOTE: `logits` holds the softmax output (probabilities).
    logits = activation2.output

    # ---- Metrics for this epoch ----
    loss = loss_function.forward(logits,y_label)
    y_pred = np.argmax(activation2.output,axis=1)
    accuracy = np.mean(y_pred == y)

    print(f"Loss : {loss}, Accuracy:{accuracy*100}")

    # ---- Backward pass (gradients summed over samples, not averaged) ----
    w1 = layer1.weights
    w2 = layer2.weights
    da2 = (logits - y_label)       # softmax + cross-entropy gradient
    dw2 = a1.T @ da2
    db2 = np.sum(da2,axis=0)
    da1 = da2 @ w2.T
    dz1 = da1 * (a1 > 0)           # ReLU passes gradient only where it was active
    dw1 = processed_labeled_data.T @ dz1
    db1 = np.sum(dz1,axis=0)

    # ---- In-place gradient-descent update ----
    layer1.weights -= learning_rate * dw1
    layer2.weights -= learning_rate * dw2

    layer1.biases -= learning_rate * db1
    layer2.biases -= learning_rate * db2
    

Loss : 0.09172424462977354, Accuracy:96.13333333333334
Loss : 0.09172398414165112, Accuracy:96.13333333333334
Loss : 0.09172372364912659, Accuracy:96.13333333333334
Loss : 0.09172346321739966, Accuracy:96.13333333333334
Loss : 0.09172320277272115, Accuracy:96.13333333333334
Loss : 0.09172294239611388, Accuracy:96.13333333333334
Loss : 0.09172268200387775, Accuracy:96.13333333333334
Loss : 0.09172242166664442, Accuracy:96.13333333333334
Loss : 0.09172216133708543, Accuracy:96.13333333333334
Loss : 0.0917219010378246, Accuracy:96.13333333333334
Loss : 0.09172164077425718, Accuracy:96.13333333333334
Loss : 0.09172138051484202, Accuracy:96.13333333333334
Loss : 0.09172112031419807, Accuracy:96.13333333333334
Loss : 0.09172086009596664, Accuracy:96.13333333333334
Loss : 0.09172059994823754, Accuracy:96.13333333333334
Loss : 0.09172033978742519, Accuracy:96.13333333333334
Loss : 0.09172007967944554, Accuracy:96.13333333333334
Loss : 0.09171981958535491, Accuracy:96.13333333333334
Loss : 0.09

In [82]:
# Take references to the trained parameters of both layers.
w1, b1 = layer1.weights, layer1.biases
w2, b2 = layer2.weights, layer2.biases

In [85]:
# Bundle the parameters under the keys "w1", "w2", "b1", "b2" and write them to
# disk. A dict is not an array, so np.save stores it as a pickled object;
# reading it back needs np.load(..., allow_pickle=True).item().
weights = dict(w1=w1, w2=w2, b1=b1, b2=b2)
np.save('resources/weights.npy', weights)
