In [1]:
import scipy.sparse
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pickle
import gc

In [2]:
labels = ["API", "Blockchain", "Compliance", "Data/ML", "Development", "HR", "Infrastructure",
          "Monetization", "Productivity", "UI", "Security"]

print("Loading data")
labelled_df = pd.read_csv("labelled_old.csv")

labelled_df.dropna(axis=0, inplace=True, subset=["repo", "text"])
labelled_df.reset_index(inplace=True, drop=True)
categories = [" ".join(labelled_df[labelled_df["label"] == label]["text"].tolist()) for label in labels]
print({label: len(labelled_df[labelled_df["label"] == label]) for label in labels})

Loading data
{'API': 44, 'Blockchain': 11, 'Compliance': 2, 'Data/ML': 28, 'Development': 162, 'HR': 15, 'Infrastructure': 147, 'Monetization': 3, 'Productivity': 34, 'UI': 97, 'Security': 25}


In [None]:
unlabelled_df = pd.read_csv("unlabelled.csv")
# unlabelled_df.tail(-100).to_csv("unlabelled.csv")
unlabelled_df = unlabelled_df.head(100)
unlabelled_df.dropna(axis=0, inplace=True, subset=["repo", "text"])
unlabelled_df.reset_index(drop=True, inplace=True)

In [3]:
unlabelled_df = pd.read_csv("unlabelled_all.csv").tail(-100000)
unlabelled_df.dropna(axis=0, inplace=True, subset=["repo", "text"])
unlabelled_df.reset_index(drop=True, inplace=True)

In [4]:
print(len(unlabelled_df))
gc.collect()

131174


76

In [5]:
print("Fitting CV & TFIDF")

corpus = labelled_df["text"].append(unlabelled_df["text"]).tolist()
eng = stopwords.words("english")

cv = CountVectorizer(stop_words=eng, min_df=0.01, max_df=0.1)
tfidf = TfidfTransformer()

print("Fitting CV")
labelled_corpus = labelled_df["text"].tolist()
unlabelled_corpus = unlabelled_df["text"].tolist()
cv.fit(corpus)

print("Fitting TFIDF")
arr = cv.transform(labelled_corpus)
arr = tfidf.fit_transform(arr)

print("Transforming")
unlabelled_arr = tfidf.transform(cv.transform(unlabelled_corpus))

Fitting CV & TFIDF
Fitting CV
Fitting TFIDF
Transforming


In [10]:
print("Dimensionality Reduction")
n_components = 400
svd = TruncatedSVD(n_components=n_components, n_iter=20)
print("Fitting SVD")
svd.fit(scipy.sparse.vstack((arr, unlabelled_arr)))

explained_variance = np.sum(svd.explained_variance_ratio_)
print(f"Explained variance with {n_components} components: {explained_variance * 100}%")

Dimensionality Reduction
Fitting SVD
Explained variance with 400 components: 57.060628104373464%


In [11]:
with open("others.svd", "wb") as f:
    pickle.dump(svd, f)

In [6]:
with open("100000.svd", "rb") as f:
    svd = pickle.load(f)

In [12]:
print("Generating Dataset")
X = svd.transform(arr)
Y = np.zeros(shape=(X.shape[0],))

for i, row in labelled_df.iterrows():
    label = row["label"]
    if not pd.isna(label):
        Y[i] = labels.index(label)

X_out = svd.transform(unlabelled_arr)

Generating Dataset


In [13]:
print(X.shape)

(570, 400)


In [None]:
min_train, min_test = 100, 20
accuracies = []
for n_train in range(min_train, len(X) - min_test):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=n_train)

    classifier = KNeighborsClassifier(n_neighbors=5, weights="distance")
    classifier.fit(X_train, Y_train)

    Y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(Y_test, Y_pred)
    accuracies.append(accuracy)

plt.plot(np.arange(min_train, len(X) - min_test), accuracies, "r--")
plt.show()

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=75)
classifier = KNeighborsClassifier(n_neighbors=5, weights="distance")
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
print(accuracy_score(Y_test, Y_pred))

0.4666666666666667


In [None]:
plt.hist(Y)
plt.show()

In [15]:
print("Predicting")
knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn.fit(X, Y)

Y_proba = knn.predict_proba(X_out)
Y_pred = np.argmax(Y_proba, axis=1)
Y_conf = np.max(Y_proba, axis=1)

Predicting


In [16]:
threshold = 0.401
selection = Y_conf < threshold
n_low_confidence = np.count_nonzero(selection) / len(Y_conf)
print(f"{n_low_confidence * 100}% \"low\" confidence")

36.411179044627744% "low" confidence


In [17]:
unlabelled_df["label"] = pd.Series(map(lambda pred: labels[int(pred)], Y_pred))
unlabelled_df["confidence"] = pd.Series(Y_conf, index=unlabelled_df.index)

unlabelled_df = unlabelled_df[["repo", "label", "confidence"]]
unlabelled_df.to_csv("part_two.csv", index=False)

In [None]:
correct = unlabelled_df[np.logical_not(selection)]
check = unlabelled_df[selection]
correct.to_csv("correct.csv", index=False)
check.to_csv("to_check.csv", index=False)

In [None]:
print("Saved")