In [90]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#### Loading and preparing data

In [91]:
# don't run this every time! re-import the exported csv to save time.
# commented out below to prevent accidental execution.

# dirs = [("DevGPT/" + d) for d in os.listdir("DevGPT") if os.path.isdir("DevGPT/" + d)] # modify this to limit snapshots

# files = []
# for dir in dirs:
#     for f in os.listdir(dir):
#         if f[-5:] == ".json": files.append(dir + "/" + f)

# print(files)

# imports = []
# for f in files:
#     if f[-5:] == ".json":  # this one pulls all files
#         trydf = pd.read_json(f)
#         trydf = pd.json_normalize(trydf["Sources"])
#         trydf["filepath"] = f
#         imports.append(trydf)

# df = pd.concat(imports, ignore_index = True)
# df.to_csv("df-export.csv", index = False)

In [92]:
dfi = pd.read_csv("df-export.csv")

In [93]:
dfa = dfi[["Type", "Author", "Title", "Body", "Message", "CommitMessage", "filepath"]]

# query statements, checking which columns have text to extract

hn_q = "filepath.str.contains('hn_sharings.json')"     # Title
pr_q = "filepath.str.contains('pr_sharings.json')"     # Title, Body
is_q = "filepath.str.contains('issue_sharings.json')"     # Title, Body
di_q = "filepath.str.contains('discussion_sharings.json')"    # Title, Body
co_q = "filepath.str.contains('commit_sharings.json')"         # Message
fi_q = "filepath.str.contains('file_sharings.json')"          # CommitMessage

for query in [hn_q, pr_q, is_q, di_q, co_q, fi_q]:
    print(~dfa.query(query)[["Title", "Body", "Message", "CommitMessage"]].head(1).isna())

   Title   Body  Message  CommitMessage
0   True  False    False          False
     Title  Body  Message  CommitMessage
138   True  True    False          False
     Title  Body  Message  CommitMessage
285   True  True    False          False
     Title  Body  Message  CommitMessage
520   True  True    False          False
     Title   Body  Message  CommitMessage
552  False  False     True          False
     Title   Body  Message  CommitMessage
731  False  False    False           True


In [94]:
# combine text columns

dfa["text"] = dfa[["Title", "Body", "Message", "CommitMessage"]].agg(lambda x: "".join(x.dropna().astype(str)), axis = 1)
dfa = dfa.drop(["Title", "Body", "Message", "CommitMessage", "filepath"], axis = 1)

dfa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfa["text"] = dfa[["Title", "Body", "Message", "CommitMessage"]].agg(lambda x: "".join(x.dropna().astype(str)), axis = 1)


Unnamed: 0,Type,Author,text
0,hacker news,cbowal,OpenAI shuts down its AI Classifier due to poo...
1,hacker news,warrenm,“Devil’s horsemen”: Why Mongol horse archers w...
2,hacker news,ayhanfuat,The Fall of Stack Overflow
3,hacker news,cdme,Death Metal English (2013)
4,hacker news,notRobot,Shopify employee breaks NDA to reveal firm rep...
...,...,...,...
10661,hacker news,ingve,Integrating Zig and SwiftUI
10662,hacker news,scraptor,"Lawyer cites fake cases invented by ChatGPT, j..."
10663,hacker news,admtal,Show HN: A chat bot you can ask anything about...
10664,hacker news,obiefernandez,ChatGPT conversations can be shared publicly


In [96]:
dfa["text_pp"] = dfa["text"].apply(lambda x: gensim.utils.simple_preprocess(x))
dfa = dfa.drop(["text"], axis = 1)

dfa

Unnamed: 0,Type,Author,text_pp
0,hacker news,cbowal,"[openai, shuts, down, its, ai, classifier, due..."
1,hacker news,warrenm,"[devil, horsemen, why, mongol, horse, archers,..."
2,hacker news,ayhanfuat,"[the, fall, of, stack, overflow]"
3,hacker news,cdme,"[death, metal, english]"
4,hacker news,notRobot,"[shopify, employee, breaks, nda, to, reveal, f..."
...,...,...,...
10661,hacker news,ingve,"[integrating, zig, and, swiftui]"
10662,hacker news,scraptor,"[lawyer, cites, fake, cases, invented, by, cha..."
10663,hacker news,admtal,"[show, hn, chat, bot, you, can, ask, anything,..."
10664,hacker news,obiefernandez,"[chatgpt, conversations, can, be, shared, publ..."


#### Apply model - basic

In [37]:
# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    dfa["text_pp"], dfa["Author"] , test_size = 0.2, random_state = 2)

# apply word vector model on training data
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size = 100,
                                   window = 5,
                                   min_count = 2)

words = set(w2v_model.wv.index_to_key)

# create average sentence vectors to train on
X_train_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                         if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                         for ls in X_train])

X_test_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                         if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                         for ls in X_test])

# define random forest model
rf = RandomForestClassifier(n_estimators = 100, random_state = 2)

# fit rf model to vectorized training data
rf.fit(X_train_vect, y_train)

# predict values on vectorized test data
y_pred = rf.predict(X_test_vect)

# report accuracy of test data predictions
print("Test set classification accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))

Test set classification accuracy: 88.61 %


#### Apply model - optimized

In [78]:
# checking how many rows are removed for minimum comment thresholds
for i in [1, 2, 5, 10, 100]:
    count = len(dfa.groupby("Author").filter(lambda x: len(x) >= i))
    perc = count / len(dfa) * 100
    print("%i rows in dataframe for authors with > %i submissions -- %.1f %% of original." %
          (count, i, perc))

10666 rows in dataframe for authors with > 1 submissions -- 100.0 % of original.
10557 rows in dataframe for authors with > 2 submissions -- 99.0 % of original.
9558 rows in dataframe for authors with > 5 submissions -- 89.6 % of original.
5624 rows in dataframe for authors with > 10 submissions -- 52.7 % of original.
2483 rows in dataframe for authors with > 100 submissions -- 23.3 % of original.


In [81]:
# remove text from authors that have fewer than 5 comments
dfb = dfa.groupby("Author").filter(lambda x: len(x) >= 5)

# include post type as a token 
dfb["text_pp"] = dfb.apply(lambda row: row["text_pp"] + [row["Type"]], axis = 1)

dfb

Unnamed: 0,Type,Author,text_pp
0,hacker news,cbowal,"[openai, shuts, down, its, ai, classifier, due..."
1,hacker news,warrenm,"[devil, horsemen, why, mongol, horse, archers,..."
2,hacker news,ayhanfuat,"[the, fall, of, stack, overflow, hacker news]"
3,hacker news,cdme,"[death, metal, english, hacker news]"
4,hacker news,notRobot,"[shopify, employee, breaks, nda, to, reveal, f..."
...,...,...,...
10661,hacker news,ingve,"[integrating, zig, and, swiftui, hacker news]"
10662,hacker news,scraptor,"[lawyer, cites, fake, cases, invented, by, cha..."
10663,hacker news,admtal,"[show, hn, chat, bot, you, can, ask, anything,..."
10664,hacker news,obiefernandez,"[chatgpt, conversations, can, be, shared, publ..."


##### Tuning w2v_model parameters:

In [82]:
X_train, X_test, y_train, y_test = train_test_split(
    dfb["text_pp"], dfb["Author"] , test_size = 0.2, random_state = 2) 
# included Type column in text column this time for additional predictive power

skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 2)

for vec, win in [[50, 5], [50, 10], [50, 15],
                 [100, 5], [100, 10], [100, 15],
                 [200, 5], [200, 10], [200, 15]]:

    scores = []
    for train_index, test_index in skf.split(X_train, y_train):

        X_tr = X_train.iloc[train_index]
        X_te = X_train.iloc[test_index]
        y_tr = y_train.iloc[train_index]
        y_te = y_train.iloc[test_index]

        w2v_model = gensim.models.Word2Vec(X_tr,
                                        vector_size = vec,
                                        window = win,
                                        min_count = 1)
        # not too much extra to include all words: 12394 for count 1, 11685 for count 2 (default)

        words = set(w2v_model.wv.index_to_key)

        X_train_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                                if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                                for ls in X_tr])

        X_test_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                                if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                                for ls in X_te])

        rf = RandomForestClassifier(n_estimators = 100, random_state = 2)

        # fit rf model to vectorized training data
        rf.fit(X_train_vect, y_tr)

        # predict values on vectorized test data
        y_pred = rf.predict(X_test_vect)

        scores.append(accuracy_score(y_te, y_pred))
    # report accuracy of test data predictions
    print("For model with vector size %i and window size %i:\nTest set classification accuracy: %.3f , std dev: %.3f" % (vec, win, np.mean(scores), np.std(scores)))



For model with vector size 50 and window size 5:
Test set classification accuracy: 0.915 , std dev: 0.005




For model with vector size 50 and window size 10:
Test set classification accuracy: 0.915 , std dev: 0.005




For model with vector size 50 and window size 15:
Test set classification accuracy: 0.915 , std dev: 0.005




For model with vector size 100 and window size 5:
Test set classification accuracy: 0.916 , std dev: 0.006




For model with vector size 100 and window size 10:
Test set classification accuracy: 0.914 , std dev: 0.005




For model with vector size 100 and window size 15:
Test set classification accuracy: 0.916 , std dev: 0.004




For model with vector size 200 and window size 5:
Test set classification accuracy: 0.915 , std dev: 0.004




For model with vector size 200 and window size 10:
Test set classification accuracy: 0.914 , std dev: 0.005




For model with vector size 200 and window size 15:
Test set classification accuracy: 0.916 , std dev: 0.005


In [86]:
X_train, X_test, y_train, y_test = train_test_split(
    dfb["text_pp"], dfb["Author"] , test_size = 0.2, random_state = 2) 

w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size = 50,
                                   window = 10,
                                   min_count = 1)

words = set(w2v_model.wv.index_to_key)

X_train_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                         if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                         for ls in X_train])

X_test_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                         if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                         for ls in X_test])

# define possible hyperparameters to tune
rf_params = {"criterion": ["gini", "entropy", "log_loss"],
             "max_features": ["sqrt", "log2", None],
             "n_estimators": [10, 20, 50, 100],
           "max_depth": range(1, 20)}

rf_tuned = RandomizedSearchCV(RandomForestClassifier(), rf_params, n_iter = 10, cv = skf,
                               random_state = 2, verbose = 1)

rf_tuned.fit(X_train_vect, y_train)

y_pred = rf_tuned.predict(X_test_vect)

print("Test set classification accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Test set classification accuracy: 93.25 %


In [89]:
print(rf_tuned.best_score_)
rf_tuned.best_params_

0.9135496308835285


{'n_estimators': 20,
 'max_features': 'sqrt',
 'max_depth': 15,
 'criterion': 'log_loss'}

In [97]:
# try to predict category?

X_train, X_test, y_train, y_test = train_test_split(
    dfa["text_pp"], dfa["Type"] , test_size = 0.2, random_state = 2) 

w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size = 50,
                                   window = 10,
                                   min_count = 1)

words = set(w2v_model.wv.index_to_key)

X_train_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                         if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                         for ls in X_train])

X_test_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in words], axis=0) 
                         if any(i in words for i in ls) else np.zeros(w2v_model.vector_size)
                         for ls in X_test])

# define possible hyperparameters to tune
rf_params = {"criterion": ["gini", "entropy", "log_loss"],
             "max_features": ["sqrt", "log2", None],
             "n_estimators": [10, 20, 50, 100],
           "max_depth": range(1, 20)}

rf_tuned = RandomizedSearchCV(RandomForestClassifier(), rf_params, n_iter = 10, cv = skf,
                               random_state = 2, verbose = 1)

rf_tuned.fit(X_train_vect, y_train)

y_pred = rf_tuned.predict(X_test_vect)

print("Test set classification accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Test set classification accuracy: 98.31 %
