In [1]:
import json
import numpy as np
import pandas as pd
import torch
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report

In [21]:
# Read all the data sets

set1_human = pd.read_json('dataset/set1_human.json')
set1_machine = pd.read_json('dataset/set1_machine.json')
set2_human = pd.read_json('dataset/set2_human.json')
set2_machine = pd.read_json('dataset/set2_machine.json')
set_test = pd.read_json('dataset/test.json')

# Add in label for all data sets
set1_human = set1_human.assign(label=1)
set2_human = set2_human.assign(label=1)
set1_machine = set1_machine.assign(label=0)
set2_machine = set2_machine.assign(label=0)

In [22]:
# Separate and concatenate into 2 sets, set 1 and set 2
set1 = pd.concat([set1_human[:3500], set1_machine])
set2 = pd.concat([set2_human, set2_machine])

# Split the labels and the data for training
x_set1 = pd.DataFrame(set1[['txt', 'prompt']])
y_set1 = pd.DataFrame(set1[['label']])
x_set2 = pd.DataFrame(set2[['txt', 'prompt']])
y_set2 = pd.DataFrame(set2[['label']])

In [23]:
x_set1['prompt_len'] = x_set1['prompt'].apply(lambda x: len(x))
x_set1['txt_len'] = x_set1['txt'].apply(lambda x: len(x))

x_set2['prompt_len'] = x_set2['prompt'].apply(lambda x: len(x))
x_set2['txt_len'] = x_set2['txt'].apply(lambda x: len(x))

In [24]:
#x_combined_set = pd.concat([x_set1, x_set2])

In [25]:
#y_combined_set = pd.concat([y_set1, y_set2])

In [26]:
x_combined_set = x_set1

In [27]:
y_combined_set = y_set1

In [28]:
y_combined_set

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
3495,0
3496,0
3497,0
3498,0


In [29]:
vocab = x_combined_set['txt'].tolist() + x_combined_set['prompt'].tolist()

In [30]:
print(vocab[:10])

[[1479, 3457, 2277, 1827, 1493, 2582, 3723, 1479, 4878, 1494, 17, 1492, 1575, 1591, 4412, 2650, 1478, 1479, 2164, 2098, 1479, 1740, 4700, 1623, 1490, 1656, 1523, 2779, 1570, 17, 1518, 2918, 1479, 3602, 2341, 1671, 2915, 2135, 15, 1520, 1497, 1816, 17, 1518, 76, 3206, 2726, 1623, 15, 2230, 2058, 17, 1520, 1497, 4156, 1558, 2868, 15, 2252, 1972, 15, 1478, 1545, 4203, 1738, 1560, 1543, 15, 1556, 3532, 71, 1624, 1507, 1478, 1479, 2413, 2105, 17, 1479, 1556, 85, 2339, 1486, 1562, 1647, 4421, 2209, 1547, 1557, 85, 1490, 1479, 2105, 2281, 80, 1921, 1479, 2358, 15, 3146, 1488, 1479, 4878, 1494, 10, 86, 1485, 1608, 1568, 1638, 1490, 1479, 1799, 17, 1479, 3602, 2169, 1870, 4788, 1478, 1777, 1724, 1534, 1561, 1524, 1498, 1479, 3962, 17, 1533, 2630, 2198, 1479, 1740, 1575, 1591, 2208, 1492, 2726, 17, 1559, 1656, 3015, 15, 2130, 1665, 1885, 1493, 2032, 1477, 2170, 15, 1827, 4234, 1734, 2486, 1478, 3279, 3534, 1500, 1777, 2086, 1507, 1707, 1728, 17, 2246, 2932, 2130, 1500, 1492, 90, 1954, 1493, 1479

In [31]:
from gensim.models import Word2Vec


model = Word2Vec(sentences=vocab, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [32]:
w2v_model =  gensim.models.Word2Vec.load("word2vec.model").wv

x_prompt = x_combined_set['prompt']

x_prompt_vec = []
for sentence in x_prompt:
    sentence_vec = []
    
    for word in sentence:
        sentence_vec.append(w2v_model[word])
    x_prompt_vec.append(np.mean(sentence_vec, axis=0))
x_prompt_vec = np.array(x_prompt_vec)

In [33]:
x_text = x_combined_set['txt']

x_text_vec = []
for sentence in x_text:
    sentence_vec = []
    
    for word in sentence[:256]:
        sentence_vec.append(w2v_model[word])
    x_text_vec.append(np.mean(sentence_vec, axis=0))
x_text_vec = np.array(x_prompt_vec)

In [34]:
prompt_len = np.expand_dims(x_combined_set['prompt_len'].values , axis=1)
txt_len = np.expand_dims(x_combined_set['txt_len'].values , axis=1)

In [35]:
x_set = np.concatenate((x_text_vec, x_prompt_vec, prompt_len, txt_len), axis=1)

In [36]:
x_train, x_test, y_train, y_test= train_test_split(x_set, y_combined_set, test_size=0.1, stratify=y_combined_set)

In [37]:
clf = LogisticRegression()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87       370
           1       0.84      0.89      0.86       330

    accuracy                           0.87       700
   macro avg       0.87      0.87      0.87       700
weighted avg       0.87      0.87      0.87       700



  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
import lightgbm as lgb

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 10,
    'learning_rate': 0.001,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'random_state': 42
}

lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred, y_test))

  return f(*args, **kwargs)


You can set `force_col_wise=true` to remove the overhead.
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       370
           1       0.84      0.89      0.86       330

    accuracy                           0.87       700
   macro avg       0.87      0.87      0.87       700
weighted avg       0.87      0.87      0.87       700

