In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import nltk
import re
from nltk.corpus import stopwords
import itertools


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#get the dataset from Hello-SimpleAI/HC3
dataset = load_dataset("Hello-SimpleAI/HC3", "all")
df = pd.DataFrame(dataset["train"])

#take a look at the dataset
df.head()


Unnamed: 0,id,question,human_answers,chatgpt_answers,source
0,0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,reddit_eli5
1,1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,reddit_eli5
2,2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,reddit_eli5
3,3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,reddit_eli5
4,4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,reddit_eli5


In [3]:
#Combine the human answers and chatgpt answers into one column
df = df.melt(value_vars=["human_answers", "chatgpt_answers"], var_name="original_column", value_name="text")

#classify the human answers as 1 and the chatgpt answers as 0
df['classification'] = df['original_column'].apply(lambda x: 1 if x == 'human_answers' else 0)

#drop the original column
df = df.drop(columns=['original_column']).reset_index(drop=True)

#take a look at the dataset
df.head()



Unnamed: 0,text,classification
0,"[Basically there are many categories of "" Best...",1
1,[salt is good for not dying in car crashes and...,1
2,[The way it works is that old TV stations got ...,1
3,[You ca n't just go around assassinating the l...,1
4,[Wanting to kill the shit out of Germans drive...,1


In [4]:
#get the summary of the dataset
df.shape

(48644, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48644 entries, 0 to 48643
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            48644 non-null  object
 1   classification  48644 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 760.2+ KB


In [6]:

df['text'] = df['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

print(df.head())

df['word_count'] = df['text'].apply(lambda x: len(str(x)) if isinstance(x, str) else 0)

print(df.head())

total_words = df['word_count'].sum()

print(f"Total number of words in the text column: {total_words}")




                                                text  classification
0  Basically there are many categories of " Best ...               1
1  salt is good for not dying in car crashes and ...               1
2  The way it works is that old TV stations got a...               1
3  You ca n't just go around assassinating the le...               1
4  Wanting to kill the shit out of Germans drives...               1
                                                text  classification  \
0  Basically there are many categories of " Best ...               1   
1  salt is good for not dying in car crashes and ...               1   
2  The way it works is that old TV stations got a...               1   
3  You ca n't just go around assassinating the le...               1   
4  Wanting to kill the shit out of Germans drives...               1   

   word_count  
0        1225  
1        2080  
2        2458  
3        1030  
4        1507  
Total number of words in the text column: 67074390


In [7]:
#find all the non-words characters and convert them to space, also convert all the text to lower case
df['text'] = df['text'].apply(lambda x: re.sub(r'\W+', ' ', x.lower()))
print(df.head())

                                                text  classification  \
0  basically there are many categories of best se...               1   
1  salt is good for not dying in car crashes and ...               1   
2  the way it works is that old tv stations got a...               1   
3  you ca n t just go around assassinating the le...               1   
4  wanting to kill the shit out of germans drives...               1   

   word_count  
0        1225  
1        2080  
2        2458  
3        1030  
4        1507  


In [8]:

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carollu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
all_stopwords = set(stopwords.words('english')) # set the language 

In [10]:
#remove the stopwords from the text
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in all_stopwords]))

df['word_count'] = df['text'].apply(lambda x: len(str(x)) if isinstance(x, str) else 0)

total_words = df['word_count'].sum()

print(f"Total number of words in the text column: {total_words}")

df.head()

Total number of words in the text column: 42607048


Unnamed: 0,text,classification,word_count
0,basically many categories best seller replace ...,1,761
1,salt good dying car crashes car crashes worse ...,1,1309
2,way works old tv stations got certain amount b...,1,1713
3,ca n go around assassinating leaders countries...,1,692
4,wanting kill shit germans drives innovation fr...,1,979


In [11]:
X = df['text']
y = df['classification']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)