In [None]:
import pandas as pd
import json
import spacy
import numpy as np
nlp = spacy.load("en_core_web_sm")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
def preprocess(raw_text):
    """Tokenize and lemmatize ``raw_text`` with the module-level spaCy pipeline ``nlp``.

    Tokens that spaCy flags as whitespace, stop word, punctuation,
    number-like or URL-like are dropped; every remaining token contributes
    its lower-cased lemma.

    Args:
        raw_text: Text to process. Any non-string value (for example ``None``
            or the ``NaN`` pandas uses for a missing value) is treated as
            empty text.

    Returns:
        list[str]: Lower-cased lemmas of the retained tokens, in document
        order. Empty when ``raw_text`` is not a string.
    """
    # Missing values are not valid pipeline input; map them to "no tokens"
    # so a single missing row cannot abort a whole ``Series.apply``.
    if not isinstance(raw_text, str):
        return []

    return [
        token.lemma_.lower()
        for token in nlp(raw_text)
        if not (token.is_space or token.is_stop or token.is_punct
                or token.like_num or token.like_url)
    ]

# Function for text preprocessing
def preprocess_text(text):
    """Normalize ``text`` with NLTK and return it as one space-joined string.

    Steps, in order: lower-case the text, tokenize it with ``word_tokenize``,
    keep only tokens that are alphanumeric and not English stop words, then
    lemmatize each remaining token with ``WordNetLemmatizer``.

    NOTE: relies on NLTK's tokenizer, stop-word and WordNet resources being
    installed (e.g. via ``nltk.download``); this function does not fetch them.

    Args:
        text: Text to process. Any non-string value (for example ``None`` or
            the ``NaN`` pandas uses for a missing value) is treated as empty
            text.

    Returns:
        str: The lemmatized tokens joined by single spaces; ``''`` when
        ``text`` is not a string or no token survives filtering.
    """
    # Missing values have no ``.lower()``; treat them as empty text instead
    # of letting one missing row abort a whole ``Series.apply``.
    if not isinstance(text, str):
        return ''

    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Lower-case -> tokenize -> drop stop words / non-alphanumerics -> lemmatize
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in english_stop_words
    ]

    return ' '.join(lemmas)

# Train

In [None]:
# Parse the JSON Lines annotations (one JSON object per line) into a DataFrame.
# - UTF-8 is pinned so parsing does not depend on the platform's default encoding.
# - Blank lines (e.g. a trailing newline) are skipped; json.loads would raise on them.
# `json` and `pd` come from the notebook's import cell.
with open('memes/defaults/annotations/train.jsonl', 'r', encoding='utf-8') as json_file:
    records = [json.loads(line) for line in json_file if line.strip()]

data = pd.DataFrame(records)
data.head()

In [None]:
# Replace missing text with '' *before* preprocessing: both helpers apply
# string-only operations, so a NaN/None row would otherwise raise here,
# before any later fillna step gets a chance to run.
texts = data["text"].fillna('')

# spaCy-based lemmas, joined back into one space-separated string per row.
data["processed_text"] = texts.apply(preprocess).apply(" ".join)
# NLTK-based alternative preprocessing of the same text.
data["processed_text_alt"] = texts.apply(preprocess_text)

data.head()


In [None]:
# Binary target: 0 when 'not harmful' is found in the row's labels value, 1 otherwise.
not_harmful_mask = data['labels'].apply(lambda labels: 'not harmful' in labels)
data['binary_labels'] = np.where(not_harmful_mask, 0, 1)

In [None]:
# Replace missing values with '' in the raw and both processed text columns.
for text_column in ('text', 'processed_text_alt', 'processed_text'):
    data[text_column] = data[text_column].fillna('')

In [None]:
# Write the preprocessed training frame to CSV in the working directory
# (pandas defaults: the DataFrame index is written as the first column).
train_csv_path = "data_train_preprocessed.csv"
data.to_csv(train_csv_path)

# Test

In [None]:
# Parse the JSON Lines annotations (one JSON object per line) into a DataFrame.
# - UTF-8 is pinned so parsing does not depend on the platform's default encoding.
# - Blank lines (e.g. a trailing newline) are skipped; json.loads would raise on them.
# `json` and `pd` come from the notebook's import cell.
with open('memes/defaults/annotations/test.jsonl', 'r', encoding='utf-8') as json_file:
    records = [json.loads(line) for line in json_file if line.strip()]

data = pd.DataFrame(records)
data.head()

In [None]:
# Replace missing text with '' *before* preprocessing: both helpers apply
# string-only operations, so a NaN/None row would otherwise raise here,
# before any later fillna step gets a chance to run.
texts = data["text"].fillna('')

# spaCy-based lemmas, joined back into one space-separated string per row.
data["processed_text"] = texts.apply(preprocess).apply(" ".join)
# NLTK-based alternative preprocessing of the same text.
data["processed_text_alt"] = texts.apply(preprocess_text)
data.head()


In [None]:
# Binary target: 0 when 'not harmful' is found in the row's labels value, 1 otherwise.
not_harmful_mask = data['labels'].apply(lambda labels: 'not harmful' in labels)
data['binary_labels'] = np.where(not_harmful_mask, 0, 1)

In [None]:
# Display the DataFrame as this cell's output for a visual check.
data

In [None]:
# Replace missing values with '' in the raw and both processed text columns.
for text_column in ('text', 'processed_text_alt', 'processed_text'):
    data[text_column] = data[text_column].fillna('')

In [None]:
# Write the preprocessed test frame to CSV in the working directory
# (pandas defaults: the DataFrame index is written as the first column).
test_csv_path = "data_test_preprocessed.csv"
data.to_csv(test_csv_path)