In [71]:
import functools
import json

import numpy as np
import pandas as pd
import spacy
# Load the spaCy "en_core_web_sm" pipeline once at module level; preprocess()
# calls this shared `nlp` object for every text it handles.
nlp = spacy.load("en_core_web_sm")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [72]:
# word_tokenize (called in preprocess_text) needs the Punkt tokenizer models.
# They were never downloaded here, so a fresh environment failed with
# LookupError. Recent NLTK releases look the models up as 'punkt_tab', older
# ones as 'punkt'; both are requested. An id the installed NLTK version does
# not know should only make download() report an error and return False.
nltk.download('punkt')
nltk.download('punkt_tab')
# Resources for the stop-word filter and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /home/luca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/luca/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [73]:
# Read the training annotations: a JSON Lines file, one JSON object per line.
# - The encoding is pinned to UTF-8 so the load does not depend on the
#   platform's locale default.
# - Blank lines (e.g. a trailing newline at the end of the file) are skipped
#   instead of crashing json.loads.
# - Lines are parsed while streaming, so the raw text is not held in a second list.
with open('memes/defaults/annotations/train.jsonl', 'r', encoding='utf-8') as json_file:
    records = [json.loads(line) for line in json_file if line.strip()]

# One row per annotation record; columns come from the JSON keys.
data = pd.DataFrame(records)
data

Unnamed: 0,id,image,labels,text
0,covid_memes_18,covid_memes_18.png,"[somewhat harmful, individual]",Bernie or Elizabeth?\nBe informed.Compare them...
1,covid_memes_19,covid_memes_19.png,"[somewhat harmful, organization]",Extending the\nBrexit deadline until\nOctober ...
2,covid_memes_252,covid_memes_252.png,[not harmful],kwai\ngkwa 0964\n#nnevvy\napplause to Thais fr...
3,covid_memes_255,covid_memes_255.png,[not harmful],"So, I order this\nfoce mask to\nprotect ogains..."
4,covid_memes_20,covid_memes_20.png,"[somewhat harmful, individual]",best candidate for\nJA\n2020\njoe biden\nKamal...
...,...,...,...,...
3008,covid_memes_5417,covid_memes_5417.png,[not harmful],Jim Halpert\n@JimHalpert\neverybody: 2020 is f...
3009,covid_memes_5418,covid_memes_5418.png,[not harmful],litquidity\nelihcapital\nyofollewine\n*covid 1...
3010,covid_memes_5419,covid_memes_5419.png,[not harmful],meta\nMe sending my dog out for supplies since...
3011,covid_memes_5420,covid_memes_5420.png,[not harmful],People born in March/April in the\nboveteojoe ...


In [74]:
def preprocess(raw_text):
    """Run ``raw_text`` through the spaCy pipeline and return its cleaned lemmas.

    Tokens that spaCy flags as whitespace, stop words, punctuation,
    number-like or URL-like are discarded. Every remaining token contributes
    its lemma, lower-cased.

    Parameters
    ----------
    raw_text : str
        Text to process.

    Returns
    -------
    list of str
        Lower-cased lemmas, in document order.
    """
    def is_noise(tok):
        # True when any of the five exclusion flags is set on the token.
        return (tok.is_space or tok.is_stop or tok.is_punct
                or tok.like_num or tok.like_url)

    return [tok.lemma_.lower() for tok in nlp(raw_text) if not is_noise(tok)]

@functools.lru_cache(maxsize=None)
def _nltk_resources():
    """Return the English stop-word set and a lemmatizer, built on first use only."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


# Function for text preprocessing
def preprocess_text(text):
    """Normalise ``text`` with NLTK and return it as one space-joined string.

    Steps, in order: lower-case the text, tokenise it with ``word_tokenize``,
    keep only alphanumeric tokens that are not English stop words, lemmatise
    the survivors with ``WordNetLemmatizer`` and join them with single spaces.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The processed tokens joined by spaces (empty string if none survive).
    """
    # The stop-word set and the lemmatizer do not depend on ``text``. They are
    # fetched from a cached helper instead of being rebuilt on every call
    # (this function is applied once per DataFrame row).
    stop_words, lemmatizer = _nltk_resources()

    # Lowercasing + tokenization
    tokens = word_tokenize(text.lower())

    # Removing stopwords and non-alphanumeric tokens
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    # Lemmatization
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

In [75]:
# Derive the two cleaned-text columns from the raw "text" column:
#   processed_text     - spaCy lemmas from preprocess(), joined with spaces
#   processed_text_alt - NLTK output of preprocess_text(), already a string
data["processed_text"] = data["text"].apply(lambda raw: " ".join(preprocess(raw)))
data["processed_text_alt"] = data["text"].apply(preprocess_text)

data


Unnamed: 0,id,image,labels,text,processed_text,processed_text_alt
0,covid_memes_18,covid_memes_18.png,"[somewhat harmful, individual]",Bernie or Elizabeth?\nBe informed.Compare them...,bernie elizabeth inform compare issue matter i...,bernie elizabeth issue matter issue make danke...
1,covid_memes_19,covid_memes_19.png,"[somewhat harmful, organization]",Extending the\nBrexit deadline until\nOctober ...,extend brexit deadline october order ensure de...,extending brexit deadline october 31st order e...
2,covid_memes_252,covid_memes_252.png,[not harmful],kwai\ngkwa 0964\n#nnevvy\napplause to Thais fr...,kwai gkwa nnevvy applause thais hong kong thai...,kwai gkwa 0964 nnevvy applause thai hong kong ...
3,covid_memes_255,covid_memes_255.png,[not harmful],"So, I order this\nfoce mask to\nprotect ogains...",order foce mask protect ogainst fhe corond vir...,order foce mask protect ogainst fhe corond vir...
4,covid_memes_20,covid_memes_20.png,"[somewhat harmful, individual]",best candidate for\nJA\n2020\njoe biden\nKamal...,good candidate ja joe biden kamala harris bern...,best candidate ja 2020 joe biden kamala harris...
...,...,...,...,...,...,...
3008,covid_memes_5417,covid_memes_5417.png,[not harmful],Jim Halpert\n@JimHalpert\neverybody: 2020 is f...,jim halpert @jimhalpert everybody finally go y...,jim halpert jimhalpert everybody 2020 finally ...
3009,covid_memes_5418,covid_memes_5418.png,[not harmful],litquidity\nelihcapital\nyofollewine\n*covid 1...,litquidity elihcapital yofollewine covid sympt...,litquidity elihcapital yofollewine covid 19 sy...
3010,covid_memes_5419,covid_memes_5419.png,[not harmful],meta\nMe sending my dog out for supplies since...,meta send dog supply contract covid-19 coc ma ...,meta sending dog supply since contract coc 100...
3011,covid_memes_5420,covid_memes_5420.png,[not harmful],People born in March/April in the\nboveteojoe ...,people bear march april boveteojoe folee come ...,people born boveteojoe foleing coming week soy...


In [76]:
# Binary target: 1 when 'very harmful' is among a row's labels, otherwise 0.
data['binary_labels'] = (
    data['labels']
    .apply(lambda row_labels: 'very harmful' in row_labels)
    .astype(int)
)

In [77]:
# Persist the enriched frame as CSV in the working directory. The row index is
# written as the first (unnamed) column, which is the pandas default.
data.to_csv(path_or_buf="data_preprocessed.csv", index=True)