In [1]:
import pandas as pd
import json
import spacy
import numpy as np
# Load the spaCy pipeline "en_core_web_sm" once, at module level; `preprocess` below
# calls it through this global name.
nlp = spacy.load("en_core_web_sm")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def preprocess(raw_text):
    """Run ``raw_text`` through the spaCy pipeline ``nlp`` and return lowercase lemmas.

    Tokens flagged as whitespace, stop word, punctuation, number-like or
    URL-like are left out.

    Args:
        raw_text: The string passed to ``nlp``.

    Returns:
        A list with the lowercased lemma of every remaining token, in
        document order.
    """
    return [
        word.lemma_.lower()
        for word in nlp(raw_text)
        if not (
            word.is_space
            or word.is_stop
            or word.is_punct
            or word.like_num
            or word.like_url
        )
    ]

# The stop-word set and the lemmatizer are the same for every document, so they are
# built once on first use and then reused instead of being recreated on each call.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean ``text`` with NLTK and return it as one space-joined string.

    Steps: lowercase the text, tokenize it with ``word_tokenize``, drop English
    stop words and every token that is not purely alphanumeric
    (``str.isalnum`` is False for tokens containing e.g. a hyphen), then
    lemmatize what is left with ``WordNetLemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filters.
    """
    stop_words, lemmatizer = _nltk_resources()

    # Lowercase before tokenizing so filtering and lemmatizing see lowercase tokens.
    tokens = word_tokenize(text.lower())

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Train

In [3]:
# Read the training annotations: a JSON Lines file holding one JSON object per line.
# The encoding is pinned to UTF-8 rather than left to the platform default, which
# open() takes from the locale and which is not UTF-8 on every system.
with open('memes/defaults/annotations/train.jsonl', 'r', encoding='utf-8') as json_file:
    # Blank lines are skipped because json.loads raises on them.
    records = [json.loads(line) for line in json_file if line.strip()]

# One row per annotation record; the columns come from the JSON keys.
data = pd.DataFrame(records)
data

Unnamed: 0,id,image,labels,text
0,covid_memes_18,covid_memes_18.png,"[somewhat harmful, individual]",Bernie or Elizabeth?\nBe informed.Compare them...
1,covid_memes_19,covid_memes_19.png,"[somewhat harmful, organization]",Extending the\nBrexit deadline until\nOctober ...
2,covid_memes_252,covid_memes_252.png,[not harmful],kwai\ngkwa 0964\n#nnevvy\napplause to Thais fr...
3,covid_memes_255,covid_memes_255.png,[not harmful],"So, I order this\nfoce mask to\nprotect ogains..."
4,covid_memes_20,covid_memes_20.png,"[somewhat harmful, individual]",best candidate for\nJA\n2020\njoe biden\nKamal...
...,...,...,...,...
3008,covid_memes_5417,covid_memes_5417.png,[not harmful],Jim Halpert\n@JimHalpert\neverybody: 2020 is f...
3009,covid_memes_5418,covid_memes_5418.png,[not harmful],litquidity\nelihcapital\nyofollewine\n*covid 1...
3010,covid_memes_5419,covid_memes_5419.png,[not harmful],meta\nMe sending my dog out for supplies since...
3011,covid_memes_5420,covid_memes_5420.png,[not harmful],People born in March/April in the\nboveteojoe ...


In [4]:
# List every distinct label combination once, keeping the row of its first occurrence
unique_labels = data[["labels"]].drop_duplicates(keep="first")
unique_labels

Unnamed: 0,labels
0,"[somewhat harmful, individual]"
1,"[somewhat harmful, organization]"
2,[not harmful]
5,"[very harmful, society]"
16,"[somewhat harmful, community]"
68,"[very harmful, community]"
84,"[somewhat harmful, society]"
99,"[very harmful, individual]"
325,"[very harmful, organization]"


In [5]:
# spaCy variant: `preprocess` returns a token list, joined here into one string per row
spacy_tokens = data["text"].apply(preprocess)
data["processed_text"] = spacy_tokens.map(lambda tokens: " ".join(tokens))

# NLTK variant: `preprocess_text` already returns a space-joined string
data["processed_text_alt"] = data["text"].apply(preprocess_text)

data


Unnamed: 0,id,image,labels,text,processed_text,processed_text_alt
0,covid_memes_18,covid_memes_18.png,"[somewhat harmful, individual]",Bernie or Elizabeth?\nBe informed.Compare them...,bernie elizabeth inform compare issue matter i...,bernie elizabeth issue matter issue make danke...
1,covid_memes_19,covid_memes_19.png,"[somewhat harmful, organization]",Extending the\nBrexit deadline until\nOctober ...,extend brexit deadline october order ensure de...,extending brexit deadline october 31st order e...
2,covid_memes_252,covid_memes_252.png,[not harmful],kwai\ngkwa 0964\n#nnevvy\napplause to Thais fr...,kwai gkwa nnevvy applause thais hong kong thai...,kwai gkwa 0964 nnevvy applause thai hong kong ...
3,covid_memes_255,covid_memes_255.png,[not harmful],"So, I order this\nfoce mask to\nprotect ogains...",order foce mask protect ogainst fhe corond vir...,order foce mask protect ogainst fhe corond vir...
4,covid_memes_20,covid_memes_20.png,"[somewhat harmful, individual]",best candidate for\nJA\n2020\njoe biden\nKamal...,good candidate ja joe biden kamala harris bern...,best candidate ja 2020 joe biden kamala harris...
...,...,...,...,...,...,...
3008,covid_memes_5417,covid_memes_5417.png,[not harmful],Jim Halpert\n@JimHalpert\neverybody: 2020 is f...,jim halpert @jimhalpert everybody finally go y...,jim halpert jimhalpert everybody 2020 finally ...
3009,covid_memes_5418,covid_memes_5418.png,[not harmful],litquidity\nelihcapital\nyofollewine\n*covid 1...,litquidity elihcapital yofollewine covid sympt...,litquidity elihcapital yofollewine covid 19 sy...
3010,covid_memes_5419,covid_memes_5419.png,[not harmful],meta\nMe sending my dog out for supplies since...,meta send dog supply contract covid-19 coc ma ...,meta sending dog supply since contract coc 100...
3011,covid_memes_5420,covid_memes_5420.png,[not harmful],People born in March/April in the\nboveteojoe ...,people bear march april boveteojoe folee come ...,people born boveteojoe foleing coming week soy...


In [6]:
# Binary target: 0 when the row's label list contains 'not harmful', 1 otherwise
is_not_harmful = data['labels'].apply(lambda label_list: 'not harmful' in label_list)
data['binary_labels'] = np.where(is_not_harmful, 0, 1)

In [7]:
# Replace NaN entries in the text columns with a '.' placeholder (not an empty string).
# NOTE(review): fillna only touches NaN; an empty string returned by the preprocessing
# functions stays empty — confirm whether those rows should get the placeholder too.
for text_column in ('text', 'processed_text_alt', 'processed_text'):
    data[text_column] = data[text_column].fillna('.')

In [8]:
# Persist the preprocessed training split; the row index is written as the first column
data.to_csv(path_or_buf="data_train_preprocessed.csv", index=True)

# Test

In [9]:
# Read the test annotations: a JSON Lines file holding one JSON object per line.
# The encoding is pinned to UTF-8 rather than left to the platform default, which
# open() takes from the locale and which is not UTF-8 on every system.
with open('memes/defaults/annotations/test.jsonl', 'r', encoding='utf-8') as json_file:
    # Blank lines are skipped because json.loads raises on them.
    records = [json.loads(line) for line in json_file if line.strip()]

# One row per annotation record; the columns come from the JSON keys.
data = pd.DataFrame(records)
data

Unnamed: 0,id,image,labels,text
0,covid_memes_5425,covid_memes_5425.png,[not harmful],gwen\n@gwenervi\ndis gon be trump tomorrow aft...
1,covid_memes_5426,covid_memes_5426.png,[not harmful],Armani\n@historyofarmani\nBiden after hearing ...
2,covid_memes_5429,covid_memes_5429.png,[not harmful],MESSAGE FROM TRUMP TO\nCOVID-19\nLEAVE NOW OR ...
3,covid_memes_5430,covid_memes_5430.png,[not harmful],COVID-19 STARTED DURING HIS TERM\nSOIT SHOULD ...
4,covid_memes_5434,covid_memes_5434.png,[not harmful],TRUMPS RESPONSE TO COVID-19\nUMP\nTAYM\nINCLUD...
...,...,...,...,...
349,covid_memes_2041,covid_memes_2041.png,[not harmful],ONE DAY IN THE FUTURE\n
350,covid_memes_2049,covid_memes_2049.png,[not harmful],IndieWire\nIndieWire\n@quentquarantino\n@Indie...
351,covid_memes_2058,covid_memes_2058.png,[not harmful],450/4 steady batting still have\nanother 10 da...
352,covid_memes_2062,covid_memes_2062.png,[not harmful],1\n200\n20\nMemes on Baba Ramdev's anti Covid-...


In [10]:
# spaCy variant: `preprocess` returns a token list, joined here into one string per row
spacy_tokens = data["text"].apply(preprocess)
data["processed_text"] = spacy_tokens.map(lambda tokens: " ".join(tokens))

# NLTK variant: `preprocess_text` already returns a space-joined string
data["processed_text_alt"] = data["text"].apply(preprocess_text)
data


Unnamed: 0,id,image,labels,text,processed_text,processed_text_alt
0,covid_memes_5425,covid_memes_5425.png,[not harmful],gwen\n@gwenervi\ndis gon be trump tomorrow aft...,gwen @gwenervi dis gon trump tomorrow inject e...,gwen gwenervi dis gon trump tomorrow inject ex...
1,covid_memes_5426,covid_memes_5426.png,[not harmful],Armani\n@historyofarmani\nBiden after hearing ...,armani @historyofarmani biden hear trump get c...,armani historyofarmani biden hearing trump got...
2,covid_memes_5429,covid_memes_5429.png,[not harmful],MESSAGE FROM TRUMP TO\nCOVID-19\nLEAVE NOW OR ...,message trump covid-19 leave deport,message trump leave deport
3,covid_memes_5430,covid_memes_5430.png,[not harmful],COVID-19 STARTED DURING HIS TERM\nSOIT SHOULD ...,covid-19 start term soit know trump pandemic,started term soit known trump pandemic
4,covid_memes_5434,covid_memes_5434.png,[not harmful],TRUMPS RESPONSE TO COVID-19\nUMP\nTAYM\nINCLUD...,trumps response covid-19 ump taym includes bor...,trump response ump taym includes borrowing mon...
...,...,...,...,...,...,...
349,covid_memes_2041,covid_memes_2041.png,[not harmful],ONE DAY IN THE FUTURE\n,day future,one day future
350,covid_memes_2049,covid_memes_2049.png,[not harmful],IndieWire\nIndieWire\n@quentquarantino\n@Indie...,indiewire indiewire @quentquarantino @indiewir...,indiewire indiewire quentquarantino indiewire ...
351,covid_memes_2058,covid_memes_2058.png,[not harmful],450/4 steady batting still have\nanother 10 da...,steady batting day leave mrs bored reckon bowl...,steady batting still another 10 day left mr bo...
352,covid_memes_2062,covid_memes_2062.png,[not harmful],1\n200\n20\nMemes on Baba Ramdev's anti Covid-...,memes baba ramdev anti covid-19 drug coronil i...,1 200 20 meme baba ramdev anti drug take inter...


In [11]:
# Binary target: 0 when the row's label list contains 'not harmful', 1 otherwise
is_not_harmful = data['labels'].apply(lambda label_list: 'not harmful' in label_list)
data['binary_labels'] = np.where(is_not_harmful, 0, 1)

In [12]:
# Replace NaN entries in the text columns with a '.' placeholder (not an empty string).
# NOTE(review): fillna only touches NaN; an empty string returned by the preprocessing
# functions stays empty — confirm whether those rows should get the placeholder too.
for text_column in ('text', 'processed_text_alt', 'processed_text'):
    data[text_column] = data[text_column].fillna('.')

In [13]:
# Persist the preprocessed test split; the row index is written as the first column
data.to_csv(path_or_buf="data_test_preprocessed.csv", index=True)

# Val

In [14]:
# Read the validation annotations: a JSON Lines file holding one JSON object per line.
# The encoding is pinned to UTF-8 rather than left to the platform default, which
# open() takes from the locale and which is not UTF-8 on every system.
with open('memes/defaults/annotations/val.jsonl', 'r', encoding='utf-8') as json_file:
    # Blank lines are skipped because json.loads raises on them.
    records = [json.loads(line) for line in json_file if line.strip()]

# One row per annotation record; the columns come from the JSON keys.
data = pd.DataFrame(records)
data

Unnamed: 0,id,image,labels,text
0,covid_memes_2069,covid_memes_2069.png,[not harmful],Coronavirus\nVaccine\nInjection only\n5 ml Sto...
1,covid_memes_2075,covid_memes_2075.png,[not harmful],THE NEW\nINDIAN EXPRESS\nRussia has developed ...
2,covid_memes_2080,covid_memes_2080.png,[not harmful],Ssrfan @Ssrfan478780364 - 2h\nThe owners of Zo...
3,covid_memes_2108,covid_memes_2108.png,[not harmful],Alexa Simone thatssokrahe 14h\nMy coworker sto...
4,covid_memes_2117,covid_memes_2117.png,[not harmful],Mele Gdl\n*s the sonual tension between Plizer...
...,...,...,...,...
172,covid_memes_219,covid_memes_219.png,[not harmful],*New Swine flu like virus with\nhuman pandemic...
173,covid_memes_224,covid_memes_224.png,[not harmful],"CoronaVirus, 29\no Wuhan University\ne Lives i..."
174,covid_memes_225,covid_memes_225.png,[not harmful],You can have any virus\nyou want\nas long as i...
175,covid_memes_235,covid_memes_235.png,[not harmful],The Corona Virus\nwon't last long\nbecause it ...


In [15]:
# spaCy variant: `preprocess` returns a token list, joined here into one string per row
spacy_tokens = data["text"].apply(preprocess)
data["processed_text"] = spacy_tokens.map(lambda tokens: " ".join(tokens))

# NLTK variant: `preprocess_text` already returns a space-joined string
data["processed_text_alt"] = data["text"].apply(preprocess_text)
data


Unnamed: 0,id,image,labels,text,processed_text,processed_text_alt
0,covid_memes_2069,covid_memes_2069.png,[not harmful],Coronavirus\nVaccine\nInjection only\n5 ml Sto...,coronavirus vaccine injection ml store fr covi...,coronavirus vaccine injection 5 ml store fr va...
1,covid_memes_2075,covid_memes_2075.png,[not harmful],THE NEW\nINDIAN EXPRESS\nRussia has developed ...,new indian express russia develop coronaviru v...,new indian express russia developed coronaviru...
2,covid_memes_2080,covid_memes_2080.png,[not harmful],Ssrfan @Ssrfan478780364 - 2h\nThe owners of Zo...,ssrfan @ssrfan478780364 2h owner zoom read cov...,ssrfan ssrfan478780364 2h owner zoom reading m...
3,covid_memes_2108,covid_memes_2108.png,[not harmful],Alexa Simone thatssokrahe 14h\nMy coworker sto...,alexa simone thatssokrahe 14h coworker steal c...,alexa simone thatssokrahe 14h coworker stole c...
4,covid_memes_2117,covid_memes_2117.png,[not harmful],Mele Gdl\n*s the sonual tension between Plizer...,mele gdl s sonual tension plizer moderna prap,mele gdl sonual tension plizer moderna prap
...,...,...,...,...,...,...
172,covid_memes_219,covid_memes_219.png,[not harmful],*New Swine flu like virus with\nhuman pandemic...,new swine flu like virus human pandemic potent...,new swine flu like virus human pandemic potent...
173,covid_memes_224,covid_memes_224.png,[not harmful],"CoronaVirus, 29\no Wuhan University\ne Lives i...",coronavirus o wuhan university e live wuhan mi...,coronavirus 29 wuhan university e life wuhan l...
174,covid_memes_225,covid_memes_225.png,[not harmful],You can have any virus\nyou want\nas long as i...,virus want long corona memedroid,virus want long corona memedroid
175,covid_memes_235,covid_memes_235.png,[not harmful],The Corona Virus\nwon't last long\nbecause it ...,corona virus will long china,corona virus wo last long made china


In [16]:
# Binary target: 0 when the row's label list contains 'not harmful', 1 otherwise
is_not_harmful = data['labels'].apply(lambda label_list: 'not harmful' in label_list)
data['binary_labels'] = np.where(is_not_harmful, 0, 1)

In [17]:
# Replace NaN entries in the text columns with a '.' placeholder (not an empty string).
# NOTE(review): fillna only touches NaN; an empty string returned by the preprocessing
# functions stays empty — confirm whether those rows should get the placeholder too.
for text_column in ('text', 'processed_text_alt', 'processed_text'):
    data[text_column] = data[text_column].fillna('.')

In [18]:
# Persist the preprocessed validation split; the row index is written as the first column
data.to_csv(path_or_buf="data_val_preprocessed.csv", index=True)