In [43]:
#import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re


#load datasets
train_df = pd.read_csv('propaganda_train.tsv',sep = '\t')
test_df = pd.read_csv('propaganda_val.tsv',sep = '\t')
train_df.head()

Unnamed: 0,label,tagged_in_context
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed."
1,not_propaganda,This declassification effort <BOS> won’t make ...
2,flag_waving,The Obama administration misled the <BOS> Amer...
3,not_propaganda,“It looks like we’re capturing the demise of t...
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>"


In [44]:
test_df.head()

Unnamed: 0,label,tagged_in_context
0,not_propaganda,"On average, between 300 and 600 infections are..."
1,causal_oversimplification,Mostly because <BOS> the country would not las...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.
4,repetition,It must be exacted from him directly in order ...


In [45]:
#make a list of the propaganda
propaganda = ['flag_waving','loaded_language','doubt','name_calling,labeling','appeal_to_fear_prejudice','repetition', 'causal_oversimplification','exaggeration,minimisation']
#replace the list items in the column with propaganda
test_df['label'] = test_df['label'].apply(lambda x: 'propaganda' if x in propaganda else x)
test_df.head()

Unnamed: 0,label,tagged_in_context
0,not_propaganda,"On average, between 300 and 600 infections are..."
1,propaganda,Mostly because <BOS> the country would not las...
2,propaganda,Lyndon Johnson <BOS> gets Earl Warren and Sen....
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.
4,propaganda,It must be exacted from him directly in order ...


In [46]:
#remove <BOS> and  from the dataframe
def remove(word):
    return word.replace('<BOS>','').replace('<EOS>','')
test_df['tagged_in_context2'] = test_df['tagged_in_context'].apply(remove)
test_df.head()

Unnamed: 0,label,tagged_in_context,tagged_in_context2
0,not_propaganda,"On average, between 300 and 600 infections are...","On average, between 300 and 600 infections are..."
1,propaganda,Mostly because <BOS> the country would not las...,Mostly because the country would not last lon...
2,propaganda,Lyndon Johnson <BOS> gets Earl Warren and Sen....,Lyndon Johnson gets Earl Warren and Sen. Rich...
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.,You may opt out at anytime.
4,propaganda,It must be exacted from him directly in order ...,It must be exacted from him directly in order ...


In [47]:
#encoding
test_df['label'] = test_df['label'].astype('category')
test_df['label'] = test_df['label'].cat.codes
test_df.head()

Unnamed: 0,label,tagged_in_context,tagged_in_context2
0,0,"On average, between 300 and 600 infections are...","On average, between 300 and 600 infections are..."
1,1,Mostly because <BOS> the country would not las...,Mostly because the country would not last lon...
2,1,Lyndon Johnson <BOS> gets Earl Warren and Sen....,Lyndon Johnson gets Earl Warren and Sen. Rich...
3,0,<BOS> You <EOS> may opt out at anytime.,You may opt out at anytime.
4,1,It must be exacted from him directly in order ...,It must be exacted from him directly in order ...


In [48]:
train_df['label'].unique() 

array(['not_propaganda', 'flag_waving', 'loaded_language', 'doubt',
       'name_calling,labeling', 'appeal_to_fear_prejudice', 'repetition',
       'causal_oversimplification', 'exaggeration,minimisation'],
      dtype=object)

In [49]:
#make a list of the propaganda
propaganda = ['flag_waving','loaded_language','doubt','name_calling,labeling','appeal_to_fear_prejudice','repetition', 'causal_oversimplification','exaggeration,minimisation']
#replace the list items in the column with propaganda
train_df['label'] = train_df['label'].apply(lambda x: 'propaganda' if x in propaganda else x)
train_df.head()

Unnamed: 0,label,tagged_in_context
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed."
1,not_propaganda,This declassification effort <BOS> won’t make ...
2,propaganda,The Obama administration misled the <BOS> Amer...
3,not_propaganda,“It looks like we’re capturing the demise of t...
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>"


In [50]:
#remove <BOS> and  from the dataframe
def remove(word):
    return word.replace('<BOS>','').replace('<EOS>','')
train_df['tagged_in_context2'] = train_df['tagged_in_context'].apply(remove)
train_df.head()

Unnamed: 0,label,tagged_in_context,tagged_in_context2
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed.","No, he will not be confirmed."
1,not_propaganda,This declassification effort <BOS> won’t make ...,This declassification effort won’t make thing...
2,propaganda,The Obama administration misled the <BOS> Amer...,The Obama administration misled the American ...
3,not_propaganda,“It looks like we’re capturing the demise of t...,“It looks like we’re capturing the demise of t...
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>","Location: Westerville, Ohio"


In [51]:
#encoding
train_df['label'] = train_df['label'].astype('category')
train_df['label'] = train_df['label'].cat.codes
train_df.head()

Unnamed: 0,label,tagged_in_context,tagged_in_context2
0,0,"No, <BOS> he <EOS> will not be confirmed.","No, he will not be confirmed."
1,0,This declassification effort <BOS> won’t make ...,This declassification effort won’t make thing...
2,1,The Obama administration misled the <BOS> Amer...,The Obama administration misled the American ...
3,0,“It looks like we’re capturing the demise of t...,“It looks like we’re capturing the demise of t...
4,0,"<BOS> Location: Westerville, Ohio <EOS>","Location: Westerville, Ohio"


In [52]:
#removing special characters
import nltk
nltk.download('stopwords')

ps = PorterStemmer()
corpus = []
for i in range(0,len(train_df)):
    review = re.sub('[^a-zA-Z]',' ',train_df['tagged_in_context2'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [53]:
#removing special characters
ps = PorterStemmer()
corpus1 = []
for i in range(0,len(test_df)):
    review = re.sub('[^a-zA-Z]',' ',test_df['tagged_in_context2'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus1.append(review)

In [54]:
#apply countvectorizer
#create bag of words
count_v = CountVectorizer(max_features=1000,ngram_range=(1,3))
x_train = count_v.fit_transform(corpus).toarray()
y_train = train_df['label']

x_test = count_v.fit_transform(corpus1).toarray()
y_test = test_df['label']

In [55]:
count_v.get_feature_names_out()[:20]

array(['abandon', 'absolut', 'abus', 'accept', 'access', 'access freedom',
       'access freedom outpost', 'accord', 'account', 'accus',
       'acknowledg', 'act', 'action', 'activ', 'actual', 'ad', 'adam',
       'address', 'administr', 'admit'], dtype=object)

In [56]:
#store features in a dataframe
count_df = pd.DataFrame(x_train,columns=count_v.get_feature_names_out())

In [57]:
#apply TfidfVectorizer 
tfidf_v = TfidfVectorizer(max_features=5000,ngram_range=(1,3))
x_train = tfidf_v.fit_transform(corpus).toarray()
x_test = tfidf_v.fit_transform(corpus1).toarray()
tfidf_v.get_feature_names_out()[:20]

array(['abandon', 'abid', 'abl', 'absolut', 'abus', 'academ',
       'academ council', 'accept', 'access', 'access freedom',
       'access freedom outpost', 'accord', 'account', 'accus',
       'accus kavanaugh', 'acknowledg', 'across', 'act', 'act pieti',
       'act pieti imposit'], dtype=object)

In [58]:
#Using MultinomialNB Algorithm for classification
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import itertools

m_classifier = MultinomialNB()
m_classifier.fit(x_train,y_train)
y_pred = m_classifier.predict(x_test)
score = metrics.accuracy_score(y_test,y_pred)
report = metrics.classification_report(y_test,y_pred)
print(f"accuracy: %0.3f %{score}")
report

accuracy: %0.3f %0.5155172413793103


'              precision    recall  f1-score   support\n\n           0       0.53      0.52      0.53       301\n           1       0.50      0.51      0.50       279\n\n    accuracy                           0.52       580\n   macro avg       0.52      0.52      0.52       580\nweighted avg       0.52      0.52      0.52       580\n'

In [59]:
#Multinomial Classifier with Hyperparameter
m_classifier = MultinomialNB(alpha=0.1)
previous_score = 0
for alpha in np.arange(0,1,0.1):
    mid_classifier = MultinomialNB(alpha=alpha)  # Fixing the error by passing alpha as a keyword argument
    mid_classifier.fit(x_train,y_train)
    y_pred = mid_classifier.predict(x_test)
    score = metrics.accuracy_score(y_test,y_pred)
    if score>previous_score:
        m_classifier = mid_classifier
    print("Alpha: {}, score: {}".format(alpha,score))  # Fixing the format method

Alpha: 0.0, score: 0.5189655172413793
Alpha: 0.1, score: 0.5206896551724138
Alpha: 0.2, score: 0.5189655172413793
Alpha: 0.30000000000000004, score: 0.5017241379310344
Alpha: 0.4, score: 0.5137931034482759
Alpha: 0.5, score: 0.5275862068965518
Alpha: 0.6000000000000001, score: 0.5258620689655172
Alpha: 0.7000000000000001, score: 0.5189655172413793
Alpha: 0.8, score: 0.5120689655172413
Alpha: 0.9, score: 0.5155172413793103


In [60]:
#Using a MLPClassifier
from sklearn.neural_network import MLPClassifier

mlp_c = MLPClassifier(hidden_layer_sizes=(100,),max_iter=500, random_state=42)
mlp_c.fit(x_train,y_train)
y_pred = mlp_c.predict(x_test)
score = metrics.accuracy_score(y_test,y_pred)
report = metrics.classification_report(y_test,y_pred)
print(f"accuracy: %0.3f %{score}")
report

accuracy: %0.3f %0.5327586206896552


'              precision    recall  f1-score   support\n\n           0       0.56      0.48      0.51       301\n           1       0.51      0.59      0.55       279\n\n    accuracy                           0.53       580\n   macro avg       0.54      0.54      0.53       580\nweighted avg       0.54      0.53      0.53       580\n'