# Task 1

### 1. Bag-of-Words Classifier (Naive Bayes)

In [1]:
# Import necessary libraries
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Load the data
# Both files are tab-separated; the first is used for training, the second for evaluation.
train_data, test_data = (
    pd.read_csv(path, sep='\t')
    for path in ("propaganda_train.tsv", "propaganda_val.tsv")
)

#### Data Preprocessing

In [3]:
# Inspect the training frame before preprocessing: row count, columns, dtypes and non-null counts
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2414 entries, 0 to 2413
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              2414 non-null   object
 1   tagged_in_context  2414 non-null   object
dtypes: object(2)
memory usage: 37.8+ KB


In [4]:
# Get unique categories present in the training labels
train_data['label'].unique()

array(['not_propaganda', 'flag_waving', 'loaded_language', 'doubt',
       'name_calling,labeling', 'appeal_to_fear_prejudice', 'repetition',
       'causal_oversimplification', 'exaggeration,minimisation'],
      dtype=object)

In [5]:
# Binary task: keep "not_propaganda" as is and collapse every other
# (technique) label into the single class "propaganda".
# Note: this overwrites the `label` column of train_data in place.
train_data['label'] = train_data['label'].where(
    train_data['label'] == 'not_propaganda', 'propaganda'
)

train_data['label'].unique()

array(['not_propaganda', 'propaganda'], dtype=object)

In [6]:
# Testing data preprocessing: list the label categories present in the evaluation set
test_data['label'].unique()

array(['not_propaganda', 'causal_oversimplification',
       'appeal_to_fear_prejudice', 'repetition', 'name_calling,labeling',
       'loaded_language', 'flag_waving', 'doubt',
       'exaggeration,minimisation'], dtype=object)

In [7]:
# Same binary mapping as for the training labels: everything except
# "not_propaganda" becomes "propaganda".
# Note: this overwrites the `label` column of test_data in place.
test_data['label'] = test_data['label'].where(
    test_data['label'] == 'not_propaganda', 'propaganda'
)

test_data['label'].unique()

array(['not_propaganda', 'propaganda'], dtype=object)

In [8]:
# Inspect the evaluation frame: row count, columns, dtypes and non-null counts
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580 entries, 0 to 579
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              580 non-null    object
 1   tagged_in_context  580 non-null    object
dtypes: object(2)
memory usage: 9.2+ KB


In [9]:
def preprocess(text):
    """Extract and normalise the text enclosed by the <BOS>/<EOS> markers.

    Every span delimited by the two markers is captured (either marker
    order is accepted by the pattern), the captured pieces are joined with
    a single space, lower-cased, and stripped of every character that is
    neither a word character nor whitespace.

    Parameters
    ----------
    text : str
        A sentence containing ``<BOS>`` / ``<EOS>`` markers.

    Returns
    -------
    str
        The cleaned span text with surrounding whitespace removed, or
        ``''`` when no marker pair is found.
    """
    # `.*?` = any number of characters, non-greedy. A bare `.?` would allow at
    # most ONE character between the markers, so real spans would never match
    # and the function would return '' for them.
    pattern = re.compile(r'<BOS>(.*?)<EOS>|<EOS>(.*?)<BOS>', re.DOTALL)
    # findall yields one (group1, group2) tuple per match; the group of the
    # alternative that did not match is '', so keep whichever is non-empty.
    extracted_text = ' '.join(first or second for first, second in pattern.findall(text))
    # Conversion to lower case
    extracted_text = extracted_text.lower()
    # Remove punctuation (anything that is not a word character or whitespace)
    extracted_text = re.sub(r'[^\w\s]', '', extracted_text)
    return extracted_text.strip()



In [10]:
# Add a `cleaned` column: preprocess() applied to every training sentence.
# NOTE(review): the feature-extraction cells that follow read
# `tagged_in_context`, so this column is not consumed anywhere in the
# visible notebook - confirm whether it was meant to be the model input.
train_data['cleaned'] = train_data['tagged_in_context'].map(preprocess)

#### Feature Extraction

In [11]:
# Feature Extraction
# Targets: the (binarised) label column.
y_train = train_data['label']
# Bag-of-words counts with English stop words dropped; the vocabulary is
# learned from the training text only.
# NOTE(review): fitted on the raw `tagged_in_context` column, not on `cleaned`.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['tagged_in_context'])

#### Classification Model

In [12]:
# Classifier Training
# Multinomial Naive Bayes with default settings, fitted on the count features
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

#### Evaluation

In [13]:
# Evaluation
y_test = test_data['label']
# transform() only: re-use the vocabulary fitted on the training text
X_test = vectorizer.transform(test_data['tagged_in_context'])
predictions = classifier.predict(X_test)

In [14]:
# Calculate accuracy: fraction of evaluation rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7068965517241379


In [15]:
# Per-class precision / recall / F1 and support on the evaluation set
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predictions))

Classification Report:
                precision    recall  f1-score   support

not_propaganda       0.75      0.66      0.70       301
    propaganda       0.67      0.76      0.71       279

      accuracy                           0.71       580
     macro avg       0.71      0.71      0.71       580
  weighted avg       0.71      0.71      0.71       580



#### Improving the Naive Bayes Model (Hyperparameter Tuning)


In [16]:

# Raw text in, labels out: vectorisation now happens inside the pipeline,
# so X_train / X_test are text columns here rather than count matrices.
X_train, y_train = train_data['tagged_in_context'], train_data['label']
X_test, y_test = test_data['tagged_in_context'], test_data['label']

# Two-step pipeline: bag-of-words counts -> multinomial Naive Bayes
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])

# Search space: unigrams vs. unigrams+bigrams, crossed with three
# levels of Naive Bayes smoothing
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    clf__alpha=[1.0, 0.1, 0.01],
)

# Exhaustive 5-fold cross-validated search, selecting by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search
best_model = grid_search.best_estimator_

# Score the selected model on the held-out evaluation file
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

# Printing out results
print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)


Test Accuracy: 0.7189655172413794
Classification Report:
                 precision    recall  f1-score   support

not_propaganda       0.78      0.64      0.70       301
    propaganda       0.67      0.80      0.73       279

      accuracy                           0.72       580
     macro avg       0.73      0.72      0.72       580
  weighted avg       0.73      0.72      0.72       580



### Approach 2. Word2Vec

In [17]:
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [18]:
import gensim.downloader as api

In [18]:
# Load the pretrained Google News word2vec vectors through gensim's downloader API.
# The download is large (the progress output below reports 1662.8MB in total).
wv = api.load('word2vec-google-news-300')

[=====---------------------------------------------] 10.5% 174.3/1662.8MB downloaded

In [1]:
# Compute document vector
def document_vector(text):
    """Return the mean embedding of the tokens of `text` that are known to `wv`.

    `text` is split on whitespace; tokens absent from the module-level
    embedding lookup `wv` are skipped. If no token is known, a zero vector
    of length `wv.vector_size` is returned instead.
    """
    known = []
    for token in text.split():
        if token in wv:
            known.append(wv[token])
    if not known:
        # No usable token: fall back to an all-zero vector
        return np.zeros(wv.vector_size)
    return np.mean(known, axis=0)

In [20]:
import numpy as np
# Document Representation
y_train = train_data['label']
# One averaged embedding per training sentence, stacked into a 2-D array
X_train = np.array(list(map(document_vector, train_data['tagged_in_context'])))

NameError: name 'train_data' is not defined

In [21]:
# Classifier Training
# Logistic regression on the averaged word vectors; the iteration cap is set to 1000
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

In [22]:
# Evaluation
y_test = test_data['label']
# Same averaged-embedding representation as used for the training sentences
X_test = np.array(list(map(document_vector, test_data['tagged_in_context'])))
predictions = classifier.predict(X_test)

In [23]:
# Calculate accuracy: fraction of evaluation rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7137931034482758


In [24]:
# Per-class precision / recall / F1 and support on the evaluation set
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predictions))

Classification Report:

                precision    recall  f1-score   support



not_propaganda       0.73      0.70      0.72       301

    propaganda       0.69      0.72      0.71       279



      accuracy                           0.71       580

     macro avg       0.71      0.71      0.71       580

  weighted avg       0.71      0.71      0.71       580




## Task 2
### 1. Bag-of-Words Classifier for Propaganda Detection

In [25]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Reload both files from disk. Task 1 overwrote the `label` column with the
# binary mapping; re-reading restores the original per-technique labels
# that this section classifies.
train_data, test_data = (
    pd.read_csv(path, sep='\t')
    for path in ("propaganda_train.tsv", "propaganda_val.tsv")
)

In [27]:
# Peek at the first training rows (label plus sentence with <BOS>/<EOS> markers)
train_data.head()

Unnamed: 0,label,tagged_in_context
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed."
1,not_propaganda,This declassification effort <BOS> won’t make ...
2,flag_waving,The Obama administration misled the <BOS> Amer...
3,not_propaganda,“It looks like we’re capturing the demise of t...
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>"


In [28]:
# Row count, columns, dtypes and non-null counts of the reloaded training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 2414 entries, 0 to 2413

Data columns (total 2 columns):

 #   Column             Non-Null Count  Dtype 

---  ------             --------------  ----- 

 0   label              2414 non-null   object

 1   tagged_in_context  2414 non-null   object

dtypes: object(2)

memory usage: 37.8+ KB


In [29]:
# Peek at the first evaluation rows
test_data.head()

Unnamed: 0,label,tagged_in_context
0,not_propaganda,"On average, between 300 and 600 infections are..."
1,causal_oversimplification,Mostly because <BOS> the country would not las...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.
4,repetition,It must be exacted from him directly in order ...


In [30]:
# Row count, columns, dtypes and non-null counts of the reloaded evaluation frame
test_data.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 580 entries, 0 to 579

Data columns (total 2 columns):

 #   Column             Non-Null Count  Dtype 

---  ------             --------------  ----- 

 0   label              580 non-null    object

 1   tagged_in_context  580 non-null    object

dtypes: object(2)

memory usage: 9.2+ KB


In [31]:
# Feature Extraction
# Targets: the label column as loaded (no binary mapping is applied in this section).
y_train = train_data['label']
# Bag-of-words counts with English stop words dropped; the vocabulary is
# learned from the training text only.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['tagged_in_context'])

In [32]:
# Classifier Training
# Multinomial Naive Bayes with default settings, fitted on the count features
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [33]:
# Evaluation
y_test = test_data['label']
# transform() only: re-use the vocabulary fitted on the training text
X_test = vectorizer.transform(test_data['tagged_in_context'])
predictions = classifier.predict(X_test)

In [34]:
# Calculate accuracy: fraction of evaluation rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

NameError: name 'accuracy_score' is not defined

In [35]:
# Per-class precision / recall / F1 and support on the evaluation set
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predictions))

Classification Report:

                           precision    recall  f1-score   support



 appeal_to_fear_prejudice       0.27      0.07      0.11        43

causal_oversimplification       0.33      0.10      0.15        31

                    doubt       0.60      0.08      0.14        38

exaggeration,minimisation       0.14      0.04      0.06        28

              flag_waving       0.75      0.31      0.44        39

          loaded_language       0.00      0.00      0.00        37

    name_calling,labeling       0.17      0.03      0.05        31

           not_propaganda       0.58      0.99      0.73       301

               repetition       0.50      0.12      0.20        32



                 accuracy                           0.56       580

                macro avg       0.37      0.19      0.21       580

             weighted avg       0.47      0.56      0.45       580




### 2. Word2Vec Propaganda Technique Classification

In [36]:
# Import libraries
import pandas as pd
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [37]:
# Compute document vector
# (Re-declared here; behaves the same as the definition in the Task 1 Word2Vec section.)
def document_vector(text):
    """Return the mean embedding of the tokens of `text` that are known to `wv`.

    `text` is split on whitespace; tokens absent from the module-level
    embedding lookup `wv` are skipped. If no token is known, a zero vector
    of length `wv.vector_size` is returned instead.
    """
    known = []
    for token in text.split():
        if token in wv:
            known.append(wv[token])
    if not known:
        # No usable token: fall back to an all-zero vector
        return np.zeros(wv.vector_size)
    return np.mean(known, axis=0)


In [38]:
# Document Representation
y_train = train_data['label']
# One averaged embedding per training sentence, stacked into a 2-D array
X_train = np.array(list(map(document_vector, train_data['tagged_in_context'])))

In [39]:
# Classifier Training
# Logistic regression on the averaged word vectors; the iteration cap is set to 1000
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

In [40]:
# Evaluation
y_test = test_data['label']
# Same averaged-embedding representation as used for the training sentences
X_test = np.array(list(map(document_vector, test_data['tagged_in_context'])))
predictions = classifier.predict(X_test)

In [41]:
# Calculate accuracy: fraction of evaluation rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.5310344827586206


In [42]:
# Per-class precision / recall / F1 and support on the evaluation set
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predictions))

Classification Report:

                           precision    recall  f1-score   support



 appeal_to_fear_prejudice       0.25      0.05      0.08        43

causal_oversimplification       0.00      0.00      0.00        31

                    doubt       0.00      0.00      0.00        38

exaggeration,minimisation       0.50      0.14      0.22        28

              flag_waving       0.57      0.21      0.30        39

          loaded_language       0.60      0.08      0.14        37

    name_calling,labeling       0.14      0.06      0.09        31

           not_propaganda       0.55      0.95      0.70       301

               repetition       0.50      0.09      0.16        32



                 accuracy                           0.53       580

                macro avg       0.35      0.18      0.19       580

             weighted avg       0.44      0.53      0.42       580




In [None]:
# Hyperparameter tuning for logistic regression
#
# Exhaustive 5-fold cross-validated search over solver and regularisation
# strength C (L2 penalty only), selecting by accuracy. X_train / y_train are
# whatever the preceding cells left in scope (the Word2Vec document vectors
# and labels when the notebook is run top to bottom).

# max_iter=1000 matches the baseline LogisticRegression cells, so the tuned
# candidates are fitted under the same iteration cap as the baseline.
classifier = LogisticRegression(random_state=1, max_iter=1000)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
# Each grid value must be a list of candidates: pass the `solvers` list
# itself, not the string 'solvers'.
grid_params = {'solver': solvers, 'penalty': penalty, 'C': c_values}
# The estimator to tune is the `classifier` created above.
# error_score=0 scores a failing parameter combination as 0 instead of
# aborting the whole search.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=grid_params,
    n_jobs=-1,
    cv=5,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))