In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline,linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import keras
import re
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


Using TensorFlow backend.


In [5]:
from nltk import word_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')
nltk.download('punkt')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
PATH=Path("/content/drive/My Drive/emnlp/")

In [11]:
os.listdir(PATH)

['datasets', 'Data_Prepare.ipynb', 'traindf.csv']

In [32]:
df=pd.read_csv(PATH/'traindf.csv')

df=df.dropna() ## every alternate line is blank line

label_mapping={'propaganda':0,'non-propaganda':1} ## label encoding
df['labels']=df['labels'].apply(lambda x:label_mapping[x])

df.head()

Unnamed: 0.1,Unnamed: 0,sentence,labels
0,0,US bloggers banned from entering UK,1
2,2,Two prominent US bloggers have been banned fro...,1
4,4,Pamela Geller and Robert Spencer co-founded an...,0
6,6,They were due to speak at an English Defence L...,1
8,8,A government spokesman said individuals whose ...,1


In [0]:
ds,labels=df['sentence'],df['labels']

In [37]:
np.unique(labels,return_counts=True)

(array([0, 1]), array([ 4720, 11577]))

In [34]:
X_train,X_test,y_train,y_test=train_test_split(ds,labels,test_size=0.2,random_state=42,)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((13037,), (3260,), (13037,), (3260,))

In [0]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)

In [36]:
print('===========================   Naive Bayes =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

#Naive Bayes for all features:
for vectorizer in vectorizers:
  print(vectorizer[1])
  clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', MultinomialNB()),
  ])
  clf = clf.fit(X_train, y_train)
  predictions=clf.predict_proba(X_test)
  test_preds=np.argmax(predictions,axis=1)

  auc = classification_report(y_test, test_preds)
  print (auc)


count_vectorizer
              precision    recall  f1-score   support

           0       0.56      0.49      0.52       934
           1       0.81      0.84      0.82      2326

    accuracy                           0.74      3260
   macro avg       0.68      0.67      0.67      3260
weighted avg       0.74      0.74      0.74      3260

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.69      0.16      0.26       934
           1       0.74      0.97      0.84      2326

    accuracy                           0.74      3260
   macro avg       0.72      0.57      0.55      3260
weighted avg       0.73      0.74      0.68      3260

tfidf_vectorizer_word_ngram
              precision    recall  f1-score   support

           0       0.54      0.20      0.29       934
           1       0.74      0.93      0.83      2326

    accuracy                           0.72      3260
   macro avg       0.64      0.57      0.56      3260
weighte

In [38]:
print('===========================   Logistic Regression =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

for vectorizer in vectorizers:
  print(vectorizer[1])
  clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', linear_model.LogisticRegression(multi_class='auto',solver='lbfgs')),
  ])
  clf = clf.fit(X_train, y_train)
  predictions=clf.predict_proba(X_test)
  test_preds=np.argmax(predictions,axis=1)

  auc = classification_report(y_test, test_preds)
  print (auc)

count_vectorizer




              precision    recall  f1-score   support

           0       0.55      0.38      0.45       934
           1       0.78      0.87      0.82      2326

    accuracy                           0.73      3260
   macro avg       0.66      0.63      0.64      3260
weighted avg       0.71      0.73      0.72      3260

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.63      0.26      0.37       934
           1       0.76      0.94      0.84      2326

    accuracy                           0.74      3260
   macro avg       0.70      0.60      0.60      3260
weighted avg       0.72      0.74      0.70      3260

tfidf_vectorizer_word_ngram
              precision    recall  f1-score   support

           0       0.55      0.15      0.24       934
           1       0.74      0.95      0.83      2326

    accuracy                           0.72      3260
   macro avg       0.64      0.55      0.53      3260
weighted avg       0.68 

In [39]:
print('===========================   SVM =====================')


vectorizers=[(count_vect,'count_vectorizer'),(tfidf_vect,'tfidf_vectorizer_word'),(tfidf_vect_ngram,'tfidf_vectorizer_word_ngram'),(tfidf_vect_ngram_chars,'tfidf_vectorizer_ngram_chars')]

# SVM on count vectors: SVM Classifier Pipeline on word count vector
for vectorizer in vectorizers:
  print(vectorizer[1])
  clf = Pipeline([
    ('vect',vectorizer[0]),
    ('clf', SVC(gamma='scale',probability=True)),
  ])
  clf = clf.fit(X_train, y_train)
  predictions=clf.predict_proba(X_test)
  test_preds=np.argmax(predictions,axis=1)

  auc = classification_report(y_test, test_preds)
  print (auc)

count_vectorizer
              precision    recall  f1-score   support

           0       0.60      0.29      0.39       934
           1       0.76      0.92      0.84      2326

    accuracy                           0.74      3260
   macro avg       0.68      0.61      0.61      3260
weighted avg       0.72      0.74      0.71      3260

tfidf_vectorizer_word
              precision    recall  f1-score   support

           0       0.63      0.36      0.46       934
           1       0.78      0.91      0.84      2326

    accuracy                           0.76      3260
   macro avg       0.70      0.64      0.65      3260
weighted avg       0.74      0.76      0.73      3260

tfidf_vectorizer_word_ngram
              precision    recall  f1-score   support

           0       0.56      0.20      0.30       934
           1       0.74      0.94      0.83      2326

    accuracy                           0.73      3260
   macro avg       0.65      0.57      0.56      3260
weighte