In [None]:
# Mount Google Drive at /content/drive; the dataset and every saved artifact
# in this notebook are read from / written to paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Load the news dataset from the mounted Drive folder.
# NOTE(review): the file name suggests the text is already preprocessed — confirm.
df = pd.read_csv('/content/drive/MyDrive/PR2/true-fake-news-processed.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,label
0,donald trump ha white house republican control...,0
1,sick tired hearing donald trump whine fake new...,0
2,secret gop brass le thrilled donald trump pres...,0
3,glenn beck man described forbes someone manage...,0
4,former fbi agent navy seal jonathan gilliam sa...,0


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(34330, 2)

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

Unnamed: 0,text,label
34325,mexico city reuters mexico wa pitched deep unc...,1
34326,washington reuters mexican finance minister jo...,1
34327,united nation reuters united nation security c...,1
34328,washington reuters president donald trump said...,1
34329,riyadh reuters oh arab oh muslim slaughtered o...,1


In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
# Add the project's Drive folder to the locations NLTK searches for its data files.
nltk.data.path.append("/content/drive/MyDrive/PR2/nltk_data/")
# Lazily-built NLTK helpers shared by every process_text() call.
_PROCESS_TEXT_CACHE = {}


def _process_text_resources():
    """Return ``(lemmatizer, stop_words)``, creating them on first use only."""
    if not _PROCESS_TEXT_CACHE:
        # Build both before storing so a failed lookup never leaves a half-filled cache.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words("english"))
        _PROCESS_TEXT_CACHE.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PROCESS_TEXT_CACHE["lemmatizer"], _PROCESS_TEXT_CACHE["stop_words"]


def process_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps, in order: collapse whitespace, replace non-word characters with
    spaces, drop single letters that stand between whitespace, strip every
    remaining non-alphabetic character, lower-case, tokenize, lemmatize and
    finally remove English stop words.

    Args:
        text: The document. Non-string values are coerced with ``str``;
            ``None`` and float NaN are treated as an empty document.

    Returns:
        str: The cleaned text (``''`` for missing input).
    """
    # Missing values carry no text; `text != text` is the NaN check for floats.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Coerce first so every regex below receives a string.
    text = str(text)

    text = re.sub(r'\s+', ' ', text) # Remove extra white space from text

    text = re.sub(r'\W', ' ', text) # Remove all the special characters from text

    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # Remove all single characters from text

    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove any character that isn't alphabetical

    text = text.lower()

    tokens = word_tokenize(text)

    lemmatizer, stop_words = _process_text_resources()
    # Lemmatize before filtering, so stop words are matched on the lemma.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

class TfIdf:
    """Wrapper around scikit-learn's ``TfidfVectorizer``.

    Besides vectorizing text, the instance can hold the train/test vectors
    produced from it so later steps can fetch them by vectorizer.

    Args:
        name: Tag used in the file name of the pickled vectorizer.
        max_features, min_df, max_df: Forwarded to ``TfidfVectorizer``.
    """

    def __init__(self,name, max_features=1000,min_df=1,max_df=1000000):
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df
        self.name = name
        self.vectorizer = TfidfVectorizer(max_features=max_features,min_df=min_df,max_df=max_df)
        self.train_vectors = None
        self.test_vectors = None

    def fit_transform(self,text_data):
        """Fit the vectorizer on ``text_data``, pickle it, and return the vectors.

        Returns:
            list[list[float]]: Dense TF-IDF rows, one per input document.
        """
        tfidf_vectors = self.vectorizer.fit_transform(text_data)
        # Context manager guarantees the pickle file is flushed and closed.
        with open(f"/content/drive/MyDrive/PR2/tfidf_{self.name}.pkl", 'wb') as vectorizer_file:
            pickle.dump(self.vectorizer, vectorizer_file)
        return tfidf_vectors.toarray().tolist()

    def transform(self,text_data,force_fit=False):
        """Vectorize ``text_data`` with the already-fitted vectorizer.

        ``force_fit`` is accepted for interface compatibility and is not used.

        Returns:
            list[list[float]]: Dense TF-IDF rows, one per input document.
        """
        tfidf_vectors = self.vectorizer.transform(text_data)
        return tfidf_vectors.toarray().tolist()

    def set_train_vectors(self,train_vectors):
        """Store the training vectors on the instance."""
        self.train_vectors = train_vectors

    def set_test_vectors(self,test_vectors):
        """Store the test vectors on the instance."""
        self.test_vectors = test_vectors

    def get_train_vectors(self):
        """Return the stored training vectors.

        Raises:
            ValueError: If ``set_train_vectors`` has not been called.
        """
        # Compare against None: truthiness would reject an empty result and is
        # ambiguous for NumPy arrays.
        if self.train_vectors is None:
            raise ValueError("Train vectors not set. Call set_train_vectors() first.")
        return self.train_vectors

    def get_test_vectors(self):
        """Return the stored test vectors.

        Raises:
            ValueError: If ``set_test_vectors`` has not been called.
        """
        if self.test_vectors is None:
            raise ValueError("Test vectors not set. Call set_test_vectors() first.")
        return self.test_vectors


In [None]:
import gensim
import numpy as np

class myword2vec:
    """Wrapper around a gensim ``Word2Vec`` model that turns documents into
    fixed-size vectors (the mean of their word vectors).

    Args:
        name: Tag used in the file name of the pickled model.
        window_size: Forwarded to ``Word2Vec`` as ``window``.
        word_min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
    """

    def __init__(self,name,window_size=10,word_min_count=1,vector_size=200):
        self.window_size = window_size
        self.word_min_count = word_min_count
        self.vector_size = vector_size
        self.name = name
        self.word2vecmodel = gensim.models.Word2Vec(
            window = window_size,
            min_count = word_min_count,
            vector_size = vector_size)
        self.train_vectors = None
        self.test_vectors = None

    def make_corpus_iterable(self,text_data):
        """Tokenize every document with ``gensim.utils.simple_preprocess``.

        Returns:
            list[list[str]]: One token list per document.
        """
        return [gensim.utils.simple_preprocess(text) for text in text_data]

    def _vectorize_corpus(self,corpus_iterable):
        """Map each token list to the mean of its word vectors.

        A document with no tokens gets an all-zero vector instead of making
        ``get_mean_vector`` fail, and an empty batch yields a ``(0, dim)`` array.

        Returns:
            numpy.ndarray: 2-D array with one row per document.
        """
        keyed_vectors = self.word2vecmodel.wv
        dim = keyed_vectors.vector_size
        if len(corpus_iterable) == 0:
            return np.empty((0, dim), dtype=np.float32)
        vectors = [
            keyed_vectors.get_mean_vector(tokens) if len(tokens) > 0
            else np.zeros(dim, dtype=np.float32)
            for tokens in corpus_iterable
        ]
        # Stack into a 2-D array so the result suits the training functions.
        return np.stack(vectors)

    def fit_transform(self,text_data):
        """Train the model on ``text_data``, pickle it, and return doc vectors.

        Returns:
            numpy.ndarray: 2-D array with one mean vector per document.
        """
        corpus_iterable = self.make_corpus_iterable(text_data)
        # Build vocabulary and train the word2vec model.
        self.word2vecmodel.build_vocab(corpus_iterable)
        self.word2vecmodel.train(corpus_iterable,
                        total_examples=self.word2vecmodel.corpus_count,
                        epochs = self.word2vecmodel.epochs)
        # Context manager guarantees the pickle file is flushed and closed.
        with open(f'/content/drive/MyDrive/PR2/word2vec_{self.name}.pkl', 'wb') as model_file:
            pickle.dump(self.word2vecmodel, model_file)

        return self._vectorize_corpus(corpus_iterable)

    def transform(self,text_data,force_fit=False):
        """Vectorize ``text_data`` with the already-trained model.

        ``force_fit`` is accepted for interface compatibility and is not used.

        Returns:
            numpy.ndarray: 2-D array with one mean vector per document.
        """
        return self._vectorize_corpus(self.make_corpus_iterable(text_data))

    def set_train_vectors(self,train_vectors):
        """Store the training vectors on the instance."""
        self.train_vectors = train_vectors

    def set_test_vectors(self,test_vectors):
        """Store the test vectors on the instance."""
        self.test_vectors = test_vectors

    def get_train_vectors(self):
        """Return the stored training vectors (``None`` if never set)."""
        return self.train_vectors

    def get_test_vectors(self):
        """Return the stored test vectors (``None`` if never set)."""
        return self.test_vectors



In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report
class PA:
  """Wrapper around scikit-learn's ``PassiveAggressiveClassifier``.

  Args:
    name: Tag used in the file name of the pickled model.
    max_iter: Forwarded to the classifier.
  """

  def __init__(self,name,max_iter=1000):
    self.max_iter = max_iter
    self.name = name
    self.model = PassiveAggressiveClassifier(max_iter=max_iter)

  def fit(self,X,y):
    """Fit the classifier on ``X``/``y`` and pickle the fitted model."""
    self.model.fit(X,y)
    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'/content/drive/MyDrive/PR2/PA_{self.name}.pkl', 'wb') as model_file:
      pickle.dump(self.model, model_file)

  def predict(self,X):
    """Return the classifier's predictions for ``X``."""
    return self.model.predict(X)

  def report(self,X_test,y_test):
    """Return ``classification_report`` of ``y_test`` against predictions for ``X_test``."""
    return classification_report(y_test,self.predict(X_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
class RF:
  """Wrapper around scikit-learn's ``RandomForestClassifier``.

  Args:
    name: Tag used in the file name of the pickled model.
    n_estimators: Forwarded to the classifier.
  """

  def __init__(self,name,n_estimators=100):
    self.n_estimators = n_estimators
    self.name = name
    self.model = RandomForestClassifier(n_estimators=n_estimators)

  def fit(self,X,y):
    """Fit the classifier on ``X``/``y`` and pickle the fitted model."""
    self.model.fit(X,y)
    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'/content/drive/MyDrive/PR2/RF_{self.name}.pkl', 'wb') as model_file:
      pickle.dump(self.model, model_file)

  def predict(self,X):
    """Return the classifier's predictions for ``X``."""
    return self.model.predict(X)

  def report(self,X_test,y_test):
    """Return ``classification_report`` of ``y_test`` against predictions for ``X_test``."""
    return classification_report(y_test,self.predict(X_test))

In [None]:
from sklearn.svm import SVC
class SVM:
  """Wrapper around scikit-learn's ``SVC``.

  Args:
    name: Tag used in the file name of the pickled model.
    C, kernel: Forwarded to ``SVC``.
  """

  def __init__(self,name,C=1.0,kernel='rbf'):
    self.C = C
    self.kernel = kernel
    self.name = name
    self.model = SVC(C=C,kernel=kernel)

  def fit(self,X,y):
    """Fit the classifier on ``X``/``y`` and pickle the fitted model."""
    self.model.fit(X,y)
    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'/content/drive/MyDrive/PR2/SVM_{self.name}.pkl', 'wb') as model_file:
      pickle.dump(self.model, model_file)

  def predict(self,X):
    """Return the classifier's predictions for ``X``."""
    return self.model.predict(X)

  def report(self,X_test,y_test):
    """Return ``classification_report`` of ``y_test`` against predictions for ``X_test``."""
    return classification_report(y_test,self.predict(X_test))

In [None]:
from keras.models import Sequential
from keras.layers import Dense , LSTM ,Input
import tensorflow as tf
import numpy as np
# Fix the NumPy and TensorFlow global seeds to 42.
np.random.seed(42)
tf.random.set_seed(42)

class my_LSTM:
  """Single-layer LSTM binary classifier built with Keras.

  Each feature of an input vector is fed as one timestep with a single
  channel, i.e. the network input shape is ``(Input_shape, 1)``.

  Args:
    name: Tag used in the file name of the saved ``.h5`` model.
    units: Number of LSTM units.
    epochs: Training epochs used by :meth:`build`.
    batch_size: Training batch size used by :meth:`build`.
  """

  def __init__(self,name,units = 128,epochs = 10,batch_size = 128):
    self.units = units
    self.epochs = epochs
    self.batch_size = batch_size
    self.name = name
    # Trained/loaded Keras model, kept so predict() need not reload it each call.
    self.model = None

  def _model_path(self):
    """Return the path the trained model is saved to / loaded from."""
    return f'/content/drive/MyDrive/PR2/LSTM_{self.name}.h5'

  @staticmethod
  def _as_sequences(X_vector, feature_shape):
    """Convert vectors to a float32 array shaped ``(n_samples, *feature_shape)``.

    A single 1-D vector is treated as a batch of one. The reshape raises
    ``ValueError`` when the feature count does not match ``feature_shape``.
    """
    data = np.asarray(X_vector, dtype='float32')
    if data.ndim == 1:
      data = data[np.newaxis, :]
    return data.reshape((data.shape[0],) + tuple(feature_shape))

  def build(self,X_train_vector,y_train,X_test_vector,y_test,Input_shape=1000):
    """Build, train and save the network.

    Args:
      X_train_vector, X_test_vector: Feature vectors with ``Input_shape``
        features per sample (lists or arrays).
      y_train, y_test: Binary labels for the two sets.
      Input_shape: Number of features per sample.

    Returns:
      The validation accuracy of the last epoch, measured on the test set
      passed as ``validation_data``.
    """
    # Shape the inputs explicitly instead of relying on implicit conversion of
    # 2-D lists to the (Input_shape, 1) sequences the LSTM layer declares.
    train_sequences = self._as_sequences(X_train_vector, (Input_shape, 1))
    test_sequences = self._as_sequences(X_test_vector, (Input_shape, 1))
    train_labels = np.asarray(y_train)
    test_labels = np.asarray(y_test)

    model = Sequential()
    model.add(LSTM(units = self.units , input_shape = (Input_shape,1) ))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(train_sequences, train_labels, epochs=self.epochs, batch_size=self.batch_size,validation_data=(test_sequences,test_labels),verbose=0)
    model.save(self._model_path())
    self.model = model
    return history.history['val_accuracy'][-1]

  def predict(self,X_vector):
    """Classify one or more feature vectors with a 0.5 threshold.

    Uses the model trained by :meth:`build` in this session, otherwise loads
    the saved ``.h5`` file.

    Returns:
      int: ``0`` or ``1`` when a single sample is given (as before).
      numpy.ndarray: One ``0``/``1`` label per sample for a larger batch;
      previously only the first sample's label was returned.
    """
    if self.model is None:
      self.model = tf.keras.models.load_model(self._model_path())
    model = self.model
    sequences = self._as_sequences(X_vector, model.input_shape[1:])
    probabilities = np.asarray(model.predict(sequences)).reshape(-1)
    labels = (probabilities > 0.5).astype(int)
    if labels.size == 1:
      return int(labels[0])
    return labels


In [None]:
def set_vectorizers(X_train,X_test,vectorizer):
  """Fit ``vectorizer`` on the training texts and cache both encoded splits on it.

  The vectorizer is fitted on ``X_train`` only; ``X_test`` is encoded with the
  fitted vectorizer. Both results are stored through the vectorizer's
  ``set_train_vectors`` / ``set_test_vectors``. Returns ``None``.
  """
  encoded_train = vectorizer.fit_transform(X_train)
  encoded_test = vectorizer.transform(X_test)
  # Store only after both encodings succeeded.
  vectorizer.set_train_vectors(encoded_train)
  vectorizer.set_test_vectors(encoded_test)

In [None]:
def train_predict_score(X_train,y_train,X_test,y_test,model,vectorizer):
  """Fit ``model`` on the vectorizer's cached train vectors and report on its test vectors.

  The ``X_train`` and ``X_test`` arguments are not used: the features come
  from ``vectorizer.get_train_vectors()`` / ``get_test_vectors()``.

  Returns:
    Whatever ``model.report`` returns for the cached test vectors and ``y_test``.
  """
  train_features = vectorizer.get_train_vectors()
  test_features = vectorizer.get_test_vectors()
  model.fit(train_features, y_train)
  return model.report(test_features, y_test)

In [None]:
def predict(X,model,vectorizer):
  """Clean, vectorize and classify raw text.

  Args:
    X: One raw document (``str``) or an iterable of raw documents.
    model: Object exposing ``predict(features)``.
    vectorizer: Fitted object exposing ``transform(documents)``.

  Returns:
    Whatever ``model.predict`` returns for the vectorized documents.
  """
  # The vectorizers expect an iterable of documents; a bare string would be
  # rejected or iterated character by character, so wrap it in a list.
  raw_texts = [X] if isinstance(X, str) else list(X)
  cleaned_texts = [process_text(text) for text in raw_texts]
  features = vectorizer.transform(cleaned_texts)
  return model.predict(features)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Shuffle the text and label columns together with a fixed seed.
X, y = shuffle(df['text'], df['label'], random_state=42)

# Hold out 20% for testing; stratify on the labels so both splits keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# One instance of each vectorizer; 'v1' becomes part of their pickle file names.
v1_tfidf = TfIdf('v1')
v1_word2vec = myword2vec('v1')

In [None]:
# Fit each vectorizer on the training texts and cache its train/test vectors on the instance.
set_vectorizers(X_train,X_test,v1_tfidf)
set_vectorizers(X_train,X_test,v1_word2vec)

In [None]:
# One instance of each classifier wrapper; 'v1' becomes part of their saved-model file names.
v1_PA = PA('v1')
v1_RF = RF('v1')
v1_SVM = SVM('v1')
v1_LSTM = my_LSTM('v1')

In [None]:
# Evaluate the Passive-Aggressive classifier on TF-IDF and word2vec features.
# train_predict_score ignores the X_train/X_test arguments and uses the vectors
# cached on the vectorizer.
# NOTE(review): both calls fit the same v1_PA object and write the same PA_v1.pkl,
# so only the second (word2vec) fit is left in the object and on disk.
PA_tfidf_report = train_predict_score(X_train,y_train,X_test,y_test,v1_PA,v1_tfidf)
PA_word2vec_report = train_predict_score(X_train,y_train,X_test,y_test,v1_PA,v1_word2vec)

print("PA_tfidf_report" , ":" ,PA_tfidf_report)
print("PA_word2vec_report" ,":",PA_word2vec_report)

In [None]:
# Evaluate the Random Forest classifier on TF-IDF and word2vec features.
# NOTE(review): both calls fit the same v1_RF object and write the same RF_v1.pkl,
# so only the second (word2vec) fit is left in the object and on disk.
RF_tfidf_report =  train_predict_score(X_train,y_train,X_test,y_test,v1_RF,v1_tfidf)
RF_word2vec_report = train_predict_score(X_train,y_train,X_test,y_test,v1_RF,v1_word2vec)

print("RF_tfidf_report" , ":" ,RF_tfidf_report)
print("RF_word2vec_report" ,":",RF_word2vec_report)

In [None]:
# Evaluate the SVM classifier on TF-IDF and word2vec features.
# NOTE(review): both calls fit the same v1_SVM object and write the same SVM_v1.pkl,
# so only the second (word2vec) fit is left in the object and on disk.
SVM_tfidf_report = train_predict_score(X_train,y_train,X_test,y_test,v1_SVM,v1_tfidf)
SVM_word2vec_report = train_predict_score(X_train,y_train,X_test,y_test,v1_SVM,v1_word2vec)
print("SVM_tfidf_report" , ":" ,SVM_tfidf_report)
print("SVM_word2vec_report" ,":",SVM_word2vec_report)

In [None]:
# Train the LSTM once per feature set. Input_shape must equal the feature count
# of the vectors passed in (1000 for TF-IDF, 200 for word2vec here).
# The "*_report" values are the last-epoch validation accuracy returned by
# build(), with the test split used as validation data — not classification reports.
# NOTE(review): both builds save to the same LSTM_v1.h5, so the word2vec model
# overwrites the TF-IDF one on disk.
LSTM_tfidf_report = v1_LSTM.build(X_train_vector = v1_tfidf.get_train_vectors(),y_train = y_train,X_test_vector=v1_tfidf.get_test_vectors(),y_test=y_test,Input_shape=1000)
LSTM_word2vec_report = v1_LSTM.build(X_train_vector = v1_word2vec.get_train_vectors(),y_train = y_train,X_test_vector=v1_word2vec.get_test_vectors(),y_test=y_test,Input_shape=200)

print("LSTM_tfidf_report" , ":" ,LSTM_tfidf_report)
print("LSTM_word2vec_report" ,":",LSTM_word2vec_report)