<a href="https://colab.research.google.com/github/KohKeira/FakeOrReal/blob/main/News_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install nltk

# Loading NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Shared text-cleaning resources used by process_data
punctuations = "?:!.,;"
stop_words = set(stopwords.words("english"))
lem = WordNetLemmatizer()

# Load the dataset, keep only the headline and its label, and expose the
# headline under the column name 'text'
data = (
    pd.read_csv("news.csv", encoding="latin-1")[['title', 'label']]
    .rename(columns={'title': 'text'})
)

processed = []

# data pre-processing
def process_data(msg):
  """Normalise one piece of text for vectorisation.

  The text is lower-cased, split with ``word_tokenize``, stripped of tokens
  that are one of the marks in ``punctuations`` or appear in ``stop_words``,
  and each remaining token is lemmatised with ``lem``.

  Args:
    msg: The raw text. Non-string values (e.g. NaN from an empty CSV cell)
      are treated as empty text.

  Returns:
    The surviving lemmatised tokens joined by single spaces; ``""`` when
    ``msg`` is not a string.
  """
  # A missing cell arrives from pandas as a float NaN, which has no .lower();
  # map it to empty text instead of crashing the whole .apply().
  if not isinstance(msg, str):
    return ""

  # Compare tokens against the individual marks. Testing `token in "?:!.,;"`
  # directly would be a substring check and also match runs such as "?:".
  punctuation_marks = set(punctuations)

  tokens = word_tokenize(msg.lower())
  kept = [
    lem.lemmatize(token)
    for token in tokens
    if token not in punctuation_marks and token not in stop_words
  ]
  return " ".join(kept)


# Replace every headline with its cleaned form
data = data.assign(text=data['text'].apply(process_data))

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.25,
    random_state=1,
)

# Fit the TF-IDF vocabulary (uni- to tri-grams) on the training text only,
# then project both splits with that same fitted vectoriser
tfidf = TfidfVectorizer(ngram_range=(1, 3))
X_train, X_test = tfidf.fit_transform(X_train), tfidf.transform(X_test)



In [None]:
# training the classifier

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# Grid search over the smoothing parameter, kept for reference and not run
# (presumably how alpha=0.3 below was chosen -- TODO confirm).
#from sklearn.model_selection import GridSearchCV
#alpha = [0.01, 0.1, 0.3, 0.5, 0.7, 1]
#param_grid = dict(alpha=alpha)
#grid = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid)
#grid_result = grid.fit(X_train, y_train)
#print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

### STEP 4 - Prediction
### Create and run Classifier

classifierNB = MultinomialNB(alpha=0.3)


### Fitting requires training TF_IDF vectors and labels
classifierNB.fit(X_train, y_train)

# Held-out predictions live in y_pred rather than pred, so the name does not
# collide with the pred() helper defined for classifying new headlines.
y_pred = classifierNB.predict(X_test)

print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))

score = accuracy_score(y_test, y_pred)
print("\nAccuracy Score:", score)

matrix = classification_report(y_test, y_pred)
print("\nClassfication Report: \n", matrix)

Confusion matrix: 
 [[1144   66]
 [  54 1236]]

Accuracy Score: 0.952

Classfication Report: 
               precision    recall  f1-score   support

        FAKE       0.95      0.95      0.95      1210
        TRUE       0.95      0.96      0.95      1290

    accuracy                           0.95      2500
   macro avg       0.95      0.95      0.95      2500
weighted avg       0.95      0.95      0.95      2500



In [None]:
#test against new messages
def pred(msg):
  """Classify a single raw headline with the trained model.

  Args:
    msg: The headline text, un-processed.

  Returns:
    The label predicted by ``classifierNB`` for this headline.
  """
  # The model was fitted on text cleaned by process_data (stop words removed,
  # tokens lemmatised); new text must go through the same cleaning, otherwise
  # its n-grams will not line up with the fitted TF-IDF vocabulary.
  cleaned = process_data(msg)
  features = tfidf.transform([cleaned])
  return classifierNB.predict(features)[0]

# Spot-check the classifier on two unseen headlines, one prediction per line
print(
  pred("Poland to allocate additional $55 bllion on defense by 2032: deputy minister"),
  pred("Arizona Rancher Protesting in Oregon is Targeted by CPS, Loses Custody of Foster Children"),
  sep="\n",
)

TRUE
FAKE
