# Step 1 === Data 

- labeling 
- merging 
- shiffling 

In [8]:
import pandas as pd

In [9]:
data_fake = pd.read_csv('/home/hooria.najeeb@vaival.tech/miniconda3/envs/Task2/news/Fake news Detection Dataset/Fake.csv')
data_true = pd.read_csv('/home/hooria.najeeb@vaival.tech/miniconda3/envs/Task2/news/Fake news Detection Dataset/True.csv')

In [10]:

data_fake.head()
data_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [11]:
data_true['class'] = 1
data_fake['class'] = 0

In [12]:
data_fake.shape

(23481, 5)

In [13]:
data_true.shape

(21417, 5)

In [14]:
data_merged = pd.concat([data_true,data_fake],axis=0)
data_merged.head(10)

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",1
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017",1
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",1
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",1
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",1


In [15]:
# merging data frames
data_merged.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [16]:
data = data_merged.drop(['title',  'subject', 'date'], axis=1)
data.isnull().sum()

text     0
class    0
dtype: int64

In [17]:
# shuffling data
from sklearn.utils import shuffle

data = shuffle(data)
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,text,class
0,WASHINGTON (Reuters) - A Republican bill to di...,1
1,LONDON (Reuters) - Britain is looking at all w...,1
2,WASHINGTON (Reuters) - The State Department ex...,1
3,Really Andrew? Does all of New York really sta...,0
4,Liberal lunacy has taken over America! We have...,0


# Step 2 === Preprocessing 
- lemmatization
- lowering text 
- removing stopwords
- removing special characters
- tokenizing 

In [18]:
from  nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [19]:
print (lemmatizer)

<WordNetLemmatizer>


In [20]:
import re 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess(text):
    tokens = word_tokenize(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')])
    return text

data['text'] = data['text'].apply(preprocess)

# Step 3 === Splitting data

In [21]:
from sklearn.model_selection import train_test_split
X= data['text']
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4 === Vectorization

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


# Step 5 === Model training 

In [23]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)

# Step 6 === Evaluating Model

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy  = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9378619153674833
Confusion Matrix:
[[4470  272]
 [ 286 3952]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      4742
           1       0.94      0.93      0.93      4238

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



# Step 7 === Saving model

In [26]:
import joblib 
joblib.dump(model, 'news_model.pkl')           # Save the trained model
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')  

['tfidf_vectorizer.pkl']

# Step 8 ==== Load model

In [27]:


import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the saved model and vectorizer
model = joblib.load('news_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Step 9 === Testing 

In [31]:
test_text = "NASA's Perseverance Rover Successfully Lands on Mars"
#test_text = "Breaking News: The stock market crashes as economy shows signs of slowing down."

In [None]:
test_vector = vectorizer.transform([test_text])
prediction = model.predict(test_vector)
#print(f"The prediction for the test text is: {prediction}")
#prediction_prob = model.prediction_prob(test_vector)
print(f"The prediction for the test text is: {prediction}")

if prediction[0]==1:
    prediction_result = "Fake"
    #prob = prediction_prob[0][0]*100
else:
    prediction_result = "Real"
    #prob = prediction_prob[0][1]*100
print(f"Prediction : {prediction_result}")# with probability {prob: .2f}% ")  # Output: Prediction : Real with probability

The prediction for the test text is: [0]
Prediction : Real
