In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [3]:
df = pd.read_csv("/Users/dsp009/Downloads/fake_real_news_78k.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,TRUE
1,1,,Did they post their votes for Hillary already?,TRUE
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",TRUE
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,FAKE
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",TRUE


In [5]:
df_False, df_True = [x for _, x in df.groupby(df['label'] == "TRUE")]

In [6]:
df_True.shape

(40277, 4)

In [7]:
df_False.shape

(38192, 4)

In [8]:
df_False["class"] = 0
df_True["class"] = 1

In [9]:
df_merge = pd.concat([df_False, df_True], axis =0 )
df_merge.head(10)
#df_merge.tail(10)

Unnamed: 0.1,Unnamed: 0,title,text,label,class
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,FAKE,0
11,11,"May Brexit offer would hurt, cost EU citizens ...",BRUSSELS (Reuters) - British Prime Minister Th...,FAKE,0
12,12,Schumer calls on Trump to appoint official to ...,"WASHINGTON (Reuters) - Charles Schumer, the to...",FAKE,0
14,14,No Change Expected for ESPN Political Agenda D...,As more and more sports fans turn off ESPN to ...,FAKE,0
15,15,Billionaire Odebrecht in Brazil scandal releas...,RIO DE JANEIRO/SAO PAULO (Reuters) - Billionai...,FAKE,0
17,17,U.N. seeks humanitarian pause in Sanaa where s...,GENEVA (Reuters) - The United Nations called o...,FAKE,0
19,19,Second judge says Clinton email setup may have...,NEW YORK (Reuters) - A second federal judge ha...,FAKE,0
26,26,Supreme Court Won’t Hear Appeal From Texas on ...,WASHINGTON — The Supreme Court rejected on ...,FAKE,0
27,27,Islamic State driven out of last stronghold in...,BAGHDAD (Reuters) - Iraqi forces announced on ...,FAKE,0
28,28,Senators Propose Giving States Option to Keep ...,WASHINGTON — Several Republican senators on...,FAKE,0


In [10]:
df_merge.columns

Index(['Unnamed: 0', 'title', 'text', 'label', 'class'], dtype='object')

In [11]:
df_merge_drop = df_merge.drop(["Unnamed: 0","title","label"], axis=1)

In [12]:
df_merge_drop.isnull().sum()

text     39
class     0
dtype: int64

In [13]:
df_merge_drop = df_merge_drop.sample(frac = 1)
df_merge_drop.head(10)

Unnamed: 0,text,class
8253,But..b but it isn t fair to ask for voter ID s...,1
72492,The Disappearing Middle: Electorate Way Less M...,1
54579,WASHINGTON (Reuters) - Republican U.S. Senator...,0
65489,TOKYO (Reuters) - A court in Japan on Friday o...,0
58382,WASHINGTON (Reuters) - A short-term fix to fun...,0
23205,Hillary Clinton Waiting In Wings Of Stage Sinc...,1
44574,What s a million dollars between friends? To h...,1
65875,No one has passed for more yardage or more tou...,0
29826,ATHENS (Reuters) - Greece said on Monday it wo...,0
5980,Support Us The Arrivals Bosanski Prijevod 15-h...,1


In [14]:
df_merge_drop.reset_index(inplace = True)
df_merge_drop.drop(["index"], axis = 1, inplace = True)

In [15]:
df_merge_drop.columns
df_merge_drop.head()

Unnamed: 0,text,class
0,But..b but it isn t fair to ask for voter ID s...,1
1,The Disappearing Middle: Electorate Way Less M...,1
2,WASHINGTON (Reuters) - Republican U.S. Senator...,0
3,TOKYO (Reuters) - A court in Japan on Friday o...,0
4,WASHINGTON (Reuters) - A short-term fix to fun...,0


In [80]:
df_merge_drop['text'][0]  # First row in the 'text' column


'butb but it isn t fair to ask for voter id says the party that dominates the states that refuse to cooperate with trump s election commission the more corruption that s exposed through postelection voting investigations the more we begin to understand why so many democrat pundits and liberal media outlets were so sure hillary had the election in the bageleven counties in california have more registered voters than voting age adults in the countyjudicial watch announced it sent a noticeofviolation letter to the state of california and  of its counties threatening to sue in federal court if it does not clean its voter registration lists as mandated by the national voter registration act nvra both the nvra and the federal help america vote act require states to take reasonable steps to maintain accurate voting rolls the august  letter was sent on behalf of several judicial watch california supporters and the election integrity project california incin the letter judicial watch noted that

In [16]:
def wordopt(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing digits
    return text


In [17]:
from string import digits

# Create a translation table to remove digits
remove_digits = str.maketrans('', '', digits)

# Apply the translation to the 'text' column
df_merge_drop["text"] = df_merge_drop["text"].astype(str).apply(lambda x: x.translate(remove_digits))


In [18]:
df_merge_drop["text"] = df_merge_drop["text"].apply(wordopt)

In [19]:
x = df_merge_drop["text"]
y = df_merge_drop["class"]

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [22]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train,y_train)

In [23]:
pred_lr=LR.predict(xv_test)

In [24]:
LR.score(xv_test, y_test)

0.8723621164236925

In [25]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.90      0.83      0.86      9573
           1       0.85      0.91      0.88     10045

    accuracy                           0.87     19618
   macro avg       0.87      0.87      0.87     19618
weighted avg       0.87      0.87      0.87     19618



In [43]:
from sklearn.tree import DecisionTreeClassifier

# Optimize parameters
DT = DecisionTreeClassifier()

# Fit the model
DT.fit(xv_train, y_train)

In [44]:
pred_dt = DT.predict(xv_test)

In [45]:
DT.score(xv_test, y_test)

0.7759200734019778

In [46]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.76      0.80      0.78      9573
           1       0.80      0.76      0.78     10045

    accuracy                           0.78     19618
   macro avg       0.78      0.78      0.78     19618
weighted avg       0.78      0.78      0.78     19618



In [48]:
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
    random_state=0
)
GBC.fit(xv_train, y_train)

In [49]:
pred_gbc = GBC.predict(xv_test)

In [50]:
GBC.score(xv_test, y_test)

0.8662452849423998

In [51]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.93      0.79      0.85      9573
           1       0.82      0.94      0.88     10045

    accuracy                           0.87     19618
   macro avg       0.88      0.86      0.86     19618
weighted avg       0.87      0.87      0.87     19618



In [52]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

KeyboardInterrupt: 

In [57]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(max_depth=10, random_state=0)
RFC.fit(xv_train, y_train)

In [58]:
pred_rfc = RFC.predict(xv_test)

In [59]:
RFC.score(xv_test, y_test)

0.7971760627994698

In [60]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.85      0.71      0.77      9573
           1       0.76      0.88      0.82     10045

    accuracy                           0.80     19618
   macro avg       0.81      0.80      0.79     19618
weighted avg       0.80      0.80      0.80     19618



In [77]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)

    print("\n\nLR Prediction: {}".format(output_lable(pred_LR[0])))
    print("\n\nRFC Prediction: {}".format(output_lable(pred_RFC[0])))
    print("\n\nGBC Prediction: {}".format(output_lable(pred_GBC[0])))
    print("\n\nDT Prediction: {}".format(output_lable(pred_DT[0])))

In [78]:
#news = str(input())
news = " Indian Politics is a complex and diverse system that encompasses a range of political ideologies, parties, and interests, with the potential to significantly impact the nation's social, economic, and geopolitical landscape. The Indian political system is based on a federal parliamentary democratic model, with power being distributed between the central government and the various state governments. Political parties play a crucial role in shaping the nation's policies and governance, with the Indian National Congress and the Bharatiya Janata Party being two of the most prominent political parties in the country. However, Indian politics also faces various challenges such as corruption, caste-based politics, communal tensions, and regionalism, which often pose significant obstacles to effective governance and political stability."
manual_testing(news)



LR Prediction: Not A Fake News


RFC Prediction: Not A Fake News


GBC Prediction: Not A Fake News


DT Prediction: Not A Fake News


In [79]:
news = 'tokyo reuters  a court in japan on friday ordered tokyo electric power tepco to pay compensation to a group of former fukushima residents the second such ruling following the  earthquake and nuclear disaster japanese media reported however the ruling by the chiba district court east of tokyo did not find the government liable for compensation in contrast to a march ruling in another court that ordered both the government and tepco to pay compensation to a separate group of evacuees tepco is facing mounting legal claims over the disaster with about  former fukushima residents filing about  similar class action lawsuits seeking compensation media reports said in the chiba case a group of  residents sought damages totaling about  billion yen  million for the emotional distress of fleeing their homes as radiation spread from the meltdowns at tepco s fukushima daiichi plant after an earthquake and tsunami more than six ago tepco was ordered to pay a total of  million yen  million for  of the evacuees kyodo and jiji reported tepco on friday said it would review the contents of the ruling before making a response some  people died in march  when three reactors at the fukushima daiichi plant suffered meltdowns after a magnitude  earthquake triggered a tsunami that devastated a swathe of japan s northeastern coastline tepco has long been criticized for ignoring the threat posed by natural disasters to the fukushima plant and both the company and government were lambasted for their handling of the crisis  in december the government nearly doubled its projections for costs related to the disaster to  trillion yen  billion increasing pressure on tepco to step up reform and improve its performance '
manual_testing(news)



LR Prediction: Fake News


RFC Prediction: Fake News


GBC Prediction: Fake News


DT Prediction: Fake News


In [81]:
news='butb but it isn t fair to ask for voter id says the party that dominates the states that refuse to cooperate with trump s election commission the more corruption that s exposed through postelection voting investigations the more we begin to understand why so many democrat pundits and liberal media outlets were so sure hillary had the election in the bageleven counties in california have more registered voters than voting age adults in the countyjudicial watch announced it sent a noticeofviolation letter to the state of california and  of its counties threatening to sue in federal court if it does not clean its voter registration lists as mandated by the national voter registration act nvra both the nvra and the federal help america vote act require states to take reasonable steps to maintain accurate voting rolls the august  letter was sent on behalf of several judicial watch california supporters and the election integrity project california incin the letter judicial watch noted that public records obtained on the election assistance commission s  election administration voting survey and through verbal accounts from various county agencies show  california counties have more registered voters than votingage citizens imperial  lassen  los angeles  monterey  san diego  san francisco  san mateo  santa cruz  solano  stanislaus  and yolo in the letter judicial watch noted that los angeles county officials  informed us that the total number of registered voters now stands at a number that is a whopping  of the total number of resident citizens of voting age remember when the huffington post denied voter fraud was an actual thing in america while calling hillary clinton the candidate who clearly benefitted from illegal voting practices the  legitimate presidentelect under section  of the nvra states are required to make a reasonable effort to remove the names of ineligible voters from official lists due to  the death of the registrant  or  a change in the residence of the registrant  and requires states to ensure noncitizens are not registered to votethere is  strong circumstantial evidence that california municipalities are not conducting reasonable voter registration list maintenance as mandated under the nvra  judicial watch wrote in the notice letter sent to california secretary of state alex padilla via judicial watch'
manual_testing(news)



LR Prediction: Not A Fake News


RFC Prediction: Not A Fake News


GBC Prediction: Not A Fake News


DT Prediction: Not A Fake News


In [82]:
news='July 2024 - Pennsylvania Incident: During a campaign rally in Butler, Pennsylvania, a gunman fired eight rounds, injuring Trump with a graze to his ear. One audience member was killed, and two others were critically injured. The assailant was killed by Secret Service agents. This assassination attempt led to significant security reforms for Trumps campaign events​.'
manual_testing(news)



LR Prediction: Not A Fake News


RFC Prediction: Not A Fake News


GBC Prediction: Not A Fake News


DT Prediction: Not A Fake News
