# FAKE NEWS DETECTOR 

### IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



### LOAD THE DATASET

In [2]:
train_df = pd.read_csv('C:\\Users\\acer\\Desktop\\fakenws\\Fake_News_Detection-master\\fakenwss\\train.csv')
test_df = pd.read_csv('C:\\Users\\acer\\Desktop\\fakenws\\Fake_News_Detection-master\\fakenwss\\test.csv')
valid_df = pd.read_csv('C:\\Users\\acer\\Desktop\\fakenws\\Fake_News_Detection-master\\fakenwss\\valid.csv')

### DATA PROCESSING

In [3]:
train_df = train_df.rename(columns={train_df.columns[1]: 'label'})
test_df = test_df.rename(columns={test_df.columns[1]: 'label'})
valid_df = valid_df.rename(columns={valid_df.columns[1]: 'label'})

### ADDITIONAL PROCESSING BASED ON THE DATASET

In [4]:
train_df['Statement'] = train_df['Statement'].astype(str)  # Convert to string type
train_df['Statement'] = train_df['Statement'].str.lower()  # Convert text to lowercase
test_df['Statement'] = test_df['Statement'].astype(str)
test_df['Statement'] = test_df['Statement'].str.lower()
valid_df['Statement'] = valid_df['Statement'].astype(str)
valid_df['Statement'] = valid_df['Statement'].str.lower()

### SPLITING THE DATASET INTO FEATURES (X) AND LABELS (Y)

In [5]:
X_train = train_df['Statement']
y_train = train_df['label']  # 'label' should be the column containing labels (real or fake)
X_test = test_df['Statement']
y_test = test_df['label']
X_valid = valid_df['Statement']
y_valid = valid_df['label']

### CONVERTING TEXT DATA INTO NUMERICAL FEATURES

In [6]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)
X_valid = tfidf_vectorizer.transform(X_valid)

### TRAINING THE PassiveAggressiveClassifier WITH A HIGHER VALUE FOR MAX_ITER

In [7]:
pac = PassiveAggressiveClassifier(max_iter=1000)  # Increase max_iter value
pac.fit(X_train, y_train)

### PREDICT ON THE TEST SET

In [8]:
y_pred_test = pac.predict(X_test)

### EVALUATING THE MODEL ON THE TEST SET

In [9]:
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)

Test Accuracy: 0.5488043904351235


### PREDICT ON THE VALIDATION SET

In [10]:
y_pred_valid = pac.predict(X_valid)

### CONVERT PREDICTIONS TO THE SAME DATA TYPE AS y_valid

In [11]:
y_pred_valid = y_pred_valid.astype(str)

### PRINT OUT THE DATA TYPES

In [12]:
print("Data type of y_valid:", y_valid.dtype)
print("Data type of y_pred_valid:", y_pred_valid.dtype)

Data type of y_valid: object
Data type of y_pred_valid: <U5


### INSPECT y_valid

In [13]:
print("Sample of y_valid:", y_valid.head())

Sample of y_valid: 0    FALSE
1    FALSE
2    FALSE
3     TRUE
4     TRUE
Name: label, dtype: object


### CHECKING FOR MISSING VALUES

In [14]:
missing_values = y_valid.isnull().sum()
print("Missing values in y_valid:", missing_values)

Missing values in y_valid: 2


 ### REMOVE ROWS WITH MISSING VALUES IN y_valid

In [15]:
y_valid = y_valid.dropna()

### REMOVING CORRESPONDING ROWS FROM x_valid

In [16]:
X_valid = X_valid[y_valid.index]

###  Print unique values in y_valid and y_pred_valid

In [17]:
unique_values_y_valid = np.unique(y_valid)
unique_values_y_pred_valid = np.unique(y_pred_valid)

In [18]:
print("Unique values in y_valid:", unique_values_y_valid)
print("Unique values in y_pred_valid:", unique_values_y_pred_valid)

Unique values in y_valid: ['FALSE' 'Label' 'TRUE']
Unique values in y_pred_valid: ['False' 'True']


### Print sample of y_valid and y_pred_valid

In [19]:
print("Sample of y_valid:", y_valid.head())
print("Sample of y_pred_valid:", y_pred_valid[:5])  # Print the first 5 predictions for comparison

Sample of y_valid: 0    FALSE
1    FALSE
2    FALSE
3     TRUE
4     TRUE
Name: label, dtype: object
Sample of y_pred_valid: ['True' 'True' 'False' 'True' 'False']


### Predicting on the validation set after handling missing values

In [20]:
y_pred_valid = pac.predict(X_valid)

### Convert predictions to the same data type as y_valid

In [21]:
y_pred_valid = y_pred_valid.astype(str)

###  Convert labels in y_pred_valid to uppercase

In [22]:
y_pred_valid = np.array([label.upper() for label in y_pred_valid])

### Finding indices where label is not equal to 'Label'

In [23]:
valid_indices = y_valid[y_valid != 'Label'].index

 ### Ensure valid_indices are within the bounds of y_valid

In [24]:
valid_indices = valid_indices[valid_indices < len(y_valid)]

### Filter out corresponding rows from y_valid and y_pred_valid

In [25]:
y_valid_filtered = y_valid.iloc[valid_indices]
y_pred_valid_filtered = y_pred_valid[valid_indices]

### Evaluate the model on the filtered validation set

In [26]:
accuracy_valid = accuracy_score(y_valid_filtered, y_pred_valid_filtered)
print("Validation Accuracy (Filtered):", accuracy_valid)

Validation Accuracy (Filtered): 0.5477210751850409
