In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
# Load True.csv into `true`; a later cell labels these rows with target = 1.
true = pd.read_csv("True.csv")

In [None]:
# Preview the first rows of `true` to check the columns that were loaded.
true.head()

In [None]:
# Load Fake.csv into `fake`; a later cell labels these rows with target = 0.
fake = pd.read_csv("Fake.csv")

In [None]:
# Preview the first rows of `fake` to check the columns that were loaded.
fake.head()

In [None]:
# Label each source frame so the class survives concatenation:
# rows from `true` get 1, rows from `fake` get 0.
true['target'] = 1  # or 'true'
fake['target'] = 0  # or 'fake'

# Stack both frames into a single dataset with a fresh 0..n-1 index.
df = pd.concat([true, fake], ignore_index=True)

# Shuffle the rows. The fixed seed keeps the row order identical on every
# Restart & Run All; without it the saved CSV, the seeded train/test split
# taken from it, and every reported metric would change from run to run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Persist the combined, shuffled dataset (index omitted from the file).
df.to_csv("combined_dataset.csv", index=False)

In [None]:
# Reload the dataset from combined_dataset.csv, the file written by df.to_csv
# in the preceding cell, replacing the in-memory `df`.
# NOTE(review): a CSV round trip re-infers dtypes and reads empty fields back
# as NaN — confirm `text` has no missing values before the string operations
# further down.
df = pd.read_csv("combined_dataset.csv")

In [None]:
# Preview the first rows of the combined dataset.
df.head()

In [None]:
# Count rows per class (1 = rows that came from `true`, 0 = rows from `fake`).
df['target'].value_counts()

In [None]:
# Summarise the columns: dtypes, non-null counts and memory usage.
df.info()

In [None]:
!pip install wordcloud

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# NOTE(review): same call as the earlier df.info() cell; `df` is not modified
# in between, so this prints the same summary again.
df.info()

In [None]:
# Bar chart of the number of rows in each `target` class.
fig, ax = plt.subplots()
sns.countplot(data=df, x='target', ax=ax)
ax.set_title('Class Distribution')
plt.show()


In [None]:
# Row counts per `subject`, with one bar per `target` class inside each subject.
fig, ax = plt.subplots(figsize=(12, 6))  # width, height in inches
sns.countplot(data=df, x='subject', hue='target', ax=ax)
ax.set_title('Subject-wise Distribution')
plt.show()

In [None]:
# Word cloud for real news: every `text` with target == 1 is joined into one
# space-separated string and handed to WordCloud.
real_words = ' '.join(df.loc[df['target'] == 1, 'text'])
real_wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(real_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(real_wordcloud, interpolation="bilinear")
ax.axis('off')  # the image has no meaningful axes
ax.set_title("Most Frequent Words in Real News")
plt.show()

In [None]:
# Word cloud for fake news: every `text` with target == 0 is joined into one
# space-separated string and handed to WordCloud.
fake_words = ' '.join(df.loc[df['target'] == 0, 'text'])
fake_wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(fake_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(fake_wordcloud, interpolation="bilinear")
ax.axis('off')  # the image has no meaningful axes
ax.set_title("Most Frequent Words in Fake News")
plt.show()

In [None]:
# Download NLTK's stop-word corpus and keep the English list as a set for
# fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated tokens whose lowercase form is a stop word."""
    kept = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept)


# Tokens keep their original casing; only the stop-word comparison is lowercased.
df['cleaned_text'] = df['text'].apply(_drop_stopwords)

In [None]:
# Display the stop-word-filtered text column to check the cleaning step.
df['cleaned_text'] 

In [None]:
# Features are the stop-word-filtered text; labels are the 0/1 `target` column.
X, y = df['cleaned_text'], df['target']

# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Decision Tree
# random_state fixes the estimator's internal randomness so the fitted tree,
# and therefore the reported metrics, are the same on every run. 42 matches
# the seed used for train_test_split.
clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(X_train_tfidf, y_train)
y_pred_tree = clf_tree.predict(X_test_tfidf)

In [None]:
# K-nearest neighbours with default hyperparameters: fit on the TF-IDF
# training matrix, then predict labels for the TF-IDF test matrix.
clf_knn = KNeighborsClassifier().fit(X_train_tfidf, y_train)
y_pred_knn = clf_knn.predict(X_test_tfidf)

In [None]:
# Random Forest
# random_state fixes the forest's internal randomness so the fitted model,
# and therefore the reported metrics, are the same on every run. 42 matches
# the seed used for train_test_split.
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train_tfidf, y_train)
y_pred_rf = clf_rf.predict(X_test_tfidf)

In [None]:
# Multinomial Naive Bayes with default hyperparameters: fit on the TF-IDF
# training matrix, then predict labels for the TF-IDF test matrix.
clf_nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = clf_nb.predict(X_test_tfidf)

In [None]:
# Support vector classifier with default hyperparameters: fit on the TF-IDF
# training matrix, then predict labels for the TF-IDF test matrix.
clf_svm = SVC().fit(X_train_tfidf, y_train)
y_pred_svm = clf_svm.predict(X_test_tfidf)

In [None]:
# Model display names and their test-set predictions, in matching order.
models = ['Decision Tree', 'KNN', 'Random Forest', 'Naive Bayes', 'SVM']
predictions = [y_pred_tree, y_pred_knn, y_pred_rf, y_pred_nb, y_pred_svm]

# Printed label paired with the scoring function, in the order they are reported.
scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

# Print the same four scores for each model against the held-out labels.
for model, y_pred in zip(models, predictions):
    print(f"Performance Metrics for {model}:")
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 * 4 * 3 = 36 combinations, each fitted once per fold.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 30, 50, 100],
    'min_samples_split': [2, 5, 10]
}

# Create the GridSearchCV object.
# The estimator is seeded so that differences in cross-validation score come
# from the hyperparameters rather than from run-to-run randomness in the
# forest; an unseeded search can select different "best" parameters each run.
# 42 matches the seed used for train_test_split.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           verbose=2,  # print progress
                           n_jobs=-1)  # use all processors

# Run the search on the training data only; the test set stays untouched.
grid_search.fit(X_train_tfidf, y_train)

# Best hyperparameter combination and the corresponding fitted estimator.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best parameters: {best_params}")


In [None]:
# Predict the held-out test set with the estimator selected by the grid search.
y_pred_final = best_model.predict(X_test_tfidf)

# Report the same four scores used for the baseline models.
print("Final Model Performance Metrics:")
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred_final))