# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Load the dataset CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/pheme-dataset-for-rumour-detection/dataset.csv',
)
data  # bare expression: shows the frame in a notebook cell, no effect in a script

# Print the column names, non-null counts and dtypes.
data.info()

# (rows, columns) of the loaded frame; displayed only in a notebook cell.
data.shape

# Data Preprocessing
# Drop every row whose 'is_rumor' label is missing.
data = data.dropna(subset=['is_rumor'])
data

# Tally how often each binary label (0 or 1) occurs. Only this tally uses the
# 0/1-restricted view; `data` itself is not narrowed by the mask.
binary_label_mask = data['is_rumor'].isin([0, 1])
filtered_data = data[binary_label_mask]
rumor_counts = filtered_data['is_rumor'].value_counts()

# Heading and counts on separate lines.
print("Count of is_rumor values:", rumor_counts, sep="\n")

# Convert text to lowercase and remove unnecessary characters.
# The pattern strips every character that is neither a word character nor
# whitespace. It is written as a raw string so that `\w` and `\s` reach the
# regex engine without triggering Python's invalid-escape-sequence warning.
# `assign` returns a new frame with the column replaced (column order is kept).
data = data.assign(
    text=data['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
)
data['text']

# Ensure 'is_rumor' is integer type.
# Replacing the whole column (instead of writing through `.loc[:, ...]`) is
# what actually changes the dtype: an in-place `.loc` write stores the integer
# values into the existing float column and leaves it as float64.
data = data.assign(is_rumor=data['is_rumor'].astype(int))
data['is_rumor']

# Train-Test Split
# y is the label column; X stays a one-column DataFrame (double brackets) so
# that the 'text' column can later be selected from it by name.
y = data['is_rumor']
X = data[['text']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Vectorization and Feature Union

# Numerical branch: fill missing values with the column mean, then standardise.
# NOTE(review): this transformer is defined but not registered in the
# ColumnTransformer below, so it currently has no effect on the model.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Text branch: TF-IDF features with English stop words removed.
text_transformer = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
])

# Route the 'text' column (selected by a plain string, i.e. as a 1-D column)
# through the text branch. No other transformer is registered.
preprocessor = ColumnTransformer(
    transformers=[('text', text_transformer, 'text')],
)

# Model Pipeline: preprocessing followed by a logistic-regression classifier
# that is allowed up to 1000 solver iterations.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Model Training
model.fit(X_train, y_train)

# Model Evaluation
# Predict on the held-out rows and print the per-class precision/recall/F1 table.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

# Accuracy rounded to two decimals and shown as a percentage.
# The builtin round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float (only the former has a .round method), and formatting the
# product avoids float artefacts such as 56.99999999999999.
accuracy_pct = round(accuracy_score(y_test, predictions), 2) * 100
print(f"Accuracy: {accuracy_pct:.1f} %")

# Define the metrics list
metrics = ['precision', 'recall', 'f1-score']

# Compute the numbers the chart needs. `output_dict=True` makes
# classification_report return a nested dict instead of a formatted string.
report = classification_report(y_test, predictions, output_dict=True)
accuracy = accuracy_score(y_test, predictions)

# Index the per-class rows by numeric label value. The report keys are the
# stringified labels, so they read '0'/'1' for integer labels but '0.0'/'1.0'
# for float labels; converting back to float makes the lookup work for both.
# The summary rows ('accuracy' and the '... avg' rows) are skipped.
per_class = {
    float(key): scores
    for key, scores in report.items()
    if key != 'accuracy' and not key.endswith(' avg')
}

# Prepare data for plotting: three per-class metrics followed by the overall
# accuracy (the same accuracy value is appended to both series).
values_class_0 = [per_class[0.0][metric] for metric in metrics] + [accuracy]
values_class_1 = [per_class[1.0][metric] for metric in metrics] + [accuracy]

# Plotting
fig, ax = plt.subplots(figsize=(12, 8))

bar_width = 0.2
opacity = 0.8
index = range(len(metrics) + 1)  # one slot per metric plus one for accuracy

# Offset the two series by half a bar width so they sit side by side.
rects1 = ax.bar([p - bar_width/2 for p in index], values_class_0, bar_width, alpha=opacity, color='darkblue', label='Not Rumor')
rects2 = ax.bar([p + bar_width/2 for p in index], values_class_1, bar_width, alpha=opacity, color='gray', label='Rumor')

ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Classification Metrics and Accuracy by Class')
ax.set_xticks(index)
ax.set_xticklabels(metrics + ['Accuracy'])
ax.legend()
plt.tight_layout()
plt.show()

# Confusion Matrix
# Rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Plot Confusion Matrix
# Integer-annotated heatmap; the same two display names label both axes.
class_names = ['Not Rumor', 'Rumor']
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()

