# Import all the libraries here

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
def snake_columns(data):
    """
    Standardize the column names of `data` to snake_case.

    Every column name is lower-cased and its spaces are replaced with
    underscores. The frame is modified in place and also returned, so the
    call can be used either as a statement or inside an expression.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column names are strings.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with renamed columns.
    """
    data.columns = [column.lower().replace(' ', '_') for column in data.columns]
    return data

# ⚙️ Settings (applied notebook-wide for the rest of the session)
pd.set_option('display.max_columns', None) # None lifts the column limit, so all columns of a frame are displayed
warnings.filterwarnings('ignore') # silences every warning, including pandas/sklearn deprecation and convergence warnings

# Load the data here

In [None]:
# Read the training file as tab-separated values; the column labels are supplied via `names`.
data = pd.read_csv("../dataset/training_data.csv", sep='\t', names=["label", "text"])
# Independent copy of the loaded frame, used by the first inspection cells below.
df = data.copy()

In [None]:
df.shape

In [None]:
df.head(10)

# Exploratory Data Analysis

In [None]:
# null values
df.isnull().sum()

In [None]:
# distribution of label
sns.countplot(x=data["label"])
plt.show()

In [None]:
data["label"].value_counts()

**Overview**
- There is no null value in the dataset.
- `label` has two values: 0 (fake news) & 1 (real news)
- `label` is split almost evenly between 0 and 1, so the classes are close to balanced. Only if the remaining small imbalance turns out to bias the output might we need to remove some 0 rows.
- Number of rows: 34152 

In [None]:
# move the target column to the end: pop it out, then re-append it as the last column
data["label"] = data.pop("label")

In [None]:
data

**Comment**
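- The whole dataset is used as training data, because a separate testing file is loaded later for the predictions. So I don't need to do `train_test_split`; the models are evaluated with cross-validation instead.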

# Feature Transformation

In [None]:
train_data = data

In [None]:
# Clean the text: replace every character that is not an ASCII letter with a space.
# The result is assigned back to the column rather than calling an in-place method on
# an `.iloc` selection: such a selection can be a temporary copy (always, under pandas
# Copy-on-Write), in which case the in-place replace silently leaves the frame unchanged.
train_data["text"] = train_data["text"].replace("[^a-zA-Z]", " ", regex=True)

In [None]:
train_data["text"]

In [None]:
train_data["text"] = train_data["text"].astype(str)

In [None]:
train_data.info()

In [None]:
# Collect the texts as a plain Python list for the vectorizer.
# Selecting the column by name does not depend on column order, and `tolist()`
# converts it in one pass instead of one `.iloc` lookup per row.
train_headlines = train_data["text"].tolist()

In [None]:
# preview only: displaying the full list would print every training text
train_headlines[:5]

# Count Vectorizer

In [None]:
countvector = CountVectorizer(ngram_range=(2, 2))

In [None]:
traindataset = countvector.fit_transform(train_headlines)

In [None]:
traindataset

# Model: Random Forest

In [None]:
train_data.head(3)

In [None]:
traindataset[:5]

In [None]:
traindataset[:5].toarray()

In [None]:
ground_truth = train_data["label"]

In [None]:
# Random forest classifier with explicitly chosen hyper-parameters
randomclassifier = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=10,
    random_state=42,
)

# n_estimators: number of trees in the forest
# criterion: function to measure the quality of a split. Entropy is the measure of impurity
# max_depth: maximum depth of the tree
# random_state: seed for random number generator

# Model training

In [None]:
randomclassifier.fit(traindataset, ground_truth)

# Cross validation

In [None]:
# Evaluate the model using cross-validation
scores = cross_val_score(randomclassifier, traindataset, ground_truth, cv=5)

In [None]:
print(f"Cross-validation scores: {scores}")
print(f"Mean cross-validation score: {scores.mean()}")

# Predictions

In [None]:
# load the testing data
test_data = pd.read_csv("../dataset/testing_data.csv", sep='\t', names=["label", "text"])


In [None]:
test_data.shape

In [None]:
test_data.head(2)

In [None]:
test_data["label"].value_counts()

In [None]:
# Build the list of test texts with the SAME preprocessing as the training texts
# (non-letter characters replaced by spaces, then cast to str). The fitted
# CountVectorizer only knows bigrams from cleaned text, so uncleaned test text
# would produce bigrams that never match the learned vocabulary; the str cast
# also keeps a missing value from reaching `transform` as a non-string.
test_headlines = (
    test_data["text"]
    .replace("[^a-zA-Z]", " ", regex=True)
    .astype(str)
    .tolist()
)

In [None]:
# preview only: displaying the full list would print every test text
test_headlines[:5]

In [None]:
testdataset = countvector.transform(test_headlines)

In [None]:
# Make predictions on the test data
predictions = randomclassifier.predict(testdataset)

In [None]:
predictions

In [None]:
sns.countplot(x=predictions)
plt.show()

In [None]:
# Sanity check: there must be exactly one prediction per test row.
# Single quotes inside the f-string: reusing the enclosing double quotes is a
# SyntaxError on Python versions before 3.12.
print(f"Size of ground truth: {len(test_data['label'])}")
print(f"Size of predictions: {len(predictions)}")

In [None]:
test_data["predictions"] = predictions
test_data

# Model: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model_lr = LogisticRegression()

In [None]:
traindataset

In [None]:
model_lr.fit(traindataset, ground_truth)

In [None]:
# Evaluate the model using cross-validation
scores_lr = cross_val_score(model_lr, traindataset, ground_truth, cv=5)

In [None]:
print(f"Cross-validation scores: {scores_lr}")
print(f"Mean cross-validation score: {scores_lr.mean()}")

In [None]:
# Make predictions on the test data
predictions_lr = model_lr.predict(testdataset)

In [None]:
sns.countplot(x=predictions_lr)
plt.show()

In [None]:
test_data

In [None]:
test_data["predictions_log_reg"] = predictions_lr

In [None]:
# `columns=` is required: a bare mapping is applied to the index labels, which
# leaves the "predictions" column name untouched.
test_data = test_data.rename(columns={"predictions": "predictions_rand_for"})

In [None]:
test_data

In [None]:
# save the files
test_data.to_csv("../dataset/predictions.csv")

**Summary**
- `LogisticRegression` works better than `RandomForest` (judged by the mean 5-fold cross-validation scores printed above).