# Dataset Description


- train.csv: A full training dataset with the following attributes

    id: unique id for a news article

    title: the title of a news article
    
    author: author of the news article

    text: the text of the article; could be incomplete

    label: a label that marks the article as potentially unreliable

        1: unreliable
        0: reliable

- test.csv: A testing dataset with all the same attributes as train.csv, without the label

# Importing the Dependencies


In [None]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from tpot import TPOTClassifier
from pycaret.classification import *



import warnings
warnings.filterwarnings("ignore")


In [None]:
nltk.download('stopwords')

In [None]:
print (stopwords.words('english'))

# Exploratory Data Analysis


In [None]:
train_news = pd.read_csv('train.csv')
train_news.head()

In [None]:
test_news = pd.read_csv('test.csv')
test_news.head()

In [None]:
train_news.shape

In [None]:
test_news.shape


- Stopwords are words that don't add much value to a paragraph or text, such as "the", "a", "you", "where", "what", "when", etc.



In [None]:
train_news.isnull().sum()


In [None]:
test_news.isnull().sum()


# Data Pre-processing


In [None]:
# Replace missing values with a single space (note: ' ', not an empty string)
# so that the author/title string concatenation never produces NaN.
# NOTE(review): fillna(' ') is applied to every column of the frame, not just
# the text columns; a numeric column that contained NaN would end up holding a
# string. Confirm that `id` and `label` have no missing values.
train_news = train_news.fillna(' ')
test_news = test_news.fillna(' ')

In [None]:
# Combine author and title into one text field, "<author> <title>", for both
# the training and the test frame.
for frame in (train_news, test_news):
    frame['content'] = frame['author'] + ' ' + frame['title']

In [None]:
print(train_news['content'])

In [None]:
print(test_news['content'])

# Stemming:

Stemming is the process of reducing a word to its Root word

example:
actor, actress, acting --> act

In [None]:
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so calling it for every token made each membership test a linear scan over a
# freshly requested list.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-separated, stemmed, lower-case tokens.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop English stop words and reduce each remaining token to its
         Porter stem via the module-level ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text (here: "<author> <title>").

    Returns
    -------
    str
        The surviving stems joined by single spaces; empty string if nothing
        survives.
    """
    # The character class needs brackets: '[^a-zA-Z]' means "any non-letter".
    # Without them the pattern only matched the literal text "a-zA-Z" at the
    # start of the string, so punctuation and digits were never removed.
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [None]:
train_news['content'] = train_news['content'].apply(stemming)
test_news['content'] = test_news['content'].apply(stemming)


In [None]:
print(train_news['content'])

In [None]:
print(test_news['content'])

In [None]:
#separating the data and label
X = train_news['content'].values
Y = train_news['label'].values


In [None]:
print(X)


In [None]:
print(Y)

In [None]:
Y.shape

# Converting the Textual Data to Numerical Data


In [None]:
# Turn the cleaned text into features: unigram + bigram counts, then TF-IDF
# re-weighting of those counts. Both objects are fitted on the training
# corpus here and are reused later to transform the test set.
# NOTE(review): the vocabulary and IDF weights are learned from all training
# rows before train_test_split is called, so the hold-out rows also influence
# the features; hold-out scores may be slightly optimistic.
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train_news['content'].values)
X = transformer.fit_transform(counts)

In [None]:
# Labels for the training rows.
Y = train_news['label'].values
# Map the test text onto the vocabulary learned from the training corpus.
test_counts = count_vectorizer.transform(test_news['content'].values)
# Apply the IDF weights learned from the training corpus. Calling
# fit_transform here would re-estimate the weights from the test set, giving
# the models features on a different scale from the ones they were trained on
# (and overwriting the fitted state of the shared transformer).
test_tfidf = transformer.transform(test_counts)

In [None]:
print(X)

In [None]:
X.shape

In [None]:
print(test_tfidf)

In [None]:
test_tfidf.shape

# Splitting the data into training and testing data


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=42)

In [None]:
print(f"Number of Training dataset: {X_train.shape[0]}\nNumber of Testing dataset: {X_test.shape[0]}")


In [None]:
def kfolds(model, model_name):
    """Print the mean 10-fold cross-validation score of `model`.

    The estimator is evaluated with ``cross_val_score`` on the notebook-level
    feature matrix ``X`` and label vector ``Y``; the average of the fold
    scores is printed as a percentage. Nothing is returned.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score``.
    model_name : str
        Label used in the printed message.
    """
    fold_scores = cross_val_score(model, X, Y, cv=10)
    mean_score = np.average(fold_scores)
    print(f"{model_name} score on cross validation: {mean_score * 100}%")

def train(model, model_name):
    """Fit `model` on the training split and print its train/test scores.

    Uses the notebook-level ``X_train``/``Y_train`` for fitting and reports
    ``model.score`` on both the training and the hold-out split, each as a
    percentage on its own line. The model is fitted in place; nothing is
    returned.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``score``.
    model_name : str
        Label used in the printed messages.
    """
    model.fit(X_train, Y_train)
    # Score in the same order as they are reported: training first, then testing.
    split_scores = {
        "Training": model.score(X_train, Y_train),
        "Testing": model.score(X_test, Y_test),
    }
    report_lines = [
        f"{model_name} model score on {split} data: {score * 100}%"
        for split, score in split_scores.items()
    ]
    print("\n".join(report_lines))

def conf_matrix(model):
    """Plot the confusion matrix of a fitted `model` on the hold-out split.

    Predictions are made on the notebook-level ``X_test`` and compared with
    ``Y_test``; the resulting count matrix is drawn as an annotated heatmap.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    """
    Y_pred = model.predict(X_test)
    cm = confusion_matrix(Y_test, Y_pred)
    plt.figure(figsize=(8, 5))
    # The cells are integer counts, so format them as integers; '.2f' rendered
    # them as e.g. '2034.00'.
    sns.heatmap(cm, annot=True, fmt='d')
    # confusion_matrix puts true labels on rows and predictions on columns.
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()
    
def class_report(model):
    """Print scikit-learn's classification report for `model` on the hold-out split.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``; it is evaluated on the
        notebook-level ``X_test`` against ``Y_test``.
    """
    report_text = classification_report(Y_test, model.predict(X_test))
    print(report_text)
    
def roc(model, model_name):
    """Plot the ROC curve of a fitted binary classifier on the hold-out split.

    The curve is computed from continuous scores on the notebook-level
    ``X_test`` against ``Y_test`` and drawn together with the chance diagonal;
    the area under the curve is shown in the legend.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``decision_function`` or ``predict_proba``.
    model_name : str
        Label used in the legend.
    """
    # Prefer signed margins when the estimator provides them; otherwise fall
    # back to the predicted probability so tree- and Bayes-based models can be
    # plotted too.
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        # Assumes a binary problem whose positive class (label 1) is the
        # second column of predict_proba -- TODO confirm via model.classes_.
        scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, scores)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(5, 5), dpi=100)
    # Single f-string instead of f-string + '%' formatting, which raised if
    # model_name itself contained a '%' character.
    plt.plot(fpr, tpr, color="darkorange", label=f'{model_name} (auc = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.xlabel('False Positive Rate -->')
    plt.ylabel('True Positive Rate -->')
    plt.legend()
    plt.show()

# Logistic Regression


In [None]:
log_model = LogisticRegression()
train(log_model, "Logistic Regression")

In [None]:
conf_matrix(log_model)

In [None]:
class_report(log_model)

In [None]:
roc(log_model, 'Logistic_Regression')

# SVM Model


In [None]:
svm_model = SVC()
train(svm_model, 'SV_classifier')

In [None]:
conf_matrix(svm_model)


In [None]:
class_report(svm_model)

In [None]:
roc(svm_model, 'SV_classifier')

# Decision Tree Classifier Model


In [None]:
# Fix the seed so the tree (and therefore the reported scores and the
# submission file built from this model) is identical on every
# Restart & Run All; 42 matches the seed used for the train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
kfolds(dt_model, "Decision_Tree")
train(dt_model, "Decision_Tree")

In [None]:
conf_matrix(dt_model)


In [None]:
class_report(dt_model)


# Random Forest Classifier


In [None]:
# Fix the seed so the forest's random sampling, and with it the reported
# scores, are reproducible across runs; 42 matches the seed used for the
# train/test split.
rf_model = RandomForestClassifier(random_state=42)
train(rf_model, "Random_Forest")

In [None]:
conf_matrix(rf_model)


In [None]:
class_report(rf_model)


# K-nearest neighbors

In [None]:
knn = KNeighborsClassifier()
train(knn, "KNN")

In [None]:
conf_matrix(knn)


In [None]:
class_report(knn)


# Naive Bayes


In [None]:
naive_bayes = MultinomialNB()
kfolds(naive_bayes, "Naive_Bayes")
train(naive_bayes, "Naive_Bayes")

In [None]:
conf_matrix(naive_bayes)


In [None]:
class_report(naive_bayes)


# Automated ML


In [None]:
# autoML = TPOTClassifier(generations=10, population_size=200, cv=20, verbosity=2, config_dict = 'TPOT sparse')
# train(autoML, "Automated ML")


In [None]:
# conf_matrix(autoML)


In [None]:
# class_report(autoML)


In [None]:
# tp(autoML, "Automated Machine Learning")


In [None]:
# Initialise a PyCaret classification experiment on the TF-IDF features and
# let it compare its candidate models; `best` is what compare_models returns.
# NOTE(review): X here is the output of TfidfTransformer.fit_transform (a
# sparse matrix over unigram + bigram features). Verify that the installed
# PyCaret version's setup() accepts this input and a separate target array,
# and that any conversion to a dense frame fits in memory.
# NOTE(review): the value bound to `autoML` is not used by any later cell in
# this notebook.
autoML = setup(X, target=Y)
best = compare_models()

In [None]:
print(best)

In [None]:
evaluate_model(best)

In [None]:
plot_model(best, plot = 'confusion_matrix')

### Logistic Regression Prediction





In [None]:
# Make predictions --> Logistic Regression
predictions = log_model.predict(test_tfidf)
# Submissions
test_ID = test_news.id
submission = pd.DataFrame({'id' : test_ID, 'label' : predictions})

- The submission file will consist of the ID column and a label column. We can just copy the ID column from the test file, make it a dataframe, and then add our label column.



In [None]:
# submission = test_news.id.copy().to_frame()
# submission['prediction'] = predictions

In [None]:
submission.head()


In [None]:
submission.to_csv("./Logistic Regression.csv", index=False) # Convert the submissions to .csv


### Decision Tree Prediction



In [None]:
# Make predictions --> Decision Tree
# (uses the fitted dt_model on the TF-IDF features of the unlabeled test set)
predictions = dt_model.predict(test_tfidf)
# Submissions: one row per test article, pairing its id with the predicted label
test_ID = test_news.id
dt_submission = pd.DataFrame({'id' : test_ID, 'label' : predictions})

In [None]:

dt_submission.head()


In [None]:
dt_submission.to_csv("./Decision Tree.csv", index=False) # Convert the submissions to .csv
