In [None]:
#Google Colab was used to create this script.
#To run it without errors, please change the path in every cell that has this comment: "Includes google drive path"
#All the following packages are needed to run this script without errors

In [None]:
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

import gensim.models as g


In [None]:
# Show the installed gensim version.
# NOTE(review): presumably needed to check compatibility with the pre-trained
# Doc2Vec model loaded further down — TODO confirm.
import gensim
print(gensim.__version__)

In [None]:
# Colab only: mount Google Drive at /content/gdrive so the dataset and model
# files can be read. force_remount=True re-mounts even if Drive is already mounted.
# NOTE(review): the relative 'gdrive/My Drive/...' paths used below assume the
# working directory is /content — confirm when running outside a default Colab session.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
#Includes google drive path.
#Function: reads csv and prints shape, null values summary column-wise. 
def read_df(df_name, base_dir='gdrive/My Drive/News Articles Dataset/'):
    """Read ``<base_dir>/<df_name>.csv`` and print a short summary of it.

    Parameters
    ----------
    df_name : str
        File name without the ``.csv`` extension (e.g. ``'train'``).
    base_dir : str, optional
        Folder that contains the CSV files. Defaults to the Google Drive
        folder used by this notebook, so existing calls behave as before;
        pass another folder to run the notebook outside that Drive layout.

    Returns
    -------
    pandas.DataFrame
        The file contents, read with ``header=None`` so the columns are
        labelled 0, 1, 2, ...
    """
    import os  # local import keeps this cell self-contained

    csv_path = os.path.join(base_dir, df_name + '.csv')
    df = pd.read_csv(csv_path, header=None)
    print('Shape:', df.shape)
    print('Null values')
    # One boolean per column: True when that column has at least one missing value.
    print(df.isnull().any())
    return df

In [None]:
# Load the training split (train.csv, read without a header row).
df = read_df('train')

In [None]:
#Function: renames and combines headline and content
def rename_combine_cols(df, col1, col2, col3, newcol):
    """Name the first three positional columns and add a combined text column.

    Columns 0, 1 and 2 of ``df`` are renamed to ``col1``, ``col2`` and
    ``col3``. A new column ``newcol`` holds ``col2`` and ``col3`` joined by a
    single space. ``df`` itself is not modified; a renamed copy is returned.
    """
    positional_to_named = dict(enumerate((col1, col2, col3)))
    renamed = df.rename(columns=positional_to_named)
    left_text = renamed[col2]
    right_text = renamed[col3]
    renamed[newcol] = left_text + ' ' + right_text
    return renamed

In [None]:
# Name the three columns and add 'combined' = headline + ' ' + content.
df = rename_combine_cols(df, 'class', 'headline', 'content', 'combined')

In [None]:
# Bar chart of the number of training rows per class, sorted by ascending count.
df['class'].value_counts().sort_values().plot(kind = 'bar')

In [None]:
# Exact number of training rows per class.
df['class'].value_counts()

In [None]:
#Function: removes special characters, punctuations and numbers; lowercases; tokenizes and removes stop words
def text_preprocess(text):
    """Clean one document and return it as a space-joined string of tokens.

    Steps, in order:
      1. strip surrounding whitespace and turn backslashes into spaces;
      2. turn hyphens into spaces, then delete punctuation and digits;
      3. lowercase and tokenize with ``word_tokenize``;
      4. drop English stop words;
      5. if the last token repeats the first one, drop the last token.

    Lemmatization is not applied (``lemmatizer`` is set up in the import
    cell but deliberately unused here).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left after cleaning.
    """
    text_stripwhitespaces = text.strip().replace("\\", " ")
    # Raw string so that `\|` is not an invalid Python string escape; the regex
    # itself is unchanged (inside a character class `\|` is a literal pipe).
    text_clean = re.sub(r'[~`!@#$%^&*():;"{}_/?><\|.,`0-9]', '', text_stripwhitespaces.replace('-', ' '))
    tokens = word_tokenize(text_clean.lower())
    words = [word for word in tokens if word not in stop_words]
    # Only compare first and last when they are different positions: with no
    # tokens words[0] would raise IndexError, and with a single token the
    # comparison is trivially true and would delete the document's only word.
    if len(words) > 1 and words[0] == words[-1]:
        words.pop()
    return ' '.join(words)

In [None]:
# Clean every training document. NOTE: this overwrites 'combined' in place, so
# re-running the cell preprocesses text that has already been cleaned.
df['combined'] = df['combined'].apply(text_preprocess)

In [None]:
# Count how often each token occurs across all preprocessed training documents.
# str.split() with no argument skips empty strings, so a document that became
# empty during preprocessing does not add a spurious '' token to the counts.
word_counts = Counter(word for doc in df['combined'] for word in doc.split())
# The threshold is inclusive (count >= 100); the label below says so explicitly.
frequent_word_total = sum(1 for count in word_counts.values() if count >= 100)
print('No. of words occurring at least 100 times:', frequent_word_total)

In [None]:
# Preprocessed training documents as a NumPy array (input to the vectorizer below).
# NOTE: the name `text` is reused later for the test documents.
text = df['combined'].iloc[:].values

TFIDF Vectorization

In [None]:
# TF-IDF over word 1- to 3-grams, keeping at most 5000 features and ignoring
# terms that appear in more than 30% of the documents (max_df = 0.3).
vec = TfidfVectorizer(max_features = 5000, ngram_range = (1, 3), max_df = 0.3)

In [None]:
# Learn the vocabulary and IDF weights on the training text only, then convert
# the sparse result to a dense array.
# NOTE(review): the dense array has one row per document and up to 5000 columns,
# which can be memory-heavy; GaussianNB needs dense input — confirm it fits in RAM.
x_train = vec.fit_transform(text).toarray()
# Class labels aligned row-by-row with x_train.
y_train = df['class'].iloc[:].values

Preparing Test Data: treating the test data with the same pre-processing steps as the train data

In [None]:
# The test split goes through exactly the same pipeline as the training split.
df1 = rename_combine_cols(read_df('test'), 'class', 'headline', 'content', 'combined')
df1['combined'] = df1['combined'].apply(text_preprocess)
text = df1['combined'].values
# transform (not fit_transform): reuse the vocabulary and IDF weights learned on
# the training data so train and test features share the same columns.
x_test = vec.transform(text).toarray()
y_test = df1['class'].values

In [None]:
#Function: trains model, predicts on train data and test data, gets training and testing accuracy
def train_fit_predict(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training data and print train and test accuracy.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels.

    Returns
    -------
    tuple
        ``(y_test, y_testpred, model)``: the test labels as passed in, the
        predictions for ``x_test``, and the fitted model.
    """
    model.fit(x_train, y_train)
    y_trainpred = model.predict(x_train)
    print('Training Accuracy: ', accuracy_score(y_train, y_trainpred))
    y_testpred = model.predict(x_test)
    # Computed once and reported under the correct label; this line used to be
    # printed as "Training Accuracy", which made the two numbers indistinguishable.
    test_accuracy = accuracy_score(y_test, y_testpred)
    print('Testing Accuracy: ', test_accuracy)
    return y_test, y_testpred, model

Naive Bayes Model

In [None]:
# Gaussian Naive Bayes with default hyperparameters.
model = GaussianNB()

In [None]:
# Fit Naive Bayes on the TF-IDF features and print train/test accuracy.
# y_test and model are rebound to the same objects that were passed in.
y_test, y_testpred, model = train_fit_predict(model, x_train, y_train, x_test, y_test)

In [None]:
#Function: Prints and plots confusion matrix as heatmap 
def plot_conf_matrix(y, yhat, model, labels=(1, 2, 3, 4)):
    """Print the confusion matrix and draw it as an annotated heatmap.

    Parameters
    ----------
    y : array-like
        True class labels.
    yhat : array-like
        Predicted class labels, aligned with ``y``.
    model : object
        Its ``str()`` representation is used as the plot title.
    labels : sequence, optional
        Class labels to include, in the order they should appear on both
        axes. Defaults to ``(1, 2, 3, 4)``, the classes this notebook uses.

    Returns
    -------
    None
        The matrix is printed and the figure is shown.
    """
    labels = list(labels)
    # Rows are the true classes and columns the predicted classes.
    conf_matrix = confusion_matrix(y, yhat, labels=labels)
    print(conf_matrix)
    conf_matrix_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)
    fig, ax = plt.subplots(figsize=(10, 7))
    # fmt='g' prints the cell counts as plain integers rather than in scientific notation.
    sns.heatmap(conf_matrix_df, annot=True, fmt='g', cmap='Blues', ax=ax)
    ax.set_title(str(model))
    ax.set_xlabel('Predicted class')
    ax.set_ylabel('True class')
    plt.show()

In [None]:
# Confusion matrix of the Naive Bayes predictions on the test set (TF-IDF features).
plot_conf_matrix(y_test, y_testpred, model)

Logistic Regression

In [None]:
# L2-regularised logistic regression using the liblinear solver.
# C=5 is the inverse regularisation strength, random_state=0 fixes the solver's
# randomness, and verbose=1 makes the solver print its progress.
logr = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)

In [None]:
# Fit logistic regression on the TF-IDF features and print train/test accuracy.
# NOTE: `model` is rebound here and now refers to the same object as `logr`.
y_test, y_testpred, model = train_fit_predict(logr, x_train, y_train, x_test, y_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set (TF-IDF features).
plot_conf_matrix(y_test, y_testpred, logr)

Doc2Vec Vectorization using pre-trained doc2vec model

In [None]:
#Pre-trained Doc2Vec model trained on Associated Press News articles
#To reproduce this section, the model has to be downloaded first. Download link: https://github.com/jhlau/doc2vec

In [None]:
#Includes google drive path
# Location of the downloaded pre-trained Doc2Vec model file.
d2v_path = 'gdrive/My Drive/doc2vec_pretrained/doc2vec.bin'  

In [None]:
# Load the pre-trained Doc2Vec model from disk.
# NOTE(review): gensim's load() relies on pickle, so only load model files from
# a trusted source. The file may also require a gensim version compatible with
# the one it was saved with — TODO confirm against the printed version above.
d2v = g.Doc2Vec.load(d2v_path)

In [None]:
#vectorizing train combined text using pre-trained doc2vec model
# Each preprocessed document is split on whitespace and passed to infer_vector
# as a list of tokens; the result is one vector per training document.
# NOTE(review): infer_vector may not be deterministic across runs unless the
# model is seeded — TODO confirm if exact reproducibility matters.
combinedtext_doc2vec = [d2v.infer_vector(i.split()) for i in df.combined.tolist()] 

In [None]:
# Stack the per-document vectors into one 2-D array (one row per training document).
x_train_d2v = np.vstack(combinedtext_doc2vec)

In [None]:
#Includes google drive path
# Save the training Doc2Vec features to Drive so they can be reloaded without re-inferring them.
np.save('gdrive/My Drive/x_train_d2v.npy', x_train_d2v) 

In [None]:
#vectorizing test combined text using pre-trained doc2vec model
# Same procedure as for the training documents: whitespace-split tokens are
# passed to infer_vector, giving one vector per test document.
combinedtext_doc2vec1 = [d2v.infer_vector(i.split()) for i in df1.combined.tolist()] 

In [None]:
# Stack the per-document vectors into one 2-D array (one row per test document).
x_test_d2v = np.vstack(combinedtext_doc2vec1)

In [None]:
#Includes google drive path
# Save the test Doc2Vec features to Drive so they can be reloaded without re-inferring them.
np.save('gdrive/My Drive/x_test_d2v.npy', x_test_d2v) 

By vectorizing our text corpus using doc2vec, we have reduced our feature dimensions from 5000 to 300. The TF-IDF feature matrix was large and sparse (mostly zeros); now each document is represented by a dense, compact feature vector.

Naive Bayes on Doc2vec features of the news articles

In [None]:
# Fresh Gaussian Naive Bayes instance (default hyperparameters) for the Doc2Vec features.
model = GaussianNB()

In [None]:
# Fit Naive Bayes on the Doc2Vec features and print train/test accuracy.
y_test, y_testpred, model = train_fit_predict(model, x_train_d2v, y_train, x_test_d2v, y_test)

In [None]:
# Confusion matrix of the Naive Bayes predictions on the test set (Doc2Vec features).
plot_conf_matrix(y_test, y_testpred, model)

Logistic Regression on Doc2vec features of the news articles

In [None]:
# Fit logistic regression on the Doc2Vec features and print train/test accuracy.
# NOTE: this reuses the `logr` instance that was fitted on TF-IDF features earlier;
# calling fit again replaces that earlier fit, so `logr` no longer predicts on TF-IDF input.
y_test, y_testpred, model = train_fit_predict(logr, x_train_d2v, y_train, x_test_d2v, y_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set (Doc2Vec features).
plot_conf_matrix(y_test, y_testpred, logr)