In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import nltk


!pip install scikit-plot


import re
import string

import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt

from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import gensim
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

from nltk.corpus import stopwords



### Data Set-Up

In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('Y_train.csv')
Y_test = pd.read_csv('Y_test.csv')
X_train.shape

(3599999, 3)

### TF-IDF vectorization

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectoriser = TfidfVectorizer()


In [4]:
X_train = X_train.fillna(X_train.mode().iloc[0])
X_train.isnull().sum()

Unnamed: 0    0
title         0
text          0
dtype: int64

In [5]:
X_test = X_test.fillna(X_test.mode().iloc[0])
X_test.isnull().sum()

Unnamed: 0    0
title         0
text          0
dtype: int64

In [6]:
tfidf_vectoriser.fit(X_train['text'])
X_train_tf = tfidf_vectoriser.transform(X_train['text'])
X_test_tf = tfidf_vectoriser.transform(X_test['text'])
X_train_tf.shape, X_test_tf.shape                        

((3599999, 2198346), (399999, 2198346))

In [7]:
Y_train = Y_train.drop("Unnamed: 0", axis=1)

In [8]:
print(f'No. of Features:{X_train_tf.shape[1]}')

No. of Features:2198346


### Naive Bayes

In [9]:
#MultinomialNB
# Build the classifier
multinomial_nb_model = MultinomialNB()

#  Train the classifier
multinomial_nb_model.fit(X_train_tf, np.ravel(Y_train))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
#BernoulliNB
# Build the classifier
bernoulli_nb_model = BernoulliNB()

#  Train the classifier
bernoulli_nb_model.fit(X_train_tf, Y_train)

  y = column_or_1d(y, warn=True)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

### Logistic Regression

In [11]:
pipe = Pipeline([('standardscaler', StandardScaler(with_mean=False)),
                 ('logisticregression', LogisticRegression(random_state=42,solver='lbfgs', max_iter=1000))])
pipe.fit(X_train_tf, Y_train)

  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=False, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=42,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

### Linear SVC

In [13]:
linear_svc_model = LinearSVC()
linear_svc_model.fit(X_train_tf, Y_train)

  y = column_or_1d(y, warn=True)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_tf, Y_train) 

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train_tf, Y_train) 


### Performance Evaluation

In [None]:
def report(model,X,Y,modelName,train=True):
    YPred = model.predict(X)
    if train:
        print(f"{modelName} Training Data Result:\n================================================")
    else:
        print(f"{modelName} Test Data Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(Y, YPred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(Y, YPred)}\n")
    print("_______________________________________________")
    clf_report = pd.DataFrame(classification_report(Y, YPred, output_dict=True))
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    skplt.metrics.plot_confusion_matrix(Y, YPred)

In [None]:
report(multinomial_nb_model,X_train_tf,Y_train,'Multinomial NB Model')

In [None]:
report(bernoulli_nb_model,X_train_tf,Y_train,'Bernoulli NB Model')

In [None]:
report(pipe,X_train_tf,Y_train,'Logistic regression Model')

In [None]:
report(linear_svc_model,X_train_tf,Y_train,'Linear SVC Model')

In [None]:
report(model_dt,X_train_tf,Y_train,'Decision trees Model')

In [None]:
report(model_rf,X_train_tf,Y_train,'Random forest Model')

### Test Data 

In [None]:
report(multinomial_nb_model,X_test_tf,Y_test,'Multinomial NB Model',False)

In [None]:
report(bernoulli_nb_model,X_test_tf,Y_test,'Bernoulli NB Model',False)

In [None]:
report(pipe,X_test_tf,Y_test,'Logistic NB Model',False)

In [None]:
report(linear_svc_model,X_test_tf,Y_test,'Linear SVC Model',False)

In [None]:
report(model_dt,X_test_tf,Y_test,'Decision trees Model')

In [None]:
report(model_rf,X_test_tf,Y_test,'Random Forest Model')

### 2.Doc2Vev


In [None]:
#Lancaster Stemmer
ls = nltk.LancasterStemmer()
X_train['title'] = X_train['title'].apply(lambda x: " ".join(ls.stem(x) for x in x.split()))
X_train['text'] = X_train['text'].apply(lambda x: " ".join(ls.stem(x) for x in x.split()))

### Feature Extraction

In [None]:
def feature_extract(dataset):
    data = dataset['title'] + ' ' + dataset['text']
    data = [nltk.word_tokenize(l) for l in data]
    tag_doc = [TaggedDocument(doc, [i]) for i, doc in enumerate(data)]
    model = Doc2Vec(tag_doc)
    return np.array([model.dv[i] for i in range(len(model.dv))])

train_array = feature_extract(X_train)

In [None]:
print(f'No. of Features {train_array.shape[1]}')

### Model Comparison - Decision Tree,Random Forest,Naive Bayes,Logistic Regression

In [None]:
#BernoulliNB
# Build the classifier
bernoulli_nb_model = BernoulliNB()

#  Train the classifier
bernoulli_nb_model.fit(train_array, Y_train)

In [None]:
log_regg = LogisticRegression(random_state=42,solver='lbfgs', max_iter=1000)
log_regg.fit(train_array, Y_train)

In [None]:
linear_svc_model = SVC(kernel='linear')
linear_svc_model.fit(train_array, Y_train)

In [None]:
deci_tree = DecisionTreeClassifier(random_state=23,max_depth=15).fit(train_array,Y_train)

In [None]:
rand_forest = RandomForestClassifier(random_state=25,max_depth=10,n_jobs=-1).fit(train_array,Y_train)

### Performance Evaluation

### Training Data

In [None]:
report(bernoulli_nb_model,train_array,Y_train,'Bernoulli NB Model')

In [None]:
report(log_regg,train_array,Y_train,'Logistic Regression Model')

In [None]:
report(linear_svc_model,train_array,Y_test,'Linear SVC Model',False)

In [None]:
report(deci_tree,train_array,Y_train,'Decision Tree')

In [None]:
report(rand_forest,train_array,Y_train,'Random Forest')

### Test Data

In [None]:

test_array = feature_extract(X_test)

In [None]:
report(bernoulli_nb_model,test_array,Y_test,'Bernoulli NB Model',False)

In [None]:
report(log_regg,test_array,Y_test,'Logistic Regression Model',False)

In [None]:
report(linear_svc_model,test_array,Y_test,'Linear SVC Model',False)

In [None]:
report(deci_tree,test_array,Y_test,'Decision Tree',False)

In [None]:
report(rand_forest,test_array,Y_test,'Random Forest',False)