In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from nltk.stem import PorterStemmer 
import re
from sklearn.metrics import confusion_matrix



In [2]:
text = pd.read_csv("train.csv")

In [3]:
text = text.drop(["id", "author", "title"], axis = 1)

In [4]:
text["text"] = text["text"].astype(str)

In [5]:
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    s = re.sub("http[\D]+com", " " ,s)
    s = re.sub("[\W]", " " ,s)
    s = re.sub("[\d]", " ", s)
    s = s.lower()
    return s

def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return nltk.word_tokenize(s)

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer ()
    y = [lemmatizer.lemmatize(ps.stem(w)) for w in l]
    return y

def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
    """
    stop_words = stopwords.words('english')
    
    return str([w for w in l if w not in stop_words])


In [6]:
text["processed"] = text["text"].apply(lambda x : clean_up(x))

In [7]:
text["processed"] = text["processed"].apply(lambda x : tokenize(x))

In [8]:
text["processed"] = text["processed"].apply(lambda x : stem_and_lemmatize(x))

In [9]:
text["processed"] = text["processed"].apply(lambda x : remove_stopwords(x))

In [10]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(text["processed"],text['label'],test_size=0.2)

In [11]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [12]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(text['processed'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [13]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Naive Bayes confusion matrix Score -> ",confusion_matrix(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  88.41346153846153
Naive Bayes confusion matrix Score ->  [[193600  30700]
 [ 17500 174200]]


In [14]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("SVM confusion matrix test -> ",confusion_matrix(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  94.66346153846153
SVM confusion matrix test ->  [[198100   9200]
 [ 13000 195700]]


In [15]:
predictions_SVM = SVM.predict(Train_X_Tfidf)

print("SVM Accuracy Score -> ",accuracy_score(Train_Y, predictions_SVM)*100)
print("SVM confusion matrix train -> ",confusion_matrix(predictions_SVM, Train_Y)*100)

SVM Accuracy Score ->  97.890625
SVM confusion matrix train ->  [[809600  17100]
 [ 18000 819300]]


In [None]:
#do the cunfusion matrix
