In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English 

from sklearn.model_selection import train_test_split

from sklearn import model_selection, naive_bayes, svm

from sklearn import metrics
from sklearn.metrics import confusion_matrix


In [2]:
df = pd.read_csv("train.csv")
df["text"] = df["text"].astype(str)
text = df.drop(["title", "id"], axis = 1)

In [3]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = English()
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [4]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [5]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [6]:
from sklearn.model_selection import train_test_split

X = text["text"] # the features we want to analyze
ylabels = text['label'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2)

In [17]:
# SVM Classifier

classifier = svm.SVC(C=1.0, kernel='poly', degree=2, gamma='auto')

# Create pipeline using tfidf_vector
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x0000022E9117B888>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x0000022EF3BFDCA8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
           

In [18]:
predicted = pipe.predict(X_test)

# Model Accuracy
print("SVM Accuracy:",metrics.accuracy_score(y_test, predicted))
print("SVM Precision:",metrics.precision_score(y_test, predicted))
print("SVM Recall:",metrics.recall_score(y_test, predicted))

SVM Accuracy: 0.49206730769230766
SVM Precision: 0.49206730769230766
SVM Recall: 1.0


In [None]:
import seaborn as sns

In [None]:
sns.scatterplot(x = y_test, y = predicted)

In [None]:
predicted_train = pipe.predict(X_train)

# Model Accuracy
print("SVM Accuracy:",metrics.accuracy_score(y_train, predicted_train))
print("SVM Precision:",metrics.precision_score(y_train, predicted_train))
print("SVM Recall:",metrics.recall_score(y_train, predicted_train))

In [None]:
print("SVM matrix:",confusion_matrix(y_test, predicted))

In [None]:
print("SVM matrix train:",confusion_matrix(y_train, predicted_train))

In [None]:
x_plot = list(range(len(y_test)))

In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
plt.scatter(x_plot, predicted - y_test, label = 'pred', alpha = 0.5)
plt.scatter(x_plot, y_test, label = 'test', alpha = 0.5)
plt.legend()
plt.show()