In [257]:
import re
import numpy as np
import pandas as pd
import nltk
#nltk.download('all')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline
import string

In [258]:
df = pd.read_csv('All_train_data.csv').head(1000)

In [259]:
punctuations = string.punctuation
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


In [260]:
class Predictors(TransformerMixin):
  def transform(self, Sentence, **transform_params):
    return [clean_text(text) for text in Sentence]
  def fit(self, X, y=None, **fit_params):
    return self
  def get_params(self, deep=True):
    return {}


def clean_text(text):
  return text.strip().lower()
def tokenize(sentence):
  words = nltk.word_tokenize(sentence)
  pos_tags = nltk.pos_tag(words)
  tokens = [lemmatizer.lemmatize(word).lower().strip() if(tag!='PRP' or tag!='PRP$') else word.lower().strip() for word, tag in pos_tags]
  tokens = [tok for tok in tokens if(tok not in stop_words and tok not in punctuations)]
  return tokens
  # for word, pos in pos_tags:
  #   print(word, pos)

In [261]:
vectorizer = CountVectorizer(tokenizer = tokenize, ngram_range=(1, 1))
classifier = LinearSVC()

pipe = Pipeline([('cleaner', Predictors()), ('vectorizer', vectorizer), ('classifier', classifier)])

In [262]:
X_train, X_test, y_train, y_test = train_test_split(df['input'], df['labels'], train_size=0.5, random_state=15)

In [263]:
pipe.fit(X=X_train, y=y_train)
pred_data = pipe.predict(X_test) 



In [264]:
Score = accuracy_score(y_test, pred_data)
f_score = f1_score(y_test, pred_data)
conf = confusion_matrix(y_test, pred_data)
print(f"Accuracy Score is :{Score}")
print(f"F1 Score is :{f_score}")
print(f"Confusion Matrix Score is :\n{conf}")

Accuracy Score is :0.468
F1 Score is :0.4784313725490196
Confusion Matrix Score is :
[[112 147]
 [119 122]]
