In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
import pickle
import warnings
warnings.simplefilter('ignore')

In [3]:
# Takes all rows from df['body'], cleans them and adds the to the new column df['text']
def preprocess(df):
#   Clean body text and store all cleaned text in new df['text'] column
    text = []
    for index,entry in enumerate(df['body']):
        final = clean_text(entry)
        text.append(final)

    df['text'] = pd.Series(text)
    return df

# Lower -> Tokenize -> Lemmatize -> Remove stop words and non-alpha
# Returns cleaned sentence
def clean_text(entry):
    #     Create tag map for use by the pos tagger
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    entry = entry.lower()
    entry = word_tokenize(entry)
    final = []
    lemmatizer = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = lemmatizer.lemmatize(word,tag_map[tag[0]])
            final.append(word_final)
    return str(final)

def load(name):
    with open(name, 'rb') as f:
        item = pickle.load(f)
    return item

def predict(test_data):
    test_data = preprocess(test_data)
    model = load('../svm_model')
    tfidf = load('../tfidf')

    x = tfidf.transform(test_data['text'])
    y = model.predict(x)
    return y - 1

### Load test data

In [4]:
test_data = pd.read_csv('test_sentences.csv', encoding="ISO-8859-1")
test_data.sample(10)

Unnamed: 0,body,class
1,Fucking trans people.,-1
6,"Trans women are men, but men who date them are...",-1
36,The waves were crashing on the shore; it was a...,0
43,Sometimes it is better to just walk away from ...,0
0,Just because you feel like a women [sic] doesn...,-1
19,Trans person is a good person,1
40,The clock within this blog and the clock on my...,0
48,I often see the time 11:11 or 12:34 on clocks.,0
30,Last Friday in three weekâs time I saw a spo...,0
11,You [trans people] are kidding yourselves if y...,-1


In [7]:
test_data['class'].value_counts()

 0    30
 1    15
-1    15
Name: class, dtype: int64

## Predict

In [134]:
predicted = predict(test_data)
target = test_data['class']


### Model performance

In [135]:
accuracy_score(predicted, target)

0.8666666666666667

In [139]:
print(classification_report(predicted, target))

              precision    recall  f1-score   support

          -1       0.47      1.00      0.64         7
           0       1.00      0.86      0.92        35
           1       1.00      0.83      0.91        18

   micro avg       0.87      0.87      0.87        60
   macro avg       0.82      0.90      0.82        60
weighted avg       0.94      0.87      0.89        60

