In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
#import preprocessor as p
import csv, re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.corpus import words
import string
import codecs
from sklearn.model_selection import train_test_split
import itertools
from string import digits
from autocorrect import spell 
import wordninja
from nltk.stem import WordNetLemmatizer

In [13]:
def nb_classify(input_sentences, no_of_classes):
    """Classify sentences with the pickled Naive Bayes classifier.

    Parameters
    ----------
    input_sentences : sequence of str
        Sentences to classify. Each one is tokenized with
        ``nltk.word_tokenize``.
    no_of_classes : int
        ``2`` loads the classifier from the binary-classification folder;
        any other value loads the one from the multiclass folder.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(input_sentences), 1)`` holding ``int(label)``
        for every sentence (stored as floats, since it starts as ``np.zeros``).
    """
    if no_of_classes == 2:
        dataset_path = 'Data/Datasets/Binary Classification/'
    else:
        dataset_path = 'Data/Datasets/Multiclass Classification/'

    # NOTE(review): joblib.load unpickles its input, which can execute
    # arbitrary code -- only point these paths at trusted files.
    clf = joblib.load(dataset_path+'nb_classifier.pkl')
    # The feature list is read from the shared Data/ folder, not from
    # dataset_path.
    word_features = joblib.load('Data/nb_word_features.pkl')

    pred = np.zeros((len(input_sentences),1))
    for idx, line in enumerate(input_sentences):
        # Hash the tokens once so each feature lookup is O(1) instead of a
        # scan over the token list.
        tokens = set(nltk.word_tokenize(line))
        featured_item = {feature: (feature in tokens) for feature in word_features}
        pred[idx] = int(clf.classify(featured_item))

    return pred

In [None]:
def svm_classify(input_sentences, no_of_classes):
    """Classify sentences with the pickled SVM classifier.

    Parameters
    ----------
    input_sentences : sequence of str
        Sentences to classify. Each one is tokenized with ``word_tokenize``
        and tokens that are not in the stored vocabulary are dropped.
    no_of_classes : int
        ``2`` loads the classifier and encoder from the binary-classification
        folder; any other value loads them from the multiclass folder.

    Returns
    -------
    The value returned by ``clf.predict`` for the encoded sentences.
    """
    if no_of_classes == 2:
        dataset_path = 'Data/Datasets/Binary Classification/'
    else:
        dataset_path = 'Data/Datasets/Multiclass Classification/'

    # NOTE(review): joblib.load unpickles its input, which can execute
    # arbitrary code -- only point these paths at trusted files.
    clf = joblib.load(dataset_path+'svm_classifier.pkl')
    # presumably a fitted encoder/vectorizer matching the classifier -- verify
    onehot_enc = joblib.load(dataset_path+'svm_encode.pkl')
    vocabulary = joblib.load('Data/svm_vocabulary.pkl')

    # Build the filtered token lists directly rather than removing items
    # from a list while iterating over it (quadratic and easy to break).
    tokenized_input_sent = [
        [w for w in word_tokenize(s) if w in vocabulary]
        for s in input_sentences
    ]
    pred = clf.predict(onehot_enc.transform(tokenized_input_sent))

    return pred

In [3]:
def ens_classify(input_sentences, no_of_classes):
    """Classify sentences by averaging SVM and Naive Bayes probabilities.

    Parameters
    ----------
    input_sentences : sequence of str
        Sentences to classify.
    no_of_classes : int
        ``2`` loads the models from the binary-classification folder; any
        other value loads them from the multiclass folder.

    Returns
    -------
    numpy.ndarray
        For every sentence, the column index with the highest averaged
        probability.

    Notes
    -----
    The Naive Bayes probabilities are written into column ``int(label)``.
    NOTE(review): this assumes the SVM's probability columns are ordered the
    same way as those integer labels -- confirm against ``clf.classes_``.
    """
    if no_of_classes == 2:
        dataset_path = 'Data/Datasets/Binary Classification/'
    else:
        dataset_path = 'Data/Datasets/Multiclass Classification/'

    # NOTE(review): joblib.load unpickles its input, which can execute
    # arbitrary code -- only point these paths at trusted files.
    svm_clf = joblib.load(dataset_path+'svm_classifier.pkl')
    onehot_enc = joblib.load(dataset_path+'svm_encode.pkl')
    vocabulary = joblib.load('Data/svm_vocabulary.pkl')

    # Keep only in-vocabulary tokens; built with a comprehension instead of
    # removing items from a list while iterating over it.
    tokenized_input_sent = [
        [w for w in word_tokenize(s) if w in vocabulary]
        for s in input_sentences
    ]
    svm_pred_prob = np.asarray(
        svm_clf.predict_proba(onehot_enc.transform(tokenized_input_sent))
    )

    nb_clf = joblib.load(dataset_path+'nb_classifier.pkl')
    word_features = joblib.load('Data/nb_word_features.pkl')

    # Size the Naive Bayes matrix from the SVM output rather than a fixed 5,
    # so the binary models (2 columns) can be averaged as well.
    n_classes = svm_pred_prob.shape[1]
    nb_pred_prob = np.zeros((len(input_sentences), n_classes))
    for idx, line in enumerate(input_sentences):
        tokens = set(nltk.word_tokenize(line))
        featured_item = {feature: (feature in tokens) for feature in word_features}
        dist = nb_clf.prob_classify(featured_item)
        for label in dist.samples():
            # int(...) mirrors nb_classify, which also casts the label, so
            # string labels such as '3' index the right column.
            nb_pred_prob[idx][int(label)] = dist.prob(label)

    # Unweighted mean of the two probability matrices.
    averaged = (svm_pred_prob + nb_pred_prob) / 2.0
    ensemble_pred = averaged.argmax(axis=1)

    return ensemble_pred

In [9]:
def expand_abbrv(input_string):
    """Replace slang abbreviations in a sentence with their full forms.

    The sentence is split on single spaces. Each token is stripped of every
    character except letters, digits, ``-``, ``_`` and ``.``, upper-cased, and
    looked up in the ``abbv`` column of ``Data/slangs.csv``. A matching token
    is replaced (whole token, including any stripped characters) by the
    lower-cased ``fullform`` value.

    Parameters
    ----------
    input_string : str
        Sentence to process.

    Returns
    -------
    str
        The sentence with known abbreviations expanded; other tokens and the
        original spacing are left untouched.
    """
    # Note: the CSV is re-read on every call.
    slangs = pd.read_csv("Data/slangs.csv", delimiter=",")

    # abbreviation -> full form; the first row wins if an abbreviation repeats.
    full_forms = {}
    for abbv, fullform in zip(slangs['abbv'], slangs['fullform']):
        full_forms.setdefault(abbv, fullform)

    tokens = input_string.split(" ")
    for pos, token in enumerate(tokens):
        # Use the same upper-cased key for the membership test and the
        # lookup. Looking up with the raw token selected no rows for
        # lower-case slang and inserted the text "Series([], )".
        key = re.sub('[^a-zA-Z0-9-_.]', '', token).upper()
        if key in full_forms:
            tokens[pos] = str(full_forms[key]).lower()
    return ' '.join(tokens)

In [10]:
def expand_contractions(input_string):
    """Replace contractions in a sentence with their full forms.

    The sentence is split on single spaces. Each token is lower-cased and
    looked up in the ``contraction`` column of ``Data/contractions.csv``. A
    matching token is replaced by the lower-cased ``fullform`` value.

    Parameters
    ----------
    input_string : str
        Sentence to process.

    Returns
    -------
    str
        The sentence with known contractions expanded; other tokens and the
        original spacing are left untouched.
    """
    # Note: the CSV is re-read on every call.
    contra = pd.read_csv("Data/contractions.csv", delimiter=",")

    # contraction -> full form; the first row wins if a contraction repeats.
    full_forms = {}
    for contraction, fullform in zip(contra['contraction'], contra['fullform']):
        full_forms.setdefault(contraction, fullform)

    tokens = input_string.split(" ")
    for pos, token in enumerate(tokens):
        # Use the same lower-cased key for the membership test and the
        # lookup. Looking up with the raw token selected no rows for
        # capitalised input ("Don't") and inserted the text "Series([], )".
        key = token.lower()
        if key in full_forms:
            tokens[pos] = str(full_forms[key]).lower()
    return ' '.join(tokens)

In [11]:
def remove_hashtags(input_string):
    """Strip every '#' character from a sentence.

    The sentence is split on single spaces, each piece has its '#'
    characters deleted, and the pieces are joined back with single spaces,
    so the original spacing is preserved.
    """
    pieces = (piece.replace('#','') for piece in input_string.split(" "))
    return ' '.join(pieces)

In [12]:
def process_data(dataset):
    """Run the text-cleaning pipeline over a pandas Series of sentences.

    The steps run in this order: drop stop words (negations are kept), drop
    tokens containing '@', strip '#', expand contractions, expand slang
    abbreviations, strip punctuation, strip digits, cap repeated characters
    at two, lower-case, and strip punctuation again (which also collapses
    whitespace left behind by the earlier steps).

    Returns the cleaned Series.
    """
    # Negations carry meaning for the classifier, so they are not treated as
    # stop words.
    negations = ["don't","weren't","doesn't","isn't","aren't","not","needn't","won't","hasn't","mightn't","didn't","haven't","hadn't","shouldn't","wasn't","mustn't","couldn't"]
    stop_words = set(stopwords.words('english')).difference(negations)

    punctuation_table = str.maketrans('', '', string.punctuation)
    digit_table = str.maketrans('', '', digits)

    def _drop_stop_words(text):
        # Case-sensitive comparison: this runs before lower-casing.
        return ' '.join(tok for tok in text.split() if tok not in stop_words)

    def _drop_mentions(text):
        # Removes any token containing '@'; links are not touched here.
        return ' '.join(tok for tok in text.split() if '@' not in tok)

    def _strip_punctuation(text):
        # Delete punctuation characters, then normalise whitespace.
        return ' '.join(text.translate(punctuation_table).split())

    def _squeeze_repeats(text):
        # Keep at most two characters from every run of identical characters.
        return ''.join(
            ''.join(itertools.islice(run, 2)) for _, run in itertools.groupby(text)
        )

    # step 1: remove stop words
    cleaned = dataset.apply(_drop_stop_words)

    # step 2: remove @ words
    cleaned = cleaned.apply(_drop_mentions)

    # step 3: remove hashtags
    cleaned = cleaned.apply(remove_hashtags)

    # step 4: expand contractions
    cleaned = cleaned.apply(expand_contractions)

    # step 5: expand abbreviations
    cleaned = cleaned.apply(expand_abbrv)

    # step 6: remove punctuation
    cleaned = cleaned.apply(_strip_punctuation)

    # step 7: remove digits
    cleaned = cleaned.str.translate(digit_table)

    # step 8: shorten elongated words
    cleaned = cleaned.apply(_squeeze_repeats)

    # step 9: all lower case
    cleaned = cleaned.str.lower()

    # step 10: remove punctuation again (also collapses leftover whitespace)
    cleaned = cleaned.apply(_strip_punctuation)

    return cleaned