In [228]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import re
import hashlib
import numpy as np
import spacy
import string

from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm
from spacy.matcher import Matcher
from sklearn.metrics import accuracy_score, f1_score 
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [229]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,text,class
0,"Hello, Does it matter iff I use Visa or Master...",card
1,"Good afternoon, I just got refunded for my pur...",card
2,"Hello, I got billed ann extra pound! Thanks",others
3,"Hi, How long does it take for a transfer to sh...",transfer
4,"hi, When can I use money sent to my accountt? ...",transfer


# Preprocessing For text features 

In [230]:
class Preprocessing_text (TransformerMixin):
    def __init__(self, tokenizer, stemmer=None, lower=True, Remove_punc=True, stopwords=None, regex_list=None):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.lower = lower
        self.Remove_punc = Remove_punc
        self.stopwords = stopwords
        self.regex_list = regex_list
        
    def transform (self, df, *_):
        X = list(map(self.clean_sen, df))
        return X
    
    def clean_sen(self, sentence):
        #lower
        if self.lower:
            sentence = sentence.lower()
        
        #remove_punctuation
        if self.Remove_punc:
            sentence = re.sub(r'[^\w\s]', '', sentence)
            
        #remove those in regex_list
        if self.regex_list:
            for regex in self.regex_list:
                sentence = re.sub(regex, '', sentence)
                
        #remove stopwords
        if self.stopwords:
            words = []
            tokenized_words = self.tokenizer.tokenize(sentence)
            for word in tokenized_words:
                if word not in self.stopwords:
                    words.append(word)
            sentence = " ".join(words)
            
        #stemming
        if self.stemmer:
            words = []
            tokenized_words = self.tokenizer.tokenize(sentence)
            for word in tokenized_words:
                if word not in self.stopwords:
                    words.append(self.stemmer.stem(word))
            sentence = " ".join(words)
            
        return sentence
            
    def fit(self, *_):
            return self

In [231]:
def Preprocess_text(df, tokenizer, stemmer=None, lower=True, Remove_punc=True, stopwords=None, regex_list=None):
    cleaner = Preprocessing_text(tokenizer, stemmer=stemmer, lower=lower, Remove_punc=Remove_punc, stopwords=stopwords, regex_list=regex_list)
    
    df_cleaned = df.copy()
    df_cleaned['cleaned_text'] = cleaner.transform(df_cleaned['text'])
    
    return df_cleaned

# Create additional numeric features  

In [233]:
def Preprocess_num(df):
    df_new = df.copy()
    
    #for text column
    df_new["nb_words"] = df_new['text'].str.split().map(len)
    df_new["doc_length"] = df_new['text'].map(len)
    df_new["avg_word_length"] =  df_new['text'].apply(lambda x: np.mean([len(t) for t in x.split(' ')]) if len([len(t) for t in x.split(' ')]) > 0 else 0)
    
    #for cleaned_text column
    df_new["nb_words_cleaned"] = df_new['cleaned_text'].str.split().map(len)
    df_new["doc_length_cleaned"] = df_new['cleaned_text'].map(len)
    df_new["avg_word_length_cleaned"] =  df_new['cleaned_text'].apply(lambda x: np.mean([len(t) for t in x.split(' ')]) if len([len(t) for t in x.split(' ')]) > 0 else 0)
    
    return df_new

# Apply preprocessing for text feature 

In [234]:
tokenizer = WordPunctTokenizer()
stopwords_ = stopwords.words('english')
stemmer = SnowballStemmer('english')
regex_list_ =['hello', 'good afternoon', 'hi', 'good evening', 'good morning', 'kind regards', 'best regards', 'thanks']

cleaned_train_data = Preprocess_text(df, tokenizer=tokenizer, stemmer=None, lower=True, Remove_punc=True, stopwords=None, regex_list=regex_list_)
cleaned_train_data.head()

Unnamed: 0,text,class,cleaned_text
0,"Hello, Does it matter iff I use Visa or Master...",card,does it matter iff i use visa or mastercard
1,"Good afternoon, I just got refunded for my pur...",card,i just got refunded for my purchase over two ...
2,"Hello, I got billed ann extra pound! Thanks",others,i got billed ann extra pound
3,"Hi, How long does it take for a transfer to sh...",transfer,how long does it take for a transfer to show ...
4,"hi, When can I use money sent to my accountt? ...",transfer,when can i use money sent to my accountt


# Apply new numeric features 

In [214]:
cleaned_train_data = Preprocess_num(cleaned_train_data)
cleaned_train_data.head()

Unnamed: 0,text,class,cleaned_text,nb_words_cleaned,doc_length_cleaned,avg_word_length_cleaned
0,"Hello, Does it matter iff I use Visa or Master...",card,does it matter iff i use visa or mastercard,9,45,3.181818
1,"Good afternoon, I just got refunded for my pur...",card,i just got refunded for my purchase over two ...,11,56,3.384615
2,"Hello, I got billed ann extra pound! Thanks",others,i got billed ann extra pound,6,30,2.875
3,"Hi, How long does it take for a transfer to sh...",transfer,how long does it take for a transfer to show ...,14,63,3.0
4,"hi, When can I use money sent to my accountt? ...",transfer,when can i use money sent to my accountt,9,42,2.909091


# Spilit the data

In [235]:
train, test = train_test_split(cleaned_train_data, test_size=0.2, random_state=41)

# Define selectors to use it in the pipline 

In [236]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]
    

# Creating the piplines

In [237]:
text = Pipeline([
    ('selector', TextSelector(key='cleaned_text')),
    ('vectorizer', TfidfVectorizer())
])

nb_words = Pipeline([
    ('selector', NumberSelector(key='nb_words')),
    ('scaler', StandardScaler())
])

doc_length = Pipeline([
    ('selector', NumberSelector(key='doc_length')),
    ('scaler', StandardScaler())
])

avg_word_length = Pipeline([
    ('selector', NumberSelector(key='avg_word_length')),
    ('scaler', StandardScaler())
])

nb_words_cleaned = Pipeline([
    ('selector', NumberSelector(key='nb_words_cleaned')),
    ('scaler', StandardScaler())
])

doc_length_cleaned = Pipeline([
    ('selector', NumberSelector(key='doc_length_cleaned')),
    ('scaler', StandardScaler())
])

avg_word_length_cleaned = Pipeline([
    ('selector', NumberSelector(key='avg_word_length_cleaned')),
    ('scaler', StandardScaler())
])

feats = FeatureUnion([
    ('text', text),
    ('nb_words', nb_words),
    ('doc_length', doc_length),
    ('avg_word_length', avg_word_length),
    ('nb_words_cleaned', nb_words_cleaned),
    ('doc_length_cleaned', doc_length_cleaned),
    ('avg_word_length_cleaned', avg_word_length_cleaned)
])

pipline = Pipeline([
    ('text', text),
    ('clf', SVC(random_state=41))
])

In [238]:
pipline.fit(train, train['class'])


Pipeline(steps=[('text',
                 Pipeline(steps=[('selector', TextSelector(key='cleaned_text')),
                                 ('vectorizer', TfidfVectorizer())])),
                ('clf', SVC(random_state=41))])

In [239]:
y_preds = pipline.predict(test)
f1 = f1_score(test['class'], y_preds, average='macro')
f1

0.9116138190207869

# Hyperparameter tuning

In [249]:
param_grid = {'clf__C':np.arange(1,21),
             'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'clf__kernel' : ['linear', 'rbf', 'poly']} 

In [250]:
clf = GridSearchCV(pipline, param_grid=param_grid, n_jobs=-1)

In [251]:
clf.fit(train, train['class'].values)
y_preds = clf.predict(test)
f1 = f1_score(test['class'], y_preds, average='macro')
f1

0.9231586521985096

# Submission

In [252]:
data_test = pd.read_csv('data/test_dataset.csv')
cleaned_test = Preprocess_text(data_test, tokenizer=tokenizer, lower=True, regex_list=regex_list_, Remove_punc=True, stemmer=None,stopwords=None)
#cleaned_test = Preprocess_num(cleaned_test)
cleaned_test.head()

Unnamed: 0,text,cleaned_text
0,How do I locate my card?,how do i locate my card
1,Why won't my card show up on the app?,why wont my card show up on the app
2,I need to know your exchange rates.,i need to know your exchange rates
3,I purchased something in a foreign currency bu...,i purchased sometng in a foreign currency but ...
4,My statement has a dollar I have been charged ...,my statement has a dollar i have been charged ...


In [253]:
cleaned_test.shape

(1042, 2)

In [254]:
y_test_preds = clf.predict(cleaned_test)
y_test_preds.shape

(1042,)

In [255]:
submission = pd.DataFrame({'class':y_test_preds})
submission

Unnamed: 0,class
0,card
1,card
2,others
3,card
4,others
...,...
1037,card
1038,others
1039,others
1040,others


In [256]:
submission.to_csv('data/sub_hypergamma_kernal_regex_list.csv', index=False)