In [137]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import re

from nltk.corpus import stopwords

import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

### Define a helper that fits a model, evaluates it on held-out data, and cross-validates it

In [1]:
def auto_ML(model, Xtrain, Ytrain, Xtest, Ytest, folds= 10):
    """Fit `model`, report hold-out metrics, and report a cross-validation score.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict` (passed to `cross_val_score`)
    Xtrain, Ytrain : training features and labels
    Xtest, Ytest : held-out features and labels used for accuracy / confusion matrix
    folds : value forwarded to `cross_val_score(cv=...)`; default 10

    Returns
    -------
    None. The predictions, accuracy, confusion matrix and mean cross-validation
    score are printed.
    """
    model.fit(Xtrain, Ytrain)
    prediction= model.predict(Xtest)

    # Cross validation on the training portion only
    scores= cross_val_score(model, Xtrain, Ytrain, cv= folds)
    # Average over however many scores came back, so `folds` does not have to be
    # an int (the original sum(scores/folds) required that).
    val_score= scores.mean()

    separator= '-------------------------------------'
    print('Test prediction:\n{}'.format(prediction))
    print(separator)
    print('Accuracy score: {}'.format(accuracy_score(Ytest, prediction)))
    print(separator)
    print('confusion matrix:\n {}'.format(confusion_matrix(Ytest, prediction)))
    print(separator)
    print('Cross Validation score: {}'.format(val_score))

In [127]:
# Directory holding the project CSVs. Kept in one place so the notebook can be
# pointed at another machine's copy by editing a single line.
DATA_DIR= '/Users/akshatpant/Desktop/UMD/Sem 3/Comp Ling/Project'

train= pd.read_csv('{}/train.csv'.format(DATA_DIR))
test= pd.read_csv('{}/test.csv'.format(DATA_DIR))

In [128]:
# Label the preview, then display the first five rows of the raw training frame.
print('training data:')
train.head(n=5)

training data:


Unnamed: 0,Question ID,Question Text,QANTA Scores,Answer,Sentence Position,IR_Wiki Scores,category
0,2390,it was founded as kart-hadasht meaning new tow...,"portugal:0.0748547872365, peter_the_great:0.05...",carthage,0,"carthage:4.46484593046, nelson_mandela:3.46234...",history
1,2390,it was founded as kart-hadasht meaning new tow...,"battle_of_shiloh:0.033232900633, henry_the_nav...",carthage,1,"carthage:6.0041482036, ancient_corinth:4.56760...",history
2,2390,it was founded as kart-hadasht meaning new tow...,"hannibal:0.0323164945129, carthage:0.027918372...",carthage,2,"carthage:6.40252637044, ancient_corinth:5.3011...",history
3,2390,it was founded as kart-hadasht meaning new tow...,"carthage:0.120553743404, hannibal:0.0341056783...",carthage,3,"carthage:6.74579468525, ancient_corinth:4.3816...",history
4,106503,the nations involved in it had allied immediat...,"battle_of_austerlitz:0.06581875769, treaty_of_...",congress_of_vienna,0,"congress_of_vienna:3.78992924717, crimean_war:...",history


In [129]:
# Label the preview, then display the first five rows of the raw test frame.
print('testing data: ')
test.head(n=5)

testing data: 


Unnamed: 0,Question ID,Question Text,QANTA Scores,Sentence Position,IR_Wiki Scores,category
0,165921,this group 's power peaked in the 31st congres...,"ronald_reagan:0.0484481338785, grover_clevelan...",1,"free_soil_party:2.87910481765, equal_rights_am...",history
1,165935,charles lorencez led 6000 troops in a frontal ...,"battle_of_puebla:0.835245616606, francisco_i._...",1,"battle_of_puebla:5.78287582417, pancho_villa:3...",history
2,102459,in 1866 this mans nation under the command of ...,"vitus_bering:0.0350534508237, verdun:0.0287764...",0,"charles_de_gaulle:4.06032122098, napoleon_iii:...",history
3,4179,they became prominent after intervening in and...,"teutonic_knights:0.0501695382695, otto_von_bis...",4,"teutonic_knights:7.01499316116, golden_horde:4...",history
4,4186,its capital may have been established as early...,"songhai_empire:0.0943808941563, ming_dynasty:0...",2,"songhai_empire:6.52974316335, mali_empire:3.50...",history


### add new features

In [130]:
"""
Add new feature: Question length (number of characters in 'Question Text')

Add new features: Extract max IR score and corresponding page from IR_Wiki Scores
Also calculate difference between highest and second highest IR score

Add target feature paren_match (1 if wiki page and answer match, 0 otherwise)
Apply Weight of Evidence (WOE) encoding to the 'category' and 'Wiki page' columns
"""


def _top_ir_features(ir_scores):
    """Summarise one 'IR_Wiki Scores' cell.

    `ir_scores` is a string of 'page:score' entries separated by ', '.
    Returns (best_page, best_score, best_score - runner_up_score).
    """
    ans_score= {}
    for entry in ir_scores.split(', '):
        # Split on the LAST colon so a page name that itself contains ':' still parses.
        page, _, score= entry.rpartition(':')
        ans_score[page]= float(score)

    # Rank once; the sort is stable, so ties keep their order of appearance.
    ranked= sorted(ans_score, key= ans_score.get, reverse= True)
    best= ranked[0]
    # With a single candidate there is no runner-up; treat its score as 0.0
    # instead of raising IndexError.
    runner_up_score= ans_score[ranked[1]] if len(ranked) > 1 else 0.0
    return best, ans_score[best], ans_score[best]- runner_up_score


def _add_question_features(df):
    """Add 'Quest len', 'Wiki page', 'Page score' and 'Score difference' to `df` in place."""
    df['Quest len']= df['Question Text'].map(len)

    top= [_top_ir_features(cell) for cell in df['IR_Wiki Scores']]
    df['Wiki page']= [page for page, _, _ in top]
    df['Page score']= [score for _, score, _ in top]
    df['Score difference']= [gap for _, _, gap in top]


# Same feature extraction for both splits (previously two copy-pasted loops).
for frame in (train, test):
    _add_question_features(frame)

######################################################################

# Target feature paren_match (training data). It is 1 if answer and wiki page match. 0 otherwise
train['paren_match']= (train['Answer'] == train['Wiki page']).astype(int)

######################################################################

# WOE encoding
# NOTE(review): the encoder is fitted on every training row, including rows that a
# later cell holds out for evaluation, and 'Wiki page' is encoded from the very
# target it helps define. Hold-out scores are therefore likely optimistic —
# consider fitting the encoder on the training split only.
encoding= ce.WOEEncoder(cols= ['category', 'Wiki page'])
encoding.fit(train, train['paren_match'])
train_df= encoding.transform(train)


In [143]:
# Label the preview, then display the first five rows of the encoded training frame.
print('training data: ')
train_df.head(n=5)

training data: 


Unnamed: 0,Question ID,Question Text,QANTA Scores,Answer,Sentence Position,IR_Wiki Scores,category,Quest len,Wiki page,Page score,Score difference,paren_match
0,2390,it was founded as kart-hadasht meaning new tow...,"portugal:0.0748547872365, peter_the_great:0.05...",carthage,0,"carthage:4.46484593046, nelson_mandela:3.46234...",1.045459,-1.397314,0.854227,-0.616195,-0.47483,1
1,2390,it was founded as kart-hadasht meaning new tow...,"battle_of_shiloh:0.033232900633, henry_the_nav...",carthage,1,"carthage:6.0041482036, ancient_corinth:4.56760...",1.045459,-1.057254,0.854227,-0.58521,-0.448684,1
2,2390,it was founded as kart-hadasht meaning new tow...,"hannibal:0.0323164945129, carthage:0.027918372...",carthage,2,"carthage:6.40252637044, ancient_corinth:5.3011...",1.045459,-0.078929,0.854227,-0.577191,-0.468873,1
3,2390,it was founded as kart-hadasht meaning new tow...,"carthage:0.120553743404, hannibal:0.0341056783...",carthage,3,"carthage:6.74579468525, ancient_corinth:4.3816...",1.045459,0.533178,0.854227,-0.570281,-0.392808,1
4,106503,the nations involved in it had allied immediat...,"battle_of_austerlitz:0.06581875769, treaty_of_...",congress_of_vienna,0,"congress_of_vienna:3.78992924717, crimean_war:...",1.045459,-0.984011,0.854227,-0.629781,-0.479407,1


In [103]:
# Label the preview, then display the first five rows of the test frame with its new columns.
print('test dataset:')
test.head(n=5)

test dataset:


Unnamed: 0,Question ID,Question Text,QANTA Scores,Sentence Position,IR_Wiki Scores,category,Quest len,Wiki page,Page score
0,165921,this group 's power peaked in the 31st congres...,"ronald_reagan:0.0484481338785, grover_clevelan...",1,"free_soil_party:2.87910481765, equal_rights_am...",history,215,free_soil_party,2.879105
1,165935,charles lorencez led 6000 troops in a frontal ...,"battle_of_puebla:0.835245616606, francisco_i._...",1,"battle_of_puebla:5.78287582417, pancho_villa:3...",history,197,battle_of_puebla,5.782876
2,102459,in 1866 this mans nation under the command of ...,"vitus_bering:0.0350534508237, verdun:0.0287764...",0,"charles_de_gaulle:4.06032122098, napoleon_iii:...",history,196,charles_de_gaulle,4.060321
3,4179,they became prominent after intervening in and...,"teutonic_knights:0.0501695382695, otto_von_bis...",4,"teutonic_knights:7.01499316116, golden_horde:4...",history,817,teutonic_knights,7.014993
4,4186,its capital may have been established as early...,"songhai_empire:0.0943808941563, ming_dynasty:0...",2,"songhai_empire:6.52974316335, mali_empire:3.50...",history,508,songhai_empire,6.529743


### features

In [133]:
# Model inputs, in the column order used for scaling and training.
features= [
    'Wiki page',
    'Quest len',
    'Page score',
    'category',
    'Score difference',
]

# Label column; kept as a one-element list so that indexing with it yields a DataFrame.
target= ['paren_match']

### Standardize features (zero mean, unit variance)

In [138]:
# Standardize the model inputs and write them back into train_df.
# NOTE(review): the scaler is fitted on all of train_df, i.e. before the
# train/test split in a later cell — confirm that this is intended.
scaler= StandardScaler()
train_df[features]= scaler.fit_transform(train_df[features].values)

train_df.head()

Unnamed: 0,Question ID,Question Text,QANTA Scores,Answer,Sentence Position,IR_Wiki Scores,category,Quest len,Wiki page,Page score,Score difference,paren_match
0,2390,it was founded as kart-hadasht meaning new tow...,"portugal:0.0748547872365, peter_the_great:0.05...",carthage,0,"carthage:4.46484593046, nelson_mandela:3.46234...",1.045459,-1.397314,0.854227,-0.616195,-0.47483,1
1,2390,it was founded as kart-hadasht meaning new tow...,"battle_of_shiloh:0.033232900633, henry_the_nav...",carthage,1,"carthage:6.0041482036, ancient_corinth:4.56760...",1.045459,-1.057254,0.854227,-0.58521,-0.448684,1
2,2390,it was founded as kart-hadasht meaning new tow...,"hannibal:0.0323164945129, carthage:0.027918372...",carthage,2,"carthage:6.40252637044, ancient_corinth:5.3011...",1.045459,-0.078929,0.854227,-0.577191,-0.468873,1
3,2390,it was founded as kart-hadasht meaning new tow...,"carthage:0.120553743404, hannibal:0.0341056783...",carthage,3,"carthage:6.74579468525, ancient_corinth:4.3816...",1.045459,0.533178,0.854227,-0.570281,-0.392808,1
4,106503,the nations involved in it had allied immediat...,"battle_of_austerlitz:0.06581875769, treaty_of_...",congress_of_vienna,0,"congress_of_vienna:3.78992924717, crimean_war:...",1.045459,-0.984011,0.854227,-0.629781,-0.479407,1


### split training data

In [139]:
# Fixed seed so the split, and every score computed from it, is reproducible
# across kernel restarts.
SEED= 42

# NOTE(review): rows that share a 'Question ID' can land on both sides of this
# random split; a group-aware split may give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(
    train_df[features], train_df[target], test_size=0.20, random_state=SEED
)

# Flatten the (n, 1) label frame into the 1-D array that estimators expect.
y_train= y_train.values.ravel()

### SVM

# RBF-kernel support vector classifier. This definition had been commented out,
# which left `svm_clf` undefined on a fresh kernel.
svm_clf= SVC(kernel= 'rbf')

auto_ML(svm_clf, X_train, y_train, X_test, y_test, folds= 10)

In [57]:
# Prepare question text for input to the LSTM:
# find the largest 'Quest len' across both splits so that shorter
# questions can later be padded up to it.
longest_train= train['Quest len'].max()
longest_test= test['Quest len'].max()
long_sent= max(longest_train, longest_test)

long_sent


#print(max(train['Quest len'].values), max(test['Quest len'].values))

1088

In [45]:
"""
prepare vocabulary from text
prepare word_to_index dictionary
set <pad> and <unk> value to 0
normalize text before adding to vocab
remove stopwords
"""

regex= re.compile(r"\b(\w*['\w]*[\w]*)[^\w]*")

# <PAD> and <UNK> deliberately share index 0; real words are numbered from 1.
vocab= {'<PAD>': 0, '<UNK>': 0}
word_to_ix= {}

# Build the stopword set once; calling stopwords.words('english') for every
# word reloads the list each time.
stop_words= set(stopwords.words('english'))

# Iterate over the TRAINING questions. The loop previously walked test's index
# while reading from train, so it covered only the first len(test) rows of train.
normalized= []
for sent in train['Question Text']:
    # The pattern can produce empty matches; drop them so they do not add
    # stray spaces to the normalized text.
    tokens= [tok for tok in regex.findall(sent.lower()) if tok]
    for word in tokens:
        if len(word)> 1 and word not in stop_words and word not in vocab:
            # Two special tokens share index 0, hence the -1.
            vocab[word]= len(vocab)- 1
    normalized.append(' '.join(tokens))

# Replace the question text with its lower-cased, tokenized form.
train['Question Text']= normalized

vocab


{'<PAD>': 0,
 '<UNK>': 0,
 'founded': 1,
 'kart': 2,
 'hadasht': 3,
 'meaning': 4,
 'new': 5,
 'town': 6,
 'suburb': 7,
 'world': 8,
 'capital': 9,
 'byrsu': 10,
 'ancient': 11,
 'citadel': 12,
 'hill': 13,
 'overlooking': 14,
 'sea': 15,
 'plundered': 16,
 'burned': 17,
 'conquerors': 18,
 'forbid': 19,
 'human': 20,
 'habitation': 21,
 'site': 22,
 'later': 23,
 'captured': 24,
 'vandals': 25,
 'byzantine_empire': 26,
 '705': 27,
 'arabs': 28,
 'name': 29,
 'city': 30,
 'may': 31,
 'dido': 32,
 'whose': 33,
 'wars': 34,
 'rome': 35,
 'brought': 36,
 'destruction': 37,
 '146': 38,
 'bc': 39,
 'nations': 40,
 'involved': 41,
 'allied': 42,
 'immediately': 43,
 'treaty': 44,
 'chaumont': 45,
 'one': 46,
 'figures': 47,
 'continued': 48,
 'reforms': 49,
 'karl': 50,
 'vom': 51,
 'stein': 52,
 'due': 53,
 'disagreement': 54,
 'italys': 55,
 'settlement': 56,
 'spain': 57,
 'refused': 58,
 'sign': 59,
 'final': 60,
 'act': 61,
 'switzerland': 62,
 'granted': 63,
 'constitution': 64,
 'swed

In [46]:
# Show a bounded preview rather than the whole frame.
train.head(10)

Unnamed: 0,Question ID,Question Text,QANTA Scores,Answer,Sentence Position,IR_Wiki Scores,category,Quest len,Wiki page,Page score
0,2390,it was founded as kart hadasht meaning new tow...,"portugal:0.0748547872365, peter_the_great:0.05...",carthage,0,"carthage:4.46484593046, nelson_mandela:3.46234...",history,90,carthage,4.464846
1,2390,it was founded as kart hadasht meaning new tow...,"battle_of_shiloh:0.033232900633, henry_the_nav...",carthage,1,"carthage:6.0041482036, ancient_corinth:4.56760...",history,155,carthage,6.004148
2,2390,it was founded as kart hadasht meaning new tow...,"hannibal:0.0323164945129, carthage:0.027918372...",carthage,2,"carthage:6.40252637044, ancient_corinth:5.3011...",history,342,carthage,6.402526
3,2390,it was founded as kart hadasht meaning new tow...,"carthage:0.120553743404, hannibal:0.0341056783...",carthage,3,"carthage:6.74579468525, ancient_corinth:4.3816...",history,459,carthage,6.745795
4,106503,the nations involved in it had allied immediat...,"battle_of_austerlitz:0.06581875769, treaty_of_...",congress_of_vienna,0,"congress_of_vienna:3.78992924717, crimean_war:...",history,169,congress_of_vienna,3.789929
5,106503,the nations involved in it had allied immediat...,"treaty_of_brest-litovsk:0.0290714511706, congr...",congress_of_vienna,1,"congress_of_vienna:3.91160914684, napoleon_iii...",history,302,congress_of_vienna,3.911609
6,106503,the nations involved in it had allied immediat...,"congress_of_vienna:0.18004693222, teutonic_kni...",congress_of_vienna,2,"congress_of_vienna:7.0196528341, thirty_years'...",history,437,congress_of_vienna,7.019653
7,106503,the nations involved in it had allied immediat...,"congress_of_vienna:0.224345119428, teutonic_kn...",congress_of_vienna,3,"congress_of_vienna:6.65354658301, napoleon_iii...",history,492,congress_of_vienna,6.653547
8,165897,at age 22 he became the youngest captain in th...,"benito_mussolini:0.343499429074, hannibal:0.09...",francisco_franco,0,"seminole_wars:3.40736769424, francisco_franco:...",history,61,seminole_wars,3.407368
9,165897,at age 22 he became the youngest captain in th...,"emilio_aguinaldo:0.0391431785257, napoleon_iii...",francisco_franco,1,"francisco_franco:5.81715654207, spanish_civil_...",history,229,francisco_franco,5.817157


In [6]:
class LSTM_special(nn.Module):
    """LSTM over the text whose final output is combined with extra features.

    The LSTM output at the last timestep is concatenated with `add_features`,
    mapped to `out_dim` by a linear layer, and passed through a softmax.

    Parameters
    ----------
    input_dim : size of each timestep's input vector
    hidden_dim : size of the LSTM hidden state
    batch_size : fixed batch size used to shape inputs and the hidden state
    add_dim : number of additional (non-text) features per example
    out_dim : number of outputs
    num_layers : number of stacked LSTM layers (default 1)

    Notes
    -----
    `self.hidden` carries (h, c) from one `forward` call to the next. Reset it
    with `model.hidden = model.init_hidden()` before each new batch so that
    gradients do not flow back into earlier batches.
    """

    def __init__(self, input_dim, hidden_dim, batch_size, add_dim, out_dim, num_layers= 1):
        super(LSTM_special, self).__init__()
        self.input_dim= input_dim
        self.hidden_dim= hidden_dim
        self.batch_size= batch_size
        self.add_dim= add_dim
        self.out_dim= out_dim
        self.num_layers= num_layers

        # Define the LSTM layer
        self.lstm= nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # Define the linear layer that maps from (hidden_dim + add_dim) to out_dim
        self.linear= nn.Linear(self.hidden_dim+self.add_dim, self.out_dim)

        # Define the non-linearity that converts to probability.
        # dim=1 normalizes across the outputs of each example; omitting `dim`
        # is deprecated.
        self.softmax= nn.Softmax(dim= 1)

        # Start with a zero state so forward() works without the caller having
        # to assign `hidden` first.
        self.hidden= self.init_hidden()

    def init_hidden(self):
        """
        Return a fresh zero hidden state (h0, c0).

        The axes semantics are (num_layers, minibatch_size, hidden_dim)
        """
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, sequence, add_features):
        """
        Forward pass: LSTM over `sequence`, then linear + softmax over the last
        timestep's output concatenated with `add_features`.

        sequence: reshaped to [seq_len, batch_size, input_dim]; seq_len is
            inferred from the number of elements
        add_features: reshaped to [batch_size, add_dim]
        lstm_out shape: [seq_len, batch_size, hidden_dim]
        self.hidden shape= (a, b) where a & b both have shape: [num_layers, batch_size, hidden_dim]

        Returns a tensor of shape [batch_size, out_dim] whose rows sum to 1.
        """
        # seq_len is inferred here; the original referenced an undefined `seq_len`.
        lstm_in= sequence.view(-1, self.batch_size, self.input_dim)
        lstm_out, self.hidden= self.lstm(lstm_in, self.hidden)

        # get the output from the last timestep
        lstm_out= lstm_out[-1].view(self.batch_size, -1)

        # concatenate additional features to lstm output
        new_features= torch.cat((lstm_out, add_features.view(self.batch_size, -1)), 1)

        # map to output space
        y_pred= self.linear(new_features)

        # apply non linearity
        output= self.softmax(y_pred)

        return output
        
        
        
        
