In [14]:
import pandas as pd
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,wordpunct_tokenize,sent_tokenize 
from nltk.corpus import stopwords 
import string
import re

In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, NuSVC,SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, f1_score,precision_score,recall_score, classification_report
from sklearn.metrics import precision_recall_fscore_support

In [16]:
#read the data and define the columns
df1 = pd.read_csv('./data-1_train.csv')
df1.columns = ['example_id', 'text', 'aspect_term', 'term_location', 'class']



In [17]:
#testing n gram

In [18]:
data = df1.copy()
data2 = data.copy()
vec = TfidfVectorizer(min_df = 0.00125, max_df = 0.7, sublinear_tf=True, use_idf=True, stop_words=u'english', analyzer= 'word', ngram_range=(1,5),lowercase=True)
X = vec.fit_transform(data['text'])
svm = LinearSVC(C=1.2)
pred_weights = cross_val_predict(svm,X,data['class'],cv = 10)
np.mean(pred_weights == data['class'])

0.7176577394462097

In [19]:
#intitalize count vectorizer
countVectorizer = CountVectorizer()
#get the list of vocab
count_matrix = countVectorizer.fit_transform(df1['text'])
vocab = list(countVectorizer.vocabulary_.keys())
len(vocab)

3246

In [20]:
df1.head()

Unnamed: 0,example_id,text,aspect_term,term_location,class
0,2333_1,Obviously one of the most important features o...,human interface,69--84,0
1,1805_1,Good for every day computing and web browsing.,every day computing,9--28,1
2,2782_2,while the keyboard itself is alright[comma] th...,mouse command buttons,115--136,-1
3,1385_0,Again[comma] the same problem[comma] the right...,right speaker,29--42,-1
4,1423_0,My problem was with DELL Customer Service.,DELL Customer Service,20--41,-1


In [21]:
def processData(data):
    #replace [comma] with a ','
    data = data.replace("[comma]",",")
    
    #tokenize with punctuations
    data = " ".join(wordpunct_tokenize(data))
    
    #remove punctuations
    nopunc = [char for char in data if char not in string.punctuation]
    
    #join the words and remove stopswords
    data = "".join(nopunc)
    data = [text for text in data.strip().split() if text not in set(stopwords.words('english'))]  
    #message = [snowball.stem(w) for w in message]
    
    #convert the text into word tokens
    data = " ".join(data)
    words = word_tokenize(data)
    
    #lemmatize using wordNetLemmatizer
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    data = [lemmatizer.lemmatize(word) for word in words]
    
    #return processed data
    return data



In [22]:
#process text and aspect term columns of input data
df1['clean_text'] = df1['text'].apply(processData)
df1['clean_aspect_term'] = df1['aspect_term'].apply(processData)

In [23]:
df1.head()

Unnamed: 0,example_id,text,aspect_term,term_location,class,clean_text,clean_aspect_term
0,2333_1,Obviously one of the most important features o...,human interface,69--84,0,"[Obviously, one, important, feature, computer,...","[human, interface]"
1,1805_1,Good for every day computing and web browsing.,every day computing,9--28,1,"[Good, every, day, computing, web, browsing]","[every, day, computing]"
2,2782_2,while the keyboard itself is alright[comma] th...,mouse command buttons,115--136,-1,"[keyboard, alright, plate, around, cheap, plas...","[mouse, command, button]"
3,1385_0,Again[comma] the same problem[comma] the right...,right speaker,29--42,-1,"[Again, problem, right, speaker, work]","[right, speaker]"
4,1423_0,My problem was with DELL Customer Service.,DELL Customer Service,20--41,-1,"[My, problem, DELL, Customer, Service]","[DELL, Customer, Service]"


In [24]:
#method to assign weights
def assign_weights(data):
    
    #get the te
    text = data[0]
    aspect =data[1]
    
    #corner case to check aspect terms are lesser or equal to the text
    if set(aspect) <= set(text):
        
        #to store left and right weights
        leftWeights = rightWeights = []
        
        #find the starting point of the first aspect term word
        aspect_start = [i for i, word in enumerate(text) if word == aspect[0]]
        
        #find the start and end indices of the aspect
        for pos in (aspect_start):
            if  text[(pos + len(aspect) - 1)] == aspect[-1]:
                start_index = pos
                end_index = pos + len(aspect) - 1
                break
                
        #assign weights to each word moving left and right of the aspect term         
        if (end_index - start_index) == len(aspect) - 1:
            #get the left side and the right side of the text from the aspect
            left_text = text[:start_index]
            right_text = text[end_index+1:]
            
            #generate left and right weights based on a well known strategy
            leftWeights = [1/i for i in range(len(left_text),0,-1) if len(left_text) != 0]
            rightWeights = [1/i for i in range(1,len(right_text)+1) if len(right_text) != 0]
         
        #find the total weights by adding a constant
        tot_weights = leftWeights + [2]*len(aspect) + rightWeights
        
        #return the dict of text and its weights
        return dict(zip(text,tot_weights))
    else: 
        return np.nan


In [25]:
#assign weights to the data
df1['element_weights'] = df1[['clean_text','clean_aspect_term']].apply(assign_weights, axis = 1)

#drop all nan values
df1 = df1.dropna()

#create an empty df with zeros
df2 = pd.DataFrame(np.zeros((len(df1),len(vocab))),columns=vocab)

df1.head()

Unnamed: 0,example_id,text,aspect_term,term_location,class,clean_text,clean_aspect_term,element_weights
0,2333_1,Obviously one of the most important features o...,human interface,69--84,0,"[Obviously, one, important, feature, computer,...","[human, interface]","{'Obviously': 0.2, 'one': 0.25, 'important': 0..."
1,1805_1,Good for every day computing and web browsing.,every day computing,9--28,1,"[Good, every, day, computing, web, browsing]","[every, day, computing]","{'Good': 1.0, 'every': 2, 'day': 2, 'computing..."
2,2782_2,while the keyboard itself is alright[comma] th...,mouse command buttons,115--136,-1,"[keyboard, alright, plate, around, cheap, plas...","[mouse, command, button]","{'keyboard': 0.1, 'alright': 0.111111111111111..."
3,1385_0,Again[comma] the same problem[comma] the right...,right speaker,29--42,-1,"[Again, problem, right, speaker, work]","[right, speaker]","{'Again': 0.5, 'problem': 1.0, 'right': 2, 'sp..."
4,1423_0,My problem was with DELL Customer Service.,DELL Customer Service,20--41,-1,"[My, problem, DELL, Customer, Service]","[DELL, Customer, Service]","{'My': 0.5, 'problem': 1.0, 'DELL': 2, 'Custom..."


In [26]:
# assign the calculated weights to the corresponding text elements of df2
for row in range(len(df1)):
    for key,value in df1.iloc[row]['element_weights'].items():
        df2.iloc[row][key] = value
        
#feature extraction - transform a count matrix to a normalized tf-idf representation
tfidf= TfidfTransformer().fit_transform(df2)


In [27]:
## SVM 

In [28]:
svm = LinearSVC(C=1.2)
pred_SVM = cross_val_predict(svm,tfidf,df1['class'],cv = 10)

np.mean(pred_SVM == df1['class'])

0.7033166742389823

In [29]:
svm.fit(tfidf, df1['class'])

LinearSVC(C=1.2, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [30]:
print(precision_recall_fscore_support(df1['class'], pred_SVM, labels=[-1,0,1]))

print("\n Classification Report \n ", classification_report(pred_SVM,df1['class']))

(array([0.68120805, 0.55555556, 0.78251599]), array([0.73639661, 0.47018349, 0.78251599]), array([0.70772807, 0.50931677, 0.78251599]), array([827, 436, 938], dtype=int64))

 Classification Report 
               precision    recall  f1-score   support

         -1       0.74      0.68      0.71       894
          0       0.47      0.56      0.51       369
          1       0.78      0.78      0.78       938

avg / total       0.71      0.70      0.71      2201



In [31]:
# Decision tree classifier

In [32]:
vec = TfidfVectorizer(min_df = 0.00125, max_df = 0.7, sublinear_tf=True, use_idf=True, stop_words=u'english', analyzer= processData, ngram_range=(1,5),lowercase=True)

In [33]:
X = vec.fit_transform(df1['text'])

In [34]:
Y = df1['class']

In [35]:
DT = DecisionTreeClassifier(max_depth=10).fit(X,Y)

In [36]:
pred = cross_val_predict(DT,X,Y,cv=10)

In [37]:
np.mean(Y==pred)

0.5402089959109496

In [38]:
DT = DecisionTreeClassifier(max_depth=10).fit(X,Y)
pred = cross_val_predict(DT,X,Y,cv=10)
print(precision_recall_fscore_support(Y, pred, labels=[-1,0,1]))

print("\n Classification Report \n ", classification_report(pred,Y))

(array([0.45732052, 0.65957447, 0.90649351]), array([0.97823458, 0.07110092, 0.37206823]), array([0.62326656, 0.12836439, 0.52758881]), array([827, 436, 938], dtype=int64))

 Classification Report 
               precision    recall  f1-score   support

         -1       0.98      0.46      0.62      1769
          0       0.07      0.66      0.13        47
          1       0.37      0.91      0.53       385

avg / total       0.85      0.54      0.60      2201



In [39]:
# Multinomial NB

In [40]:
clf_NB = MultinomialNB().fit(X,Y)

In [41]:
pred_NB= cross_val_predict(clf_NB,X,Y,cv=10)

In [42]:
np.mean(Y==pred_NB)

0.6937755565651976

In [43]:
print(precision_recall_fscore_support(Y, pred_NB, labels=[-1,0,1]))

print("\n Classification Report \n ", classification_report(pred_NB,Y))

(array([0.68047983, 0.68571429, 0.70541958]), array([0.75453446, 0.22018349, 0.86034115]), array([0.71559633, 0.33333333, 0.77521614]), array([827, 436, 938], dtype=int64))

 Classification Report 
               precision    recall  f1-score   support

         -1       0.75      0.68      0.72       917
          0       0.22      0.69      0.33       140
          1       0.86      0.71      0.78      1144

avg / total       0.78      0.69      0.72      2201



In [44]:
# Random forest

In [45]:
clf_RFC = RandomForestClassifier(n_estimators=100,max_depth=200)
pred_RF = cross_val_predict(clf_RFC,df2,df1['class'],cv = 10)
np.mean(pred_RF == df1['class'])

0.716492503407542

In [46]:
print(precision_recall_fscore_support(df1['class'], pred_RF, labels=[-1,0,1]))

print("\n Classification Report \n ", classification_report(pred_RF,df1['class']))

(array([0.66633858, 0.6490566 , 0.79130435]), array([0.81862152, 0.39449541, 0.7761194 ]), array([0.73467173, 0.49072753, 0.78363832]), array([827, 436, 938], dtype=int64))

 Classification Report 
               precision    recall  f1-score   support

         -1       0.82      0.67      0.73      1016
          0       0.39      0.65      0.49       265
          1       0.78      0.79      0.78       920

avg / total       0.75      0.72      0.73      2201



In [47]:
# Testing

In [48]:
#define the columns required
testcols=['example_id','text','aspect_term','term_location']
df_test = pd.read_csv('./Data-2_test.csv',skipinitialspace=True, usecols=testcols)

df_test.columns = ['example_id','text','aspect_term','term_location']
countVectorizer = CountVectorizer()
count_matrix = countVectorizer.fit_transform(df_test['text'])
df_test['text'] = df_test['text'].apply(processData)
df_test['processed_AT'] = df_test['aspect_term'].apply(processData)
df_test['element_weights'] = df_test[['text','processed_AT']].apply(assign_weights, axis = 1)
df_test = df_test.dropna()
df2_test = pd.DataFrame(np.zeros((len(df1),len(vocab))),columns=vocab)

#assign weights and add it to the dict
for row in range(len(df_test)):    
    for key,value in df_test.iloc[row]['element_weights'].items():
        df2_test.iloc[row][key] = value
tfidf= TfidfTransformer().fit_transform(df2_test)

#do a single pass over the test data using existing classifier 
pred_SVM = svm.predict(df2_test)
res = list()
file = open('KruneetKumar_Patel_Vishwas_SreevalliRamamohan_Data-2.txt','w')

#zip example ids with their predictions and write it to a file
for x,y in zip(list(df_test['example_id']),pred_SVM):
    res.append(str(x) +";;"+str(y))
for i in res:
    file.write("%s\n"%i)
file.close()

In [49]:
# KruneetKumar_Patel_Vishwas_SreevalliRamamohan_Data-2.txt
# KruneetKumar_Patel_Vishwas_SreevalliRamamohan_Data-1.txt

In [50]:
i = 0 
with open('KruneetKumar_Patel_Vishwas_SreevalliRamamohan_Data-2.txt','r') as f:
    x = f.readlines()
    for y in x:
        i += 1 
print(i)


1120


In [51]:
len(df_test)

1120

In [52]:
# Experimenting

In [53]:
data2 = df1.copy()
df1['clean_text']= " ".join(df1['clean_text'])
vec = TfidfVectorizer(min_df = 0.00125, max_df = 0.7, sublinear_tf=True, use_idf=True, stop_words=u'english', analyzer= 'word', ngram_range=(1,5),lowercase=True)
X = vec.fit_transform(data['text'])
svm = LinearSVC(C=1.2)
pred_weights = cross_val_predict(svm,X,data['class'],cv = 10)
np.mean(pred_weights == data['class'])

TypeError: sequence item 0: expected str instance, list found

In [None]:
df1['sentText'] = [" ".join(x) for x in df1['clean_text']]

In [None]:
df1['sentAspect'] = [" ".join(c) for c in df1['clean_aspect_term']]

In [None]:
vec = TfidfVectorizer(min_df = 0.00125, max_df = 0.5, sublinear_tf=True, use_idf=True, stop_words=u'english', analyzer= 'word', ngram_range=(1,10),lowercase=True)

In [None]:
X = vec.fit_transform(df1['sentText'])

In [None]:
len(df1['class'])

In [None]:
svm = LinearSVC(C=1.2)
pred_weights = cross_val_predict(svm, X ,df1['class'],cv = 10)
len(pred_weights)

In [None]:
np.mean(pred_weights == df1['class'])