In [21]:
import pandas as pd

In [22]:
#loading the dataset
df = pd.read_csv("Train.csv")

In [23]:
missing_values = df.isnull().sum()
print(missing_values)

id             0
Sentence       0
Aspect Term    0
polarity       0
from           0
to             0
dtype: int64


In [24]:
print(df.head())

     id                                           Sentence     Aspect Term  \
0  2339  I charge it at night and skip taking the cord ...            cord   
1  2339  I charge it at night and skip taking the cord ...    battery life   
2  1316  The tech guy then said the service center does...  service center   
3  1316  The tech guy then said the service center does...    "sales" team   
4  1316  The tech guy then said the service center does...        tech guy   

   polarity  from   to  
0   neutral    41   45  
1  positive    74   86  
2  negative    27   41  
3  negative   109  121  
4   neutral     4   12  


In [25]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358 entries, 0 to 2357
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           2358 non-null   int64 
 1   Sentence     2358 non-null   object
 2   Aspect Term  2358 non-null   object
 3   polarity     2358 non-null   object
 4   from         2358 non-null   int64 
 5   to           2358 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 110.7+ KB
None


In [26]:
print(df.describe())

                id         from           to
count  2358.000000  2358.000000  2358.000000
mean   1627.559372    49.363020    58.785411
std     910.748883    44.026428    44.328897
min       3.000000     0.000000     3.000000
25%     834.250000    17.000000    26.000000
50%    1684.000000    38.000000    48.000000
75%    2446.000000    71.000000    80.000000
max    3085.000000   349.000000   356.000000


In [27]:
df = df.drop(columns=['id'])
print(df.head())

                                            Sentence     Aspect Term  \
0  I charge it at night and skip taking the cord ...            cord   
1  I charge it at night and skip taking the cord ...    battery life   
2  The tech guy then said the service center does...  service center   
3  The tech guy then said the service center does...    "sales" team   
4  The tech guy then said the service center does...        tech guy   

   polarity  from   to  
0   neutral    41   45  
1  positive    74   86  
2  negative    27   41  
3  negative   109  121  
4   neutral     4   12  


In [28]:
#printing unique value counts from polarity
df['polarity'].value_counts()


Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,987
negative,866
neutral,460
conflict,45


In [29]:
#dropping observations with polarity == conflict as not necessary

df = df[df['polarity'] != 'conflict']


In [30]:
#count the value counts for every class

df['polarity'].value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,987
negative,866
neutral,460


# Preproccesing of text

In [31]:
#Importing necessary libraries
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from nltk.tokenize import word_tokenize

# Lowercasing

df['Sentence'] = df['Sentence'].str.lower()
df['Aspect Term'] = df['Aspect Term'].str.lower()


#Removing stopwords
stop_words = set(stopwords.words('english'))
df['Sentence'] = df['Sentence'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
df['Aspect Term'] = df['Aspect Term'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

#Lemmatization
lemmatizer = WordNetLemmatizer()
df['Sentence'] = df['Sentence'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))
df['Aspect Term'] = df['Aspect Term'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

#Removing punctuation and special characters
df['Sentence'] = df['Sentence'].str.replace(r'[^\w\s]', '', regex=True)
df['Aspect Term'] = df['Aspect Term'].str.replace(r'[^\w\s]', '', regex=True)

#Removing extra white spaces
df['Sentence'] = df['Sentence'].str.replace(r'\s+', ' ', regex=True)
df['Aspect Term'] = df['Aspect Term'].str.replace(r'\s+', ' ', regex=True)

#trailing white spaces
df['Sentence'] = df['Sentence'].str.strip()
df['Aspect Term'] = df['Aspect Term'].str.strip()

# #tokenization
# df['Sentence_Tokens'] = df['Sentence'].apply(word_tokenize)
# df['Aspect_Tokens'] = df['Aspect Term'].apply(word_tokenize)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
print(df['Sentence'].head())

0      charge night skip taking cord good battery life
1      charge night skip taking cord good battery life
2    tech guy said service center 1to1 exchange dir...
3    tech guy said service center 1to1 exchange dir...
4    tech guy said service center 1to1 exchange dir...
Name: Sentence, dtype: object


In [33]:
df.head(10)

Unnamed: 0,Sentence,Aspect Term,polarity,from,to
0,charge night skip taking cord good battery life,cord,neutral,41,45
1,charge night skip taking cord good battery life,battery life,positive,74,86
2,tech guy said service center 1to1 exchange dir...,service center,negative,27,41
3,tech guy said service center 1to1 exchange dir...,sale team,negative,109,121
4,tech guy said service center 1to1 exchange dir...,tech guy,neutral,4,12
5,high quality killer gui extremely stable highl...,quality,positive,14,21
6,high quality killer gui extremely stable highl...,gui,positive,36,39
7,high quality killer gui extremely stable highl...,application,positive,118,130
8,high quality killer gui extremely stable highl...,use,positive,143,146
9,easy start overheat much laptop,start,positive,8,16



# Bag of Words


In [34]:
#importing necessary libraries for BOW
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack

# bag of words for sentence
vectorSentence = CountVectorizer()
sentenceBow = vectorSentence.fit_transform(df['Sentence'])

# bog of words for aspect
vectorAspect = CountVectorizer()
aspectBow = vectorAspect.fit_transform(df['Aspect Term'])

#combining both generated BOW of sentence and aspect
X_combined = hstack([sentenceBow, aspectBow])


#picking the top 100 features using chi square
y = df['polarity'].astype('category').cat.codes
chi2_selector = SelectKBest(chi2, k=100)
#selecting top 100 using chi sqare which will be used as input for training the model on diff classifiers
XTop100 = chi2_selector.fit_transform(X_combined, y)



In [35]:
#importing the necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# loading random forest classifier and training it on the input generated from top k sleected features of BOW
rfClassifier = RandomForestClassifier()
rfScores = cross_val_score(rfClassifier, XTop100, y, cv=10)


#similar to random forest trainig it on SVM classifier
svmClassifier = SVC(kernel='linear')
svmScores = cross_val_score(svmClassifier, XTop100, y, cv=10)

# using the Decision tree classifier
dtClassifier = DecisionTreeClassifier()
dtScores = cross_val_score(dtClassifier, XTop100, y, cv=10)


#printing the accuracies for every classifier
print(f"Random Forest 10-fold CV Accuracy: {rfScores}")
print(f"Mean Random Forest Accuracy: {rfScores.mean()}")

print(f"SVM 10-fold CV Accuracy: {svmScores}")
print(f"Mean SVM Accuracy: {svmScores.mean()}")

print(f"Decision Tree 10-fold CV Accuracy: {dtScores}")
print(f"Mean Decision Tree Accuracy: {dtScores.mean()}")


Random Forest 10-fold CV Accuracy: [0.60344828 0.63362069 0.65086207 0.63636364 0.57575758 0.63636364
 0.57142857 0.61471861 0.63636364 0.59307359]
Mean Random Forest Accuracy: 0.6152000298552023
SVM 10-fold CV Accuracy: [0.5862069  0.62931034 0.68965517 0.61038961 0.5974026  0.62337662
 0.58874459 0.61471861 0.62337662 0.61904762]
Mean SVM Accuracy: 0.618222869084938
Decision Tree 10-fold CV Accuracy: [0.60775862 0.56896552 0.63362069 0.60606061 0.55411255 0.63203463
 0.57575758 0.5974026  0.61038961 0.57142857]
Mean Decision Tree Accuracy: 0.5957530974772354


# GloVe


In [36]:
import pandas as pd
import numpy as np
from gensim.downloader import load
from nltk.tokenize import word_tokenize

#loading GloVe model

model = load('glove-wiki-gigaword-300')

In [37]:

#importing necessary libraries
import pandas as pd
import numpy as np
from gensim.downloader import load
from nltk.tokenize import word_tokenize
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



#tokenizing sentence and aspect features before creating its embeddings
df['Sentence_Tokens'] = df['Sentence'].apply(word_tokenize)
df['Aspect_Tokens'] = df['Aspect Term'].apply(word_tokenize)


#function to generate sentence embeddings
def getSentenceEmbedding(tokens):
  # if token not present in GloVe
    if len(tokens) == 0:
        return np.zeros(300)
    else:
        valid_embeddings = [model[w] for w in tokens if w in model]
        return np.mean(valid_embeddings, axis=0) if valid_embeddings else np.zeros(300)


df['Sentence_Embedding'] = df['Sentence_Tokens'].apply(getSentenceEmbedding)


#function to generate aspect embeddings
def getAspectEmbedding(tokens):
   # if token not present in GloVe
    if len(tokens) == 0:
        return np.zeros(300)
    else:
        valid_embeddings = [model[w] for w in tokens if w in model]
        return np.mean(valid_embeddings, axis=0) if valid_embeddings else np.zeros(300)


df['Aspect_Embedding'] = df['Aspect_Tokens'].apply(getAspectEmbedding)


XSentence = np.vstack(df['Sentence_Embedding'].values)
XAspect = np.vstack(df['Aspect_Embedding'].values)
#combining generated embeddings using GloVe of both aspect and sentence
XCombined = np.hstack((XSentence, XAspect))


y = df['polarity'].astype('category').cat.codes


# loading random forest classifier and training it on the input generated from top k sleected features of BOW
rfClassifier = RandomForestClassifier()
rfScores = cross_val_score(rfClassifier, XCombined, y, cv=10)


#similar to random forest trainig it on SVM classifier
svmClassifier = SVC(kernel='linear')
svmScores = cross_val_score(svmClassifier, XCombined, y, cv=10)

# using the Decision tree classifier
dtClassifier = DecisionTreeClassifier()
dtScores = cross_val_score(dtClassifier, XCombined, y, cv=10)


#printing the accuracies for every classifier
print(f"Random Forest 10-fold CV Accuracy: {rfScores}")
print(f"Mean Random Forest Accuracy: {rfScores.mean()}")

print(f"SVM 10-fold CV Accuracy: {svmScores}")
print(f"Mean SVM Accuracy: {svmScores.mean()}")

print(f"Decision Tree 10-fold CV Accuracy: {dtScores}")
print(f"Mean Decision Tree Accuracy: {dtScores.mean()}")


Random Forest 10-fold CV Accuracy: [0.57758621 0.56896552 0.69396552 0.61471861 0.60606061 0.62337662
 0.66233766 0.66666667 0.62770563 0.61904762]
Mean Random Forest Accuracy: 0.6260430661292731
SVM 10-fold CV Accuracy: [0.63793103 0.61637931 0.61637931 0.62337662 0.5974026  0.61904762
 0.61471861 0.63203463 0.62770563 0.60606061]
Mean SVM Accuracy: 0.6191035975518735
Decision Tree 10-fold CV Accuracy: [0.53448276 0.48275862 0.52586207 0.47186147 0.47619048 0.53246753
 0.51948052 0.47619048 0.53246753 0.46753247]
Mean Decision Tree Accuracy: 0.5019293924466337


# USE

In [38]:
#importing necessary libraries
! pip install tensorflow tensorflow_hub



In [39]:

#importing necessary libraries
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#loading the Universal Sentence Encoder
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


def getUseEmbedding(text):
    return model([text]).numpy().flatten()

#generating sentence and aspect embeddings using USE model
df['SentenceEmbedding'] = df['Sentence'].apply(getUseEmbedding)
df['AspectEmbedding'] = df['Aspect Term'].apply(getUseEmbedding)


XSentence = np.vstack(df['SentenceEmbedding'].values)
XAspect = np.vstack(df['AspectEmbedding'].values)
#similar to previous methods combining sentence and aspect embeddings
XCombined = np.hstack((XSentence, XAspect))


y = df['polarity'].astype('category').cat.codes


rfClassifier = RandomForestClassifier()
rfScores = cross_val_score(rfClassifier, XCombined, y, cv=10)


#similar to random forest trainig it on SVM classifier
svmClassifier = SVC(kernel='linear')
svmScores = cross_val_score(svmClassifier, XCombined, y, cv=10)

# using the Decision tree classifier
dtClassifier = DecisionTreeClassifier()
dtScores = cross_val_score(dtClassifier, XCombined, y, cv=10)


#printing the accuracies for every classifier
print(f"Random Forest 10-fold CV Accuracy: {rfScores}")
print(f"Mean Random Forest Accuracy: {rfScores.mean()}")

print(f"SVM 10-fold CV Accuracy: {svmScores}")
print(f"Mean SVM Accuracy: {svmScores.mean()}")

print(f"Decision Tree 10-fold CV Accuracy: {dtScores}")
print(f"Mean Decision Tree Accuracy: {dtScores.mean()}")

Random Forest 10-fold CV Accuracy: [0.64224138 0.56896552 0.69827586 0.67532468 0.65800866 0.66233766
 0.67532468 0.66233766 0.64502165 0.6017316 ]
Mean Random Forest Accuracy: 0.6489569338707268
SVM 10-fold CV Accuracy: [0.63793103 0.65086207 0.7112069  0.64502165 0.6969697  0.64935065
 0.68398268 0.61904762 0.70995671 0.65367965]
Mean SVM Accuracy: 0.6658008658008658
Decision Tree 10-fold CV Accuracy: [0.52155172 0.56465517 0.54310345 0.50649351 0.52380952 0.6017316
 0.54978355 0.4978355  0.54978355 0.52380952]
Mean Decision Tree Accuracy: 0.538255709807434
