## Import

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import re
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# read the dataset

In [31]:
df = pd.read_csv('Laptop_Train_v2.csv')
df = df[df['polarity'] != 'conflict']
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2352,2272,We also use Paralles so we can run virtual mac...,Windows 7 Home Premium,neutral,80,102
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136


# remove unnecessary stop word

In [32]:
from nltk.corpus import stopwords
all_stop_words = set(stopwords.words("english"))

important_stop_words = [word for word in all_stop_words if ("n't" in word or "no" in word)]
stop_words = [word for word in all_stop_words if word not in important_stop_words]

def remove_stopwords(text):
    text = [word for word in text if word not in stop_words]
    return text

# lemmatization


In [33]:
lemmatizer = WordNetLemmatizer()

def lemmatization(text):

    text = [lemmatizer.lemmatize(word) for word in text]

    return text

# preprocessing pipeline

In [6]:
def preprocessing(text):

    text = text.lower()
    # text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[^a-z_]+$', '', text)
    text = word_tokenize(text)
    text = remove_stopwords(text)
    text = lemmatization(text)
    text = " ".join(text)

    return text

In [34]:
from tqdm import tqdm

tqdm.pandas()

df['clean_Sentence'] = df['Sentence'].progress_map(preprocessing)

df

100%|██████████| 2313/2313 [00:01<00:00, 2022.25it/s]


Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to,clean_Sentence
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45,charge night skip taking cord good battery life
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86,charge night skip taking cord good battery life
2,1316,The tech guy then said the service center does...,service center,negative,27,41,tech guy said service center not 1-to-1 exchan...
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121,tech guy said service center not 1-to-1 exchan...
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12,tech guy said service center not 1-to-1 exchan...
...,...,...,...,...,...,...,...
2352,2272,We also use Paralles so we can run virtual mac...,Windows 7 Home Premium,neutral,80,102,also use paralles run virtual machine window x...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134,also use paralles run virtual machine window x...
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170,also use paralles run virtual machine window x...
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136,"toshiba handle repair seems vary , folk indica..."


# N_gram

In [8]:
count_vect_word = CountVectorizer(analyzer='word', ngram_range=(2,3), min_df=1, max_features=5000)
count_vect_word_aspect = CountVectorizer(analyzer='word', ngram_range=(1,4), min_df=1, max_features=500)

In [35]:
X = df[['clean_Sentence','Aspect Term']]

In [36]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
le = LabelEncoder()

# Fit and transform the 'polarity' column
y = le.fit_transform(df['polarity'])

# Retrieve the original class labels
class_labels = le.classes_

# Create a dictionary mapping class labels to encoded values
label_mapping = {class_label: encoded_value for class_label, encoded_value in zip(class_labels, range(len(class_labels)))}

# Display the label mapping
print("Class Label Mapping:")
for class_label, encoded_value in label_mapping.items():
    print(f"{class_label}: {encoded_value}")


Class Label Mapping:
negative: 0
neutral: 1
positive: 2


# Split the data

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train , X_test, y_train , y_test = train_test_split(X,y, test_size=0.33)

In [39]:
X_train

Unnamed: 0,clean_Sentence,Aspect Term
1222,loving netbook amazing screen display small li...,display
671,see macbook pro different may huge price tag c...,updates
1691,ilife software come computer simple use produc...,use
1582,'s also fairly easy use operating system,Operating System
1780,started randomly ceasing charge plugged ( mous...,charging
...,...,...
2261,trouble finding case would fit,case
2046,"highly rated , would like ? tried keyboard sto...",keyboard
113,10 hour battery life 're web browsing word edi...,movie playing
1605,battery lasting 6 hour surfing web sunday chec...,surfing the web


# SVM & DecisionTreeClassifier

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

clf_DecisionTreeClassifier = DecisionTreeClassifier()
clf_SVC = svm.SVC()

In [42]:
X_train_sentence = count_vect_word.fit_transform(X_train['clean_Sentence'])
X_train_aspect = count_vect_word_aspect.fit_transform(X_train['Aspect Term'])

In [43]:
from scipy.sparse import hstack
X_train_combined = hstack([X_train_sentence, X_train_aspect])

In [44]:
X_test_sentence = count_vect_word.transform(X_test['clean_Sentence'])
X_test_aspect = count_vect_word_aspect.fit_transform(X_test['Aspect Term'])

In [45]:
X_test_combined = hstack([X_test_sentence, X_test_aspect])

In [46]:
clf_SVC.fit(X_train_combined , y_train)
clf_DecisionTreeClassifier.fit(X_train_combined , y_train)

In [47]:
y_predict_DecisionTreeClassifier = clf_DecisionTreeClassifier.predict(X_test_combined)

y_predict_SVC = clf_SVC.predict(X_test_combined)

# Predict

In [48]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('accuracy_score ', accuracy_score(y_test, y_predict_DecisionTreeClassifier))
print('f1_score       ' , f1_score(y_test, y_predict_DecisionTreeClassifier ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_DecisionTreeClassifier, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_DecisionTreeClassifier , average='macro'))

accuracy_score  0.4620418848167539
f1_score        0.4302700163213388
precision_score 0.4738078273946491
recall_score    0.4385139081373459


In [49]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('accuracy_score ', accuracy_score(y_test, y_predict_SVC))
print('f1_score       ' , f1_score(y_test, y_predict_SVC ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_SVC, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_SVC , average='macro'))

accuracy_score  0.5458115183246073
f1_score        0.4814826525502798
precision_score 0.5952309241838258
recall_score    0.4860838164522631


In [26]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from scipy.sparse import hstack

# Define the custom dataset class
class MyCSVDataset(Dataset):
    def __init__(self, csv_file):
        self.dataframe = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sentence = self.dataframe.iloc[idx]['clean_Sentence']
        aspect = self.dataframe.iloc[idx]['Aspect Term']
        polarity = self.dataframe.iloc[idx]['polarity']
        return sentence, aspect, polarity

# Load the dataset
dataset = MyCSVDataset('/content/Laptop_Train_v2.csv')

# Create the dataloader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize TfidfVectorizer
tf_idf_vect = TfidfVectorizer(ngram_range = (1,3), max_features = 10000)

# Fit and transform the 'clean_Sentence' and 'Aspect Term' columns
X_train_sentence = tf_idf_vect.fit_transform(X_train['clean_Sentence'])
X_train_aspect = tf_idf_vect.transform(X_train['Aspect Term'])

X_test_sentence = tf_idf_vect.transform(X_test['clean_Sentence'])
X_test_aspect = tf_idf_vect.transform(X_test['Aspect Term'])

# Combine the 'clean_Sentence' and 'Aspect Term' TF-IDF vectors
X_train_combined = hstack([X_train_sentence, X_train_aspect])
X_test_combined = hstack([X_test_sentence, X_test_aspect])

# Define the classifiers
clf_SVC = svm.SVC()
clf_DecisionTreeClassifier = DecisionTreeClassifier()

# Fit the classifiers
clf_SVC.fit(X_train_combined , y_train)
clf_DecisionTreeClassifier.fit(X_train_combined , y_train)

# Make predictions
y_predict_SVC = clf_SVC.predict(X_test_combined)
y_predict_DecisionTreeClassifier = clf_DecisionTreeClassifier.predict(X_test_combined)

In [27]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('Metrics for SVC:')
print('accuracy_score ', accuracy_score(y_test, y_predict_SVC))
print('f1_score       ' , f1_score(y_test, y_predict_SVC ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_SVC, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_SVC , average='macro'))

print('\nMetrics for Decision Tree Classifier:')
print('accuracy_score ', accuracy_score(y_test, y_predict_DecisionTreeClassifier))
print('f1_score       ' , f1_score(y_test, y_predict_DecisionTreeClassifier ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_DecisionTreeClassifier, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_DecisionTreeClassifier , average='macro'))

Metrics for SVC:
accuracy_score  0.6727748691099477
f1_score        0.5602957495054759
precision_score 0.6573714356155053
recall_score    0.5811425411425412

Metrics for Decision Tree Classifier:
accuracy_score  0.662303664921466
f1_score        0.6233028801871997
precision_score 0.6291459306114081
recall_score    0.6203122003122002


# Test using example

In [50]:
import numpy as np

# Assuming X_test and y_test are your testing data and labels
random_index = np.random.randint(0, len(X_test))

# Select a random row from X_test
random_test_text = X_test.iloc[random_index]['clean_Sentence']
random_test_aspect = X_test.iloc[random_index]['Aspect Term']
print("sentence: ", random_test_text)
print("aspect: ", random_test_aspect)

# Transform the random test text using CountVectorizer
random_test_text_transformed = count_vect_word.transform([random_test_text])

# Transform the aspect using CountVectorizer for aspects
aspect_transformed = count_vect_word_aspect.transform([random_test_aspect])

# Concatenate the transformed text and aspect
combined_features = hstack([random_test_text_transformed, aspect_transformed])

# Predict using the classifier
prediction_SVC = clf_SVC.predict(combined_features)
prediction_DecisionTree = clf_DecisionTreeClassifier.predict(combined_features)

# Actual label from the test set
actual_label = y_test[random_index]

# Compare predictions with the actual label
print("Actual Label:", actual_label)
print("SVC Prediction:", prediction_SVC)
print("DecisionTree Prediction:", prediction_DecisionTree)


sentence:  enjoy toshib force durability unparalleled
aspect:  durability
Actual Label: 2
SVC Prediction: [0]
DecisionTree Prediction: [2]


In [None]:
import numpy as np

# Assuming X_test and y_test are your testing data and labels
random_index = np.random.randint(0, len(X_test))

# Select a random row from X_test
random_test_text = X_test.iloc[random_index]['clean_Sentence']
random_test_aspect = X_test.iloc[random_index]['Aspect Term']
print("sentence: ", random_test_text)
print("aspect: ", random_test_aspect)

# Transform the random test text using CountVectorizer
random_test_text_transformed = count_vect_word.transform([random_test_text])

# Transform the aspect using CountVectorizer for aspects
aspect_transformed = count_vect_word_aspect.transform([random_test_aspect])

# Concatenate the transformed text and aspect
combined_features = hstack([random_test_text_transformed, aspect_transformed])

# Predict using the classifier
prediction_SVC = clf_SVC.predict(combined_features)
prediction_DecisionTree = clf_DecisionTreeClassifier.predict(combined_features)

# Actual label from the test set
actual_label = y_test[random_index]

# Compare predictions with the actual label
print("Actual Label:", actual_label)
print("SVC Prediction:", prediction_SVC)
print("DecisionTree Prediction:", prediction_DecisionTree)


sentence:  also got added bonus 30 '' hd monitor , really help extend screen keep eye fresh
aspect:  30" HD Monitor
Actual Label: 3
SVC Prediction: [3]
DecisionTree Prediction: [3]


# Another Way
### Combine "sentence" and "aspect" into a new column "combined"

In [None]:

df['combined'] = df['clean_Sentence'] + ' ' + df['Aspect Term']
df.head()

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to,clean_Sentence,combined
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45,charge night skip taking cord good battery life,charge night skip taking cord good battery lif...
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86,charge night skip taking cord good battery life,charge night skip taking cord good battery lif...
2,1316,The tech guy then said the service center does...,service center,negative,27,41,tech guy said service center not 1-to-1 exchan...,tech guy said service center not 1-to-1 exchan...
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121,tech guy said service center not 1-to-1 exchan...,tech guy said service center not 1-to-1 exchan...
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12,tech guy said service center not 1-to-1 exchan...,tech guy said service center not 1-to-1 exchan...


In [None]:
count_vect_word = CountVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, max_features=10000)

X = df[['combined']]

X_train , X_test, y_train , y_test = train_test_split(X,y, test_size=0.33)

X_train

Unnamed: 0,combined
1563,"battery hold well , 's built solidly , run fas..."
555,"17 inch screen large , computer light 17 inch ..."
1955,enjoy toshib force durability unparalleled dur...
770,"though picture , video , music software nowher..."
21,"shop , macbooks encased soft rubber enclosure ..."
...,...
1200,battery n't last long 'm sure upgrade battery ...
1229,"screen graphic clarity , sharpness great sharp..."
39,"replace battery , couple month ago 's working ..."
1380,"course bought 3 year warranty , sent replaced ..."


In [None]:
clf_DecisionTreeClassifier_1 = DecisionTreeClassifier()
clf_SVC_1 = svm.SVC()


X_train_sentence_1 = count_vect_word.fit_transform(X_train['combined'])

X_test_sentence_1 = count_vect_word.transform(X_test['combined'])


clf_SVC.fit(X_train_sentence_1 , y_train)
clf_DecisionTreeClassifier.fit(X_train_sentence_1 , y_train)


y_predict_DecisionTreeClassifier = clf_DecisionTreeClassifier.predict(X_test_sentence_1)

y_predict_SVC = clf_SVC.predict(X_test_sentence_1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('accuracy_score ', accuracy_score(y_test, y_predict_DecisionTreeClassifier))
print('f1_score       ' , f1_score(y_test, y_predict_DecisionTreeClassifier ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_DecisionTreeClassifier, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_DecisionTreeClassifier , average='macro'))

accuracy_score  0.6585365853658537
f1_score        0.4750554667668207
precision_score 0.47647515855391265
recall_score    0.47597809991427015


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('accuracy_score ', accuracy_score(y_test, y_predict_SVC))
print('f1_score       ' , f1_score(y_test, y_predict_SVC ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_SVC, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_SVC , average='macro'))

accuracy_score  0.6765083440308087
f1_score        0.46413273387383425
precision_score 0.5155050926193695
recall_score    0.4658585651936715


  _warn_prf(average, modifier, msg_start, len(result))


# Test using example

In [None]:
import numpy as np

# Assuming X_test and y_test are your testing data and labels
random_index = np.random.randint(0, len(X_test))

# Select a random row from X_test
random_test_text = X_test.iloc[random_index]['combined']
random_test_aspect = random_test_text.split()[-1]
print("sentence: ", random_test_text)
print("aspect: ", random_test_aspect)

# Transform the random test text using CountVectorizer
random_test_text_transformed = count_vect_word.transform([random_test_text])

# Predict using the classifier
prediction_SVC = clf_SVC.predict(random_test_text_transformed)
prediction_DecisionTree = clf_DecisionTreeClassifier.predict(random_test_text_transformed)

# Actual label from the test set
actual_label = y_test[random_index]

# Compare predictions with the actual label
print("Actual Label:", actual_label)
print("SVC Prediction:", prediction_SVC)
print("DecisionTree Prediction:", prediction_DecisionTree)


sentence:  pc user use powerpoint program slide-show presentation mac user utilize keynote Powerpoint program
aspect:  program
Actual Label: 2
SVC Prediction: [2]
DecisionTree Prediction: [2]


#TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vect = TfidfVectorizer(ngram_range = (1,3), max_features = 10000)

In [None]:
clf_DecisionTreeClassifier_1 = DecisionTreeClassifier()
clf_SVC_1 = svm.SVC()


X_train_sentence_1 = tf_idf_vect.fit_transform(X_train['combined'])

X_test_sentence_1 = tf_idf_vect.transform(X_test['combined'])


clf_SVC.fit(X_train_sentence_1 , y_train)
clf_DecisionTreeClassifier.fit(X_train_sentence_1 , y_train)


y_predict_DecisionTreeClassifier = clf_DecisionTreeClassifier.predict(X_test_sentence_1)

y_predict_SVC = clf_SVC.predict(X_test_sentence_1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('accuracy_score ', accuracy_score(y_test, y_predict_DecisionTreeClassifier))
print('f1_score       ' , f1_score(y_test, y_predict_DecisionTreeClassifier ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_DecisionTreeClassifier, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_DecisionTreeClassifier , average='macro'))

accuracy_score  0.6225930680359435
f1_score        0.45295689871448064
precision_score 0.4500383625897122
recall_score    0.45674245966799154


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score , recall_score

print('accuracy_score ', accuracy_score(y_test, y_predict_SVC))
print('f1_score       ' , f1_score(y_test, y_predict_SVC ,average='macro'))
print('precision_score', precision_score(y_test, y_predict_SVC, average='macro'))
print('recall_score   ' , recall_score(y_test, y_predict_SVC , average='macro'))

accuracy_score  0.6983311938382541
f1_score        0.49258157808577974
precision_score 0.516111289459604
recall_score    0.4914167835710389


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import numpy as np

# Assuming X_test and y_test are your testing data and labels
random_index = np.random.randint(0, len(X_test))

# Select a random row from X_test
random_test_text = X_test.iloc[random_index]['combined']
random_test_aspect = random_test_text.split()[-1]
print("sentence: ", random_test_text)
print("aspect: ", random_test_aspect)

# Transform the random test text using CountVectorizer
random_test_text_transformed = count_vect_word.transform([random_test_text])

# Predict using the classifier
prediction_SVC = clf_SVC.predict(random_test_text_transformed)
prediction_DecisionTree = clf_DecisionTreeClassifier.predict(random_test_text_transformed)

# Actual label from the test set
actual_label = y_test[random_index]

# Compare predictions with the actual label
print("Actual Label:", actual_label)
print("SVC Prediction:", prediction_SVC)
print("DecisionTree Prediction:", prediction_DecisionTree)


sentence:  great product apple new great looking design design
aspect:  design
Actual Label: 3
SVC Prediction: [3]
DecisionTree Prediction: [3]
