## Step 1 and 2: Load the Data

In [None]:
# Load the three tab-separated "sentence<TAB>label" files into DataFrames
import csv

import pandas as pd

# Folder holding the sentiment-labelled sentence files
# (presumably a mounted Google Drive in Colab -- adjust when running elsewhere).
DATA_DIR = "/content/drive/MyDrive/CSCI5750/sentiment labelled sentences"


def load_labelled_sentences(filename):
    """Read one labelled-sentences file from DATA_DIR.

    Parameters
    ----------
    filename : str
        Name of the file inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'sentence' and 'label', in file order.
    """
    return pd.read_csv(
        f"{DATA_DIR}/{filename}",
        sep="\t",
        names=["sentence", "label"],
        # Treat '"' as an ordinary character. With the default quoting, a field
        # that opens with an unmatched double quote swallows the following lines,
        # silently dropping rows.
        # NOTE(review): the IMDb file previously loaded as 748 rows; the dataset
        # is documented as 1000 sentences per source -- confirm after re-running.
        quoting=csv.QUOTE_NONE,
    )


amazon_data = load_labelled_sentences("amazon_cells_labelled.txt")

imdb_data = load_labelled_sentences("imdb_labelled.txt")

yelp_data = load_labelled_sentences("yelp_labelled.txt")

## Step 3: Tokenization

In [None]:
# Count how many Yelp rows carry label 0 and how many carry label 1
count_class_0, count_class_1 = (
    yelp_data["label"].eq(label_value).sum() for label_value in (0, 1)
)

print("Frequency of class 0:", count_class_0, "Frequency of class 1:", count_class_1)

Frequency of class 0: 500 Frequency of class 1: 500


In [None]:
# Count how many Amazon rows carry label 0 and how many carry label 1
count_class_0, count_class_1 = (
    amazon_data["label"].eq(label_value).sum() for label_value in (0, 1)
)

print("Frequency of class 0:", count_class_0, "Frequency of class 1:", count_class_1)

Frequency of class 0: 500 Frequency of class 1: 500


In [None]:
# Count how many IMDb rows carry label 0 and how many carry label 1
count_class_0, count_class_1 = (
    imdb_data["label"].eq(label_value).sum() for label_value in (0, 1)
)

print("Frequency of class 0:", count_class_0, "Frequency of class 1:", count_class_1)

Frequency of class 0: 362 Frequency of class 1: 386


## Step 4: Data Preprocessing and Word Count Table

In [None]:
# Corpus for Yelp: the 'sentence' column as a plain Python list
yelp_sentence = yelp_data["sentence"].tolist()

In [None]:
# Preview the first 20 Yelp documents; the full list is far too long to be
# useful as cell output (its length is reported separately with len()).
yelp_sentence[:20]

['Wow... Loved this place.',
 'Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.',
 'Service was very prompt.',
 'Would not go back.',
 'The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.',
 'I tried the Cape Cod ravoli, chicken,with cranberry...mmmm!',
 'I was disgusted because I was pretty sure that was human hair.',
 'I was shocked because no signs indicate cash only.',
 'Highly recommended.',
 'Waitress was a little slow in service.',
 'This place is not worth your time, let alone Vegas.',
 'did not like at all.'

In [None]:
# Number of documents in the Yelp corpus (length of the yelp_sentence list)
len(yelp_sentence)

1000

In [None]:
# Corpus for Amazon: the 'sentence' column as a plain Python list
amazon_sentence = amazon_data["sentence"].tolist()

In [None]:
# Preview the first 20 Amazon documents; the full list is far too long to be
# useful as cell output (its length is reported separately with len()).
amazon_sentence[:20]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.',
 'I have to jiggle the plug to get it to line up right to get decent volume.',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
 'If you are Razr owner...you must have this!',
 'Needless to say, I wasted my money.',
 'What a waste of money and time!.',
 'And the sound quality is great.',
 'He was very impressed when going from the original battery to the extended battery.',
 'If the two were seperated by a mere 5+ ft I started to notice excessive static and garbled sound from the headset.',
 'Very good quality though',
 'The design is very odd, as the ear "clip" is not very comfortable at all.',
 'Highly recommend for any one who has a blue tooth phone.',
 'I advise EVERYO

In [None]:
# Number of documents in the Amazon corpus (length of the amazon_sentence list)
len(amazon_sentence)

1000

In [None]:
# Corpus for IMDb: the 'sentence' column as a plain Python list
imdb_sentence = imdb_data["sentence"].tolist()

In [None]:
# Preview the first 20 IMDb documents; the full list is far too long to be
# useful as cell output (its length is reported separately with len()).
imdb_sentence[:20]

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
 'Very little music or anything to speak of.  ',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
 'Wasted two hours.  ',
 'Saw the movie today and thought it was a good effort, good messages for kids.  ',
 'A bit predictable.  ',
 'Loved the casting of Jimmy Buffet as the science teacher.  ',
 'And those baby owls were adorable.  ',
 "The movie showed a lot of Florida at it's best, made it look very appealing.  ",
 'The Son

In [None]:
# Number of documents in the IMDb corpus (length of the imdb_sentence list)
len(imdb_sentence)

748

### Perform analysis on the Yelp Dataset

In [None]:
# Try scikit-learn's CountVectorizer on the first two Yelp sentences to see the
# word -> column-index vocabulary it builds.
from sklearn.feature_extraction.text import CountVectorizer

yelp_demo_sentences = yelp_sentence[0:2]
print("Test sequence: ", yelp_demo_sentences)

# Punctuation is dropped by the tokenizer and English stop words are filtered out.
# NOTE(review): lowercase=False keeps the original casing, so capitalised stop
# words may not match the lower-case stop list -- confirm this is intended.
vectorizer = CountVectorizer(
    stop_words='english',
    lowercase=False,
    min_df=0.0,
)
vectorizer.fit(yelp_demo_sentences)
print("Vocabulary: ", vectorizer.vocabulary_)

Test sequence:  ['Wow... Loved this place.', 'Crust is not good.']
Vocabulary:  {'Wow': 2, 'Loved': 1, 'place': 4, 'Crust': 0, 'good': 3}


In [None]:
## Hold out 20% of the Yelp sentences for testing (80/20 split, fixed seed)
from sklearn.model_selection import train_test_split

yelp_labels = yelp_data["label"]

# train_test_split returns [train sentences, test sentences, train labels, test labels]
yelp_split = train_test_split(yelp_sentence, yelp_labels, random_state=42, test_size=0.2)
sentences_train_yelp, sentences_test_yelp, labels_train_yelp, labels_test_yelp = yelp_split

In [None]:
## Learn the Yelp vocabulary from the training sentences
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    stop_words='english',  # filter words on the built-in English stop list
    lowercase=False,       # keep each token's original casing
    min_df=0.0,            # no minimum document-frequency cut-off
)
# Only the training split is passed to fit(); the test split is not used here.
vectorizer.fit(sentences_train_yelp)

In [None]:
## Inspect the vocabulary held by `vectorizer`: full mapping, then words, then indices
yelp_vocabulary = vectorizer.vocabulary_
print("Vocabulary: ", yelp_vocabulary)
print("Vocabulary words: ", yelp_vocabulary.keys())
print("Vocabulary index: ", yelp_vocabulary.values())

Vocabulary index:  dict_values([376, 1804, 1475, 1482, 31, 835, 1237, 1440, 843, 149, 418, 1158, 1518, 1551, 1665, 1624, 1701, 988, 210, 950, 842, 942, 1020, 792, 708, 1788, 104, 381, 565, 909, 508, 914, 1666, 1812, 1759, 32, 1688, 1611, 45, 166, 901, 1038, 1348, 1196, 760, 1749, 1762, 1007, 912, 1399, 610, 504, 1601, 1175, 742, 653, 578, 146, 1363, 1330, 940, 910, 1452, 799, 65, 1699, 1708, 941, 933, 1779, 123, 1505, 57, 1617, 1293, 1412, 811, 947, 1792, 938, 622, 1476, 450, 336, 1391, 1268, 1087, 1122, 384, 952, 672, 251, 745, 1513, 201, 1227, 557, 1041, 1579, 1325, 1246, 1486, 1801, 419, 985, 1362, 1387, 250, 167, 1474, 790, 1649, 1327, 1424, 1328, 1506, 265, 1659, 1229, 262, 805, 1352, 1605, 975, 416, 1483, 1201, 883, 924, 1111, 1265, 673, 199, 1819, 1588, 1370, 1728, 1504, 401, 1654, 754, 1078, 217, 833, 1028, 1231, 1814, 1134, 1647, 1255, 1631, 617, 1770, 944, 532, 858, 1381, 869, 1140, 882, 539, 1074, 498, 820, 715, 454, 798, 1310, 24, 1402, 1447, 675, 1747, 756, 420, 471, 973, 

In [None]:
## Turn every Yelp sentence into a dense word-count row (train first, then test)
X_train, X_test = (
    vectorizer.transform(sentences).toarray()
    for sentences in (sentences_train_yelp, sentences_test_yelp)
)
print("Training matrix shape", X_train.shape)
print("Testing matrix shape", X_test.shape)

Training matrix shape (800, 1820)
Testing matrix shape (200, 1820)


In [None]:
# Write the word-count table of the Yelp training sentences to a local CSV
yelp_feature_counts = vectorizer.transform(sentences_train_yelp)
# one column per vocabulary term, labelled with the term itself
yelp_term_columns = vectorizer.get_feature_names_out()
yelp_feature_counts_df = pd.DataFrame(data=yelp_feature_counts.toarray(), columns=yelp_term_columns)
# index=False: the row index is not written to the file
yelp_feature_counts_df.to_csv('yelp_feature_counts.csv', index=False)
yelp_feature_counts_df

Unnamed: 0,10,100,12,17,1979,20,2007,30,30s,35,...,wrong,ya,yay,year,years,yellow,yucky,yum,yummy,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Perform analysis on the Amazon Dataset

In [None]:
# Try scikit-learn's CountVectorizer on the first two Amazon sentences to see the
# word -> column-index vocabulary it builds.
from sklearn.feature_extraction.text import CountVectorizer

amazon_demo_sentences = amazon_sentence[0:2]
print("Test sequence: ", amazon_demo_sentences)

# Punctuation is dropped by the tokenizer and English stop words are filtered out.
# NOTE(review): lowercase=False keeps the original casing, so capitalised stop
# words such as 'So' stay in the vocabulary -- confirm this is intended.
vectorizer = CountVectorizer(
    stop_words='english',
    lowercase=False,
    min_df=0.0,
)
vectorizer.fit(amazon_demo_sentences)
print("Vocabulary: ", vectorizer.vocabulary_)

Test sequence:  ['So there is no way for me to plug it in here in the US unless I go by a converter.', 'Good case, Excellent value.']
Vocabulary:  {'So': 2, 'way': 9, 'plug': 6, 'US': 3, 'unless': 7, 'converter': 5, 'Good': 1, 'case': 4, 'Excellent': 0, 'value': 8}


In [None]:
## Hold out 20% of the Amazon sentences for testing (80/20 split, fixed seed)
from sklearn.model_selection import train_test_split

amazon_labels = amazon_data["label"]

# train_test_split returns [train sentences, test sentences, train labels, test labels]
amazon_split = train_test_split(amazon_sentence, amazon_labels, random_state=42, test_size=0.2)
sentences_train_amazon, sentences_test_amazon, labels_train_amazon, labels_test_amazon = amazon_split

In [None]:
## Learn the Amazon vocabulary from the training sentences
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    stop_words='english',  # filter words on the built-in English stop list
    lowercase=False,       # keep each token's original casing
    min_df=0.0,            # no minimum document-frequency cut-off
)
# Only the training split is passed to fit(); the test split is not used here.
vectorizer.fit(sentences_train_amazon)

In [None]:
## Inspect the vocabulary held by `vectorizer`: full mapping, then words, then indices
amazon_vocabulary = vectorizer.vocabulary_
print("Vocabulary: ", amazon_vocabulary)
print("Vocabulary words: ", amazon_vocabulary.keys())
print("Vocabulary index: ", amazon_vocabulary.values())

Vocabulary index:  dict_values([139, 992, 617, 45, 1589, 789, 1021, 1065, 700, 706, 1018, 1079, 137, 264, 580, 1657, 1634, 995, 1327, 871, 915, 826, 632, 808, 202, 998, 1320, 306, 1599, 187, 421, 452, 311, 1707, 1472, 64, 1624, 386, 543, 787, 248, 1281, 579, 1264, 703, 1084, 1252, 1475, 1495, 1101, 1423, 146, 754, 1160, 587, 854, 792, 502, 523, 258, 886, 955, 1648, 1332, 553, 982, 1395, 186, 600, 273, 797, 1109, 1190, 1582, 1500, 958, 528, 1699, 1378, 1227, 588, 1709, 483, 138, 214, 648, 1667, 1307, 1016, 506, 1639, 203, 1536, 562, 365, 1370, 1355, 1334, 1700, 439, 1603, 1620, 866, 1172, 318, 540, 619, 1427, 1137, 132, 1051, 1543, 1535, 775, 1697, 441, 218, 316, 1110, 1666, 1139, 565, 810, 86, 1338, 952, 727, 1083, 1606, 597, 690, 991, 453, 718, 1438, 226, 904, 422, 545, 755, 1616, 1305, 1278, 1465, 521, 984, 678, 1416, 685, 976, 393, 1312, 1013, 1257, 1090, 807, 1432, 442, 1661, 1578, 1175, 686, 612, 1115, 987, 1649, 1005, 830, 234, 860, 1194, 986, 892, 649, 634, 1116, 1076, 701, 395,

In [None]:
## Turn every Amazon sentence into a dense word-count row (train first, then test)
X_train, X_test = (
    vectorizer.transform(sentences).toarray()
    for sentences in (sentences_train_amazon, sentences_test_amazon)
)
print("Training matrix shape", X_train.shape)
print("Testing matrix shape", X_test.shape)

Training matrix shape (800, 1713)
Testing matrix shape (200, 1713)


In [None]:
# Write the word-count table of the Amazon training sentences to a local CSV
amazon_feature_counts = vectorizer.transform(sentences_train_amazon)
# one column per vocabulary term, labelled with the term itself
amazon_term_columns = vectorizer.get_feature_names_out()
amazon_feature_counts_df = pd.DataFrame(data=amazon_feature_counts.toarray(), columns=amazon_term_columns)
# index=False: the row index is not written to the file
amazon_feature_counts_df.to_csv('amazon_feature_counts.csv', index=False)
amazon_feature_counts_df

Unnamed: 0,10,100,11,12,13,15,15g,20,2000,2005,...,worth,worthless,worthwhile,wouldn,wrong,year,years,yell,z500a,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Perform analysis on the IMDb Dataset

In [None]:
# Try scikit-learn's CountVectorizer on the first two IMDb sentences to see the
# word -> column-index vocabulary it builds.
from sklearn.feature_extraction.text import CountVectorizer

imdb_demo_sentences = imdb_sentence[0:2]
print("Test sequence: ", imdb_demo_sentences)

# Punctuation is dropped by the tokenizer and English stop words are filtered out.
# NOTE(review): lowercase=False keeps the original casing, so capitalised stop
# words such as 'Not' stay in the vocabulary -- confirm this is intended.
vectorizer = CountVectorizer(
    stop_words='english',
    lowercase=False,
    min_df=0.0,
)
vectorizer.fit(imdb_demo_sentences)
print("Vocabulary: ", vectorizer.vocabulary_)

Test sequence:  ['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ', 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ']
Vocabulary:  {'slow': 13, 'moving': 11, 'aimless': 1, 'movie': 10, 'distressed': 4, 'drifting': 5, 'young': 16, 'man': 9, 'Not': 0, 'sure': 14, 'lost': 8, 'flat': 6, 'characters': 3, 'audience': 2, 'nearly': 12, 'half': 7, 'walked': 15}


In [None]:
## Hold out 20% of the IMDb sentences for testing (80/20 split, fixed seed)
from sklearn.model_selection import train_test_split

imdb_labels = imdb_data["label"]

# train_test_split returns [train sentences, test sentences, train labels, test labels]
imdb_split = train_test_split(imdb_sentence, imdb_labels, random_state=42, test_size=0.2)
sentences_train_imdb, sentences_test_imdb, labels_train_imdb, labels_test_imdb = imdb_split

In [None]:
## Learn the IMDb vocabulary from the training sentences
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    stop_words='english',  # filter words on the built-in English stop list
    lowercase=False,       # keep each token's original casing
    min_df=0.0,            # no minimum document-frequency cut-off
)
# Only the training split is passed to fit(); the test split is not used here.
vectorizer.fit(sentences_train_imdb)

In [None]:
## Inspect the vocabulary held by `vectorizer`: full mapping, then words, then indices
imdb_vocabulary = vectorizer.vocabulary_
print("Vocabulary: ", imdb_vocabulary)
print("Vocabulary words: ", imdb_vocabulary.keys())
print("Vocabulary index: ", imdb_vocabulary.values())

Vocabulary:  {'This': 592, 'film': 1363, 'highlights': 1535, 'fundamental': 1423, 'flaws': 1385, 'legal': 1702, 'process': 2046, 'discovering': 1158, 'guilt': 1489, 'innocence': 1610, 'presents': 2035, 'better': 827, 'court': 1046, 'movie': 1841, 'suffered': 2416, 'writing': 2735, 'needed': 1864, 'suspense': 2438, 'Very': 632, 'disappointing': 1153, 'There': 587, 'moments': 1826, 'just': 1659, 'didn': 1140, 'need': 1863, 'excruciatingly': 1286, 'slow': 2311, 'moving': 1843, 've': 2630, 'seen': 2238, 'soap': 2320, 'operas': 1908, 'intelligent': 1627, 'liked': 1719, 'While': 659, 'don': 1180, 'hear': 1517, 'Mickey': 411, 'speak': 2342, 'tons': 2519, 'sound': 2337, 'effects': 1221, 'music': 1849, 'granted': 1474, 'huge': 1566, 'crowd': 1067, 'pleaser': 1995, '1928': 6, 'The': 584, 'characters': 928, 'fleshed': 1386, 'surprisingly': 2434, 'particularly': 1946, 'Grimes': 269, 'Blake': 95, 'actors': 700, 'deliver': 1105, 'sharply': 2266, 'scripted': 2227, 'lines': 1725, 'right': 2173, 'deadp

In [None]:
## Turn every IMDb sentence into a dense word-count row (train first, then test)
X_train, X_test = (
    vectorizer.transform(sentences).toarray()
    for sentences in (sentences_train_imdb, sentences_test_imdb)
)
print("Training matrix shape", X_train.shape)
print("Testing matrix shape", X_test.shape)

Training matrix shape (598, 2750)
Testing matrix shape (150, 2750)


In [None]:
# Write the word-count table of the IMDb training sentences to a local CSV
imdb_feature_counts = vectorizer.transform(sentences_train_imdb)
# one column per vocabulary term, labelled with the term itself
imdb_term_columns = vectorizer.get_feature_names_out()
imdb_feature_counts_df = pd.DataFrame(data=imdb_feature_counts.toarray(), columns=imdb_term_columns)
# index=False: the row index is not written to the file
imdb_feature_counts_df.to_csv('imdb_feature_counts.csv', index=False)
imdb_feature_counts_df

Unnamed: 0,10,12,13,15,15pm,18th,1928,1947,1948,1971,...,yeah,year,years,yelps,young,younger,youthful,youtube,zillion,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
