In [49]:
class article: 
    def __init__(self, title, sent):
        self.title = title
        self.sent = sent

In [56]:
file_path = "./data/all-data.csv"

sentences = []
sentiments = []
data_lines = []

# stores article objects 
articles = []
print(len(articles))

try:
    with open(file_path, encoding='latin1') as f:
        # f.readlines() reads the entire file into a list of strings, one for each line
        data_lines = f.readlines()
        print(f"Successfully read {len(data_lines)} lines from {file_path}")

except FileNotFoundError:
    print(f"ERROR: The file '{file_path}' was not found. Please check the file path.")

for line in data_lines:
    # Split each line at the first comma
    parts = line.strip().split(',', 1)
    
    # The rest of your logic was correct!
    if len(parts) == 2:
        sentiment = parts[0]
        sentence = parts[1].strip(' "') # Removes quotes and spaces from the ends
        
        sentiments.append(sentiment)
        sentences.append(sentence)

        articles.append(article(sentence, sentiment))
        

print(len(articles))

# MORE DATA!!!
file_path1 = "./data/sentences_AllAgree.txt"

data_lines1 = []

try:
    with open(file_path1, encoding='utf-8') as f:
        data_lines1 = f.readlines()
        print(f"Successfully read {len(data_lines1)} lines from {file_path1}")
except IOError as e:
    print(f"ERROR: An error occurred while writing to the file '{file_path1}': {e}")

for line in data_lines1: 
    parts = line.strip().split('.@', 1)

    if len(parts) == 2:
        sentiment = parts[1]
        sentence = parts[0]
        
        sentiments.append(sentiment)
        sentences.append(sentence)

        articles.append(article(sentence, sentiment))
        
print(len(articles))

0
Successfully read 4846 lines from ./data/all-data.csv
4846
Successfully read 2264 lines from ./data/sentences_AllAgree.txt
7054


In [61]:
from sklearn.model_selection import train_test_split 

training, test = train_test_split(articles, test_size = 0.2, random_state = 42)

print(len(training))
print(len(test))

print(training[0])

5643
1411
<__main__.article object at 0x3010494d0>


In [62]:
# x is what we pass in (the title) y is what we get (the sentiment) 
train_x = [x.title for x in training]
train_y = [x.sent for x in training]

test_x = [x.title for x in test]
test_y = [y.sent for y in test]

train_x[0]
train_y[0]

'negative'

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

# makes a new instance of the vectorizer 
vectorizer = CountVectorizer()

# vectorization stage 
#
# this method is composed of two steps: 
# 1. fit: builds a vocabulary vector from the training data
# 2. transform: transforms the training data into a document-term matrix
#
# iterates through all the titles in the train_x list and builds a complete dictionary of all 
# the unique words it encounters. Then makes a matrix that looks like this: 
#
# ex:              I     love    hate    cars 
# I love cars:     1       1      0        1
# I hate cars:     1       0      1        0
#
# Although the finished product is a matrix, each row corresponds to a article title
# train_x_vectors is a "sparce matrix" (basically a more efficient 2d array where the 0 values are not stored)
train_x_vectors = vectorizer.fit_transform(train_x)

# we dont want to fit the vectorizer again on the test data, we just want to transform it
# so we use the same vocabulary that was built from the training data
test_x_vectors = vectorizer.transform(test_x)

print(train_x_vectors.shape)  
print(train_x_vectors[0])



(5643, 9289)
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 14 stored elements and shape (1, 9289)>
  Coords	Values
  (0, 5972)	1
  (0, 5134)	1
  (0, 1001)	1
  (0, 8456)	2
  (0, 3098)	2
  (0, 5533)	2
  (0, 2079)	1
  (0, 6619)	1
  (0, 5905)	2
  (0, 4305)	1
  (0, 8359)	1
  (0, 7442)	1
  (0, 6746)	1
  (0, 206)	1


Support Vector Machine

In [64]:
from sklearn.svm import SVC

# there are many different algorithms that can be used to classify data,
# such as decision trees, random forests, logistic regression, etc.
# here is using support vector machine (SVM) classifier

# fitting the model: the main step
#
# choosing a new, untrained classifier algorithm and train it to find patterns 
# between numerical data and labels 
#
# SVC = support vector classifier machine learning algorithm 
# clf_svm is the classifier object that can be used to make predictions later 
# kernel is a parameter that defines the type of decision boundary to be used. 
# this only pretains to SVM classifiers 
#
# kernel = 'linear' means that the algorithm will try to find a linear decision 
# boundary (where the data is seperated into 2 classes divided by a straight line
# and to predict, it will use the linear decision boundary to classify new data). 
# for non linear (rbf, poly, etc.) the algorithm will try to find a non-linear 
# decision boundary
clf_svm = SVC(kernel='linear')

# fitting this classifier to the training data 
clf_svm.fit(train_x_vectors, train_y)

test_x[0]
test_x_vectors[0]
print(clf_svm.predict(test_x_vectors[0]))
print(clf_svm.predict(test_x_vectors[56]))
print(clf_svm.predict(test_x_vectors[100]))
print(clf_svm.predict(test_x_vectors[500]))

['positive']
['positive']
['positive']
['neutral']


Decision Tree

In [66]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['positive'], dtype='<U8')

Naive Bayes

In [67]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['positive'], dtype='<U8')

Logistic

In [68]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(max_iter=1000)
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])

array(['positive'], dtype='<U8')

In [69]:
# Model Evaluation 

print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(train_x_vectors, train_y))

from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels = ['positive', 'neutral', 'negative']))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average = None, labels = ['positive', 'neutral', 'negative']))
print(f1_score(test_y, clf_gnb.predict(test_x_vectors), average = None, labels = ['positive', 'neutral', 'negative']))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average = None, labels = ['positive', 'neutral', 'negative']))

# from the output it looks as though all the classifiers perform equally as bad 
# so this might be a data issue rather than a model issue 

0.8462083628632175
0.8454996456413891
0.8447909284195606
0.9930887825624668
[0.77690289 0.88824214 0.78947368]
[0.7688172  0.88939567 0.78395062]
[0.7585266  0.89164786 0.78233438]
[0.79893475 0.90691034 0.825     ]
