In [103]:
import csv
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [104]:
# Step 1: Load the data

crtDir =  os.getcwd()
fileName = os.path.join(crtDir, 'data', 'reviews_mixed.csv')

data = []
with open(fileName) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            dataNames = row
        else:
            data.append(row)
        line_count += 1

inputs = [data[i][0] for i in range(len(data))][:100]
outputs = [data[i][1] for i in range(len(data))][:100]
labelNames = list(set(outputs))

print(inputs[:2])
print(labelNames[:2])

['The rooms are extremely small, practically only a bed.', 'Room safe did not work.']
['negative', 'positive']


In [127]:
# Step 2: Split the data

import numpy as np

np.random.seed(5)
# noSamples = inputs.shape[0]
noSamples = len(inputs)
indexes = [i for i in range(noSamples)]
trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace = False)
testSample = [i for i in indexes  if not i in trainSample]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [inputs[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]

print(trainInputs[:3])
print(testInputs[:3])

['Just to give you an idea: the shutters of the windows were not working, did not go neither up or down - just hanging down only one side and the other up....', 'and hip and CLEAN!', "Toilet paper wasn't replaced everyday!"]
['The bed is very comfortable.', 'Very spacious rooms, quiet and very comfortable.', 'Corridors filthy\nRoom filthy\nElectrical cables in room not safe\nWhole building smelly\nShower repulsive']


In [134]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# Step 3: Feature extraction
# Mod 1: TF_IDF

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=100)

# trainFeatures = vectorizer.fit_transform(trainInputs)
# testFeatures = vectorizer.transform(testInputs)

#Mod 2: Doc2Vec
tagged_train_texts = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(trainInputs)]
tagged_test_texts = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(testInputs)]

#fiecare document: vector, size=100
doc2vec_model = Doc2Vec(vector_size=100, min_count=2, epochs=100, workers=4)
doc2vec_model.build_vocab(tagged_train_texts)
doc2vec_model.train(tagged_train_texts, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

trainFeatures = np.array([doc2vec_model.infer_vector(text.split()) for text in trainInputs])
testFeatures = np.array([doc2vec_model.infer_vector(text.split()) for text in testInputs])




In [107]:
from MyANN import MyANN


In [135]:
# Step 4: Train a model

from sklearn.linear_model import LogisticRegression

supervisedClassifier = LogisticRegression(max_iter=1000)

supervisedClassifier.fit(trainFeatures, trainOutputs)

###########################################################

#Train my model
from sklearn.preprocessing import LabelEncoder

model = MyANN()

label_encoder = LabelEncoder()
trainOutputsModel = label_encoder.fit_transform(trainOutputs)
testOutputsModel = label_encoder.transform(testOutputs)

model.fit(trainFeatures.toarray(), trainOutputsModel)

Iteration 0, loss = 0.7035
Iteration 10, loss = 0.7025
Iteration 20, loss = 0.7016
Iteration 30, loss = 0.7006
Iteration 40, loss = 0.6997
Iteration 50, loss = 0.6988
Iteration 60, loss = 0.6978
Iteration 70, loss = 0.6969
Iteration 80, loss = 0.6960
Iteration 90, loss = 0.6951
Iteration 100, loss = 0.6943
Iteration 110, loss = 0.6934
Iteration 120, loss = 0.6925
Iteration 130, loss = 0.6916
Iteration 140, loss = 0.6908
Iteration 150, loss = 0.6899
Iteration 160, loss = 0.6890
Iteration 170, loss = 0.6882
Iteration 180, loss = 0.6874
Iteration 190, loss = 0.6865
Iteration 200, loss = 0.6857
Iteration 210, loss = 0.6849
Iteration 220, loss = 0.6841
Iteration 230, loss = 0.6833
Iteration 240, loss = 0.6825
Iteration 250, loss = 0.6817
Iteration 260, loss = 0.6809
Iteration 270, loss = 0.6801
Iteration 280, loss = 0.6793
Iteration 290, loss = 0.6786
Iteration 300, loss = 0.6778
Iteration 310, loss = 0.6771
Iteration 320, loss = 0.6763
Iteration 330, loss = 0.6756
Iteration 340, loss = 0.6

In [136]:
#Step 5: Testare model
computedTestOutputs = supervisedClassifier.predict(testFeatures)

from sklearn.metrics import accuracy_score

print("acc: ", accuracy_score(testOutputs, computedTestOutputs))

acc:  0.7


In [137]:
#Step 5: Testare model Manual
computedTestOutputsManual = model.predict(testFeatures.toarray())

from sklearn.metrics import accuracy_score

print("acc: ", accuracy_score(testOutputsModel, computedTestOutputsManual))

acc:  0.55


In [125]:
#Step 6: Predict sentiment for a text

input_text = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."]

input_features = vectorizer.transform(input_text)

predicted_sentiment = supervisedClassifier.predict(input_features)
predicted_sentimentManual = model.predict(input_features.toarray())


print(predicted_sentiment)
print(predicted_sentimentManual)


NotFittedError: The TF-IDF vectorizer is not fitted

In [138]:
#Step 6: Predict sentiment for a text using Doc2Vec

input_text = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."]
input_features = np.array([doc2vec_model.infer_vector(text.split()) for text in input_text])
predicted_sentiment = supervisedClassifier.predict(input_features)
predicted_sentimentManual = model.predict(input_features)


print(predicted_sentiment)
print(predicted_sentimentManual)


['negative']
[[0]]
