In [35]:
from numpy import loadtxt
import pandas as pd

# Training rows. The dev CSV is used here; to train on the full training
# split instead, swap in the commented line below.
data_train = pd.read_csv('sentiment_dataset_dev.csv')
# data_train = pd.read_csv('sentiment_dataset_train.csv')

# Rows to predict ratings for.
data_test = pd.read_csv('sentiment_dataset_test.csv')

# Stack the train rows followed by the test rows so the text features are
# built over both files at once. DataFrame.append was removed in pandas 2.0;
# pd.concat is the drop-in replacement and, like append, keeps each frame's
# own row labels (so labels repeat). Later code slices this frame by position,
# with the first len(data_train) rows being the training rows.
dataset = pd.concat([data_train, data_test])

# select only relevant columns
#dataset = dataset[["review", "rating"]]

# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise every review once (plain split on single spaces). The same token
# lists are used both to train the model and to infer the per-review vectors.
tokenized_reviews = [text.split(" ") for text in dataset["review"]]
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized_reviews)]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# Infer one vector per review and lay the vectors out as columns. Building the
# frame straight from the list of vectors avoids creating a pandas Series per
# row, and reusing dataset.index keeps the rows aligned for the concat below.
doc2vec_df = pd.DataFrame(
    [model.infer_vector(tokens) for tokens in tokenized_reviews],
    index=dataset.index,
)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
dataset = pd.concat([dataset, doc2vec_df], axis=1)


# add tf-idfs columns
# TF (term frequency) counts how many times a word appears in a text.
# IDF (inverse document frequency) weights a word by how many texts it appears in: the fewer texts contain it, the higher its weight.
# We add a TF-IDF column for every word that appears in at least 10 different texts, which filters out rare words and reduces the size of the final output.
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF over all reviews, keeping only terms found in at least 10 of them.
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(dataset["review"]).toarray()

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# (available since 1.0) and fall back to the old name on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = tfidf.get_feature_names_out()
else:
    vocabulary = tfidf.get_feature_names()

# One "word_<term>" column per vocabulary entry, on the same row labels as
# `dataset` so the column-wise concat lines the rows up.
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(term) for term in vocabulary],
    index=dataset.index,
)
dataset = pd.concat([dataset, tfidf_df], axis=1)

# Feature selection: every column of `dataset` is a model input except the
# target itself and the "id" / "review" columns.
label = "rating"
ignore_cols = [label, "id", "review"]
features = dataset.columns[~dataset.columns.isin(ignore_cols)].tolist()

# split the data into train and test for test accuracy
from sklearn.ensemble import RandomForestClassifier
# Alternative kept for reference: a random hold-out split, which would also
# yield y_test for scoring.
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(dataset[features], dataset[label], test_size = 0.20, random_state = 1)  # 80% training and 20% test

# `dataset` holds the train rows first and the test rows after them, so the
# split is purely positional: the first len(data_train) rows train the model
# and the remaining rows are the ones to predict.
feature_matrix = dataset[features]
X_train = feature_matrix.iloc[:len(data_train)]
X_test = feature_matrix.iloc[len(data_train):]
y_train = dataset[label].iloc[:len(data_train)]



# classification model we are going to use is the logistic regression
from sklearn.linear_model import LogisticRegression
# Build the logistic-regression classifier (at most 600 solver iterations) and
# fit it on the training rows in a single expression; fit() returns the
# fitted estimator, so `clf` ends up bound to the trained model.
clf = LogisticRegression(max_iter=600).fit(X_train, y_train)
# Predict a rating for every test row
y_pred = clf.predict(X_test)
# Accuracy is not computed here: no y_test is defined unless the
# train_test_split variant of the split is used.
# print("Accuracy:",clf.score(X_test, y_test))

# Attach the predictions to the test rows. X_test was taken from the rows that
# follow the training rows in `dataset`, i.e. the rows of data_test in their
# original order, so y_pred lines up with data_test row for row. assign()
# attaches the array by position, whereas a column-wise concat aligns on index
# labels and would misalign if data_test did not carry a default 0..n-1 index.
data_test_result = data_test.assign(predictated_rating=y_pred)
print("Test Result Data with Predictated Ratings:")
print(data_test_result)
# To save the result as a CSV file (file_name is a path of your choice):
# data_test_result.to_csv(file_name, sep=',', encoding='utf-8')




Test Result Data with Predictated Review:
        id  ... predictated_rating
0        0  ...                2.0
1        1  ...                5.0
2        3  ...                1.0
3        4  ...                2.0
4        6  ...                3.0
...    ...  ...                ...
6495  7494  ...                2.0
6496  7496  ...                3.0
6497  7497  ...                3.0
6498  7498  ...                2.0
6499  7499  ...                2.0

[6500 rows x 3 columns]
