In [5]:
from numpy import loadtxt
import pandas as pd

# Load the three splits of the sentiment dataset.
# - train: used to fit the classifier
# - dev:   used to measure accuracy
# - test:  rows for which ratings are predicted at the end of the cell
data_train = pd.read_csv('sentiment_dataset_train.csv')
data_dev = pd.read_csv('sentiment_dataset_dev.csv')
data_test = pd.read_csv('sentiment_dataset_test.csv')

# Stack the splits in train -> dev -> test order so the text features are
# built over one frame; the rows are separated again by position later on.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# ignore_index=True gives the stacked frame a unique 0..n-1 index rather than
# three repeated 0..k ranges, which keeps later column-wise concats unambiguous.
dataset = pd.concat([data_train, data_dev, data_test], ignore_index=True)

# Remember the split sizes: they are the positional boundaries used to cut
# `dataset` back into train / dev / test.
train_len = len(data_train)
dev_len = len(data_dev)

# The train and dev frames are now held inside `dataset`; drop the originals.
# `data_test` is kept because it is joined with the predictions at the end.
del data_train
del data_dev

# select only relevant columns
#dataset = dataset[["review", "rating"]]

# Create doc2vec vector columns: one small dense embedding per review.
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise every review once (plain split on single spaces) and reuse the
# tokens for both training and inference instead of splitting twice.
tokenized_reviews = [review.split(" ") for review in dataset["review"]]

# Each document is tagged with its row position in `dataset`.
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized_reviews)]

# Train a Doc2Vec model on our text data.
# NOTE(review): `dataset` holds train, dev and test rows at this point, so the
# embedding model sees dev/test text while training -- confirm this is intended.
# NOTE(review): no seed is set and workers=4, so the vectors presumably differ
# from run to run -- verify against the gensim docs if reproducibility matters.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# Infer one vector per review and build the feature frame in a single step.
# Building the frame from the list of vectors avoids `.apply(pd.Series)`,
# which constructs a Series object for every row.
doc2vec_vectors = [model.infer_vector(tokens) for tokens in tokenized_reviews]
doc2vec_df = pd.DataFrame(doc2vec_vectors, index=dataset.index).add_prefix("doc2vec_vector_")

# Append the embedding columns next to the original columns.
dataset = pd.concat([dataset, doc2vec_df], axis=1)


# Add TF-IDF columns.
# TF counts how often a word appears in a text; IDF down-weights words that
# occur in many texts. Only words found in at least 10 different texts get a
# column (min_df=10), which filters rare words and limits the output width.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 10)

# NOTE(review): the vectorizer is fitted on train, dev and test rows together
# (`dataset` holds all three here) -- confirm this is intended.
# .toarray() materialises the full dense matrix (rows x vocabulary).
tfidf_result = tfidf.fit_transform(dataset["review"]).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_vocabulary = tfidf.get_feature_names_out()
else:
    tfidf_vocabulary = tfidf.get_feature_names()

# Prefix each vocabulary term with "word_" and label the rows with the same
# index as `dataset` so the column-wise concat lines up.
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(x) for x in tfidf_vocabulary],
    index=dataset.index,
)
dataset = pd.concat([dataset, tfidf_df], axis=1)

# Feature selection: every column except the target, the id and the raw text
# is used as a model input.
label = "rating"
ignore_cols = [label, "id", "review"]
features = [column for column in dataset.columns if column not in ignore_cols]

from sklearn.ensemble import RandomForestClassifier

# Cut the stacked frame back into its original splits by row position:
# the first `train_len` rows are train, the next `dev_len` rows are dev and
# whatever remains is test.
dev_end = train_len + dev_len
feature_frame = dataset[features]
target = dataset[label]

X_train = feature_frame.iloc[:train_len]
X_dev = feature_frame.iloc[train_len:dev_end]
X_test = feature_frame.iloc[dev_end:]

# Targets are only taken for train and dev.
y_train = target.iloc[:train_len]
y_dev = target.iloc[train_len:dev_end]

# Drop the helper names and the combined frame once the splits exist.
del dev_end, feature_frame, target
del dataset


# The classification model used here is logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Build the classifier (up to 600 solver iterations) and fit it on the
# training split.
clf = LogisticRegression(max_iter=600)
clf = clf.fit(X_train, y_train)

# Predict ratings for the dev split (used for scoring) and for the test split
# (reported below).
y_dev_pred = clf.predict(X_dev)
y_test_pred = clf.predict(X_test)

# Model accuracy: the fraction of dev rows whose predicted rating matches.
print("Accuracy:", metrics.accuracy_score(y_dev, y_dev_pred))

# Put the predictions next to the original test rows; both sides are joined
# on their row index.
predicted_ratings = pd.DataFrame({'predictated_rating': y_test_pred})
data_test_result = pd.concat([data_test, predicted_ratings], axis=1)
print("Test Result Data with Predictated Ratings:")
print(data_test_result)
# To save the result as CSV:
# data_test_result.to_csv(file_name, sep=',', encoding='utf-8')




Accuracy: 0.7351927295518647
Test Result Data with Predictated Ratings:
        id  ... predictated_rating
0        0  ...                3.0
1        1  ...                4.0
2        3  ...                2.0
3        4  ...                1.0
4        6  ...                4.0
...    ...  ...                ...
3317  3812  ...                4.0
3318  3813  ...                1.0
3319  3814  ...                4.0
3320  3815  ...                1.0
3321  3817  ...                5.0

[3322 rows x 3 columns]
