In [1]:
# Read data into Python
import pandas as pd

reviews_dataset = pd.read_csv('Dataset.txt', names=['sentence', 'label'], sep="\t")

In [2]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

sentences = reviews_dataset['sentence']
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.transform(sentences)

<748x3324 sparse matrix of type '<class 'numpy.int64'>'
	with 11567 stored elements in Compressed Sparse Row format>

In [3]:
# Defining a baseline model
sentences = reviews_dataset['sentence'].values
y = reviews_dataset['label'].values


In [4]:
# Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, y, train_size=0.75)

In [5]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)


CountVectorizer()

In [6]:
# Pre-processing
import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*?><br\s*/>)|(\-)|(\/)")


def preprocessing_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]

    return reviews


sentences_train = preprocessing_reviews(X_train)
sentences_test = preprocessing_reviews(X_test)

In [7]:
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

In [8]:
# The Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

for c in [0.01, 0.05, 0.25, 0.5, 1, 2, 3, 5, 10]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s" % (c, accuracy_score(y_test, lr.predict(X_test))))


Accuracy for C=0.01: 0.6577540106951871
Accuracy for C=0.05: 0.7272727272727273
Accuracy for C=0.25: 0.7540106951871658
Accuracy for C=0.5: 0.7593582887700535
Accuracy for C=1: 0.7754010695187166
Accuracy for C=2: 0.7647058823529411
Accuracy for C=3: 0.7593582887700535
Accuracy for C=5: 0.7647058823529411
Accuracy for C=10: 0.7486631016042781
