In [22]:
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [23]:
# NOTE: despite the variable name, this object is a WordNet lemmatizer.
stemmer = WordNetLemmatizer()

# Regular expressions used by the text-cleaning steps
NEWLINE_RE = r"<\w{1,}\s{1,}/>"  # self-closing HTML tags such as "<br />"
NOT_WORD = r"\W"  # any single non-word character (punctuation, symbols, whitespace)
# A single letter at the very start of the text, followed by whitespace.
# The caret must stay unescaped: r"\^" would match a literal "^" character
# rather than anchoring the pattern at the start of the string.
SINGLE_CHAR = r"^[a-zA-Z]\s+"
MULT_SPACE = r"\s+"  # runs of whitespace, meant to be collapsed into one space

# Read data: labelled training reviews, unlabelled test reviews and the
# submission template that will receive the predictions.
data = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')
submission = pd.read_csv('submission.csv')

In [24]:
# Remove redundant substrings
def cleaner(string, regex, subst):
    """Return `string` with every match of `regex` replaced by `subst`."""
    return re.sub(regex, subst, string)

def less3(string):
    """Drop every token shorter than three characters.

    Tokens are delimited by a single space character only; the surviving
    tokens are joined back together with single spaces.
    """
    kept = [token for token in string.split(" ") if len(token) >= 3]
    return " ".join(kept)

def stm(string, stemmer):
    """Lowercase `string` and lemmatize each space-separated token.

    `stemmer` is any object exposing a `lemmatize(word)` method; the
    lemmatized tokens are joined back together with single spaces.
    """
    tokens = string.lower().split(" ")
    return " ".join(map(stemmer.lemmatize, tokens))

# Run the same cleaning pipeline over the training and the test reviews:
# replace HTML tags, then non-word characters, then whitespace runs with a
# single space, drop tokens shorter than three characters, and finally
# lowercase and lemmatize what is left.
for frame in (data, test):
    reviews = frame["review"]
    for pattern in (NEWLINE_RE, NOT_WORD, MULT_SPACE):
        reviews = reviews.apply(cleaner, args=(pattern, " "))
    reviews = reviews.apply(less3)
    frame["review"] = reviews.apply(stm, args=(stemmer,))

# Features and target. The review column is selected directly instead of
# going through DataFrame.drop(labels, 1): passing `axis` positionally is
# rejected by pandas >= 2.0, and the drop was redundant anyway.
X = data["review"]
Y = data["sentiment"]
X_sub = test['review']  # for future submission

In [25]:
# Vectorization data
# TF-IDF over word 1- to 4-grams; a term is kept only if it occurs in at
# least 2 documents and in at most 59% of them.
# NOTE(review): the vectorizer is fitted on all labelled reviews before the
# train/hold-out split below, so the hold-out rows contribute to the
# vocabulary and IDF weights and the hold-out scores may be optimistic.
vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=2, max_df=0.59)
X = vectorizer.fit_transform(X)
X_sub = vectorizer.transform(X_sub) # for future submission
# Rescale every row by its maximum absolute value.
X, X_sub = normalize(X, norm = "max"), normalize(X_sub, norm = "max")
# A fixed seed makes the 90/10 split, and therefore the reported scores,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)


In [26]:
# Logistic Regression
# Fit on the training split, report accuracy and F1 on the hold-out split,
# and predict labels for the submission set.
model = LogisticRegression(random_state = 0, solver = "sag", max_iter = 1000, n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print(f'Logistic Regression score -- > {score}')
y_pred = model.predict(x_test)
y_pred_sub = model.predict(X_sub)
# f1_score takes (y_true, y_pred): ground truth first, predictions second.
print(f'Logistic Regression F1 score -- > {f1_score(y_test, y_pred)}')
print('-' * 50)

Logistic Regression score -- > 0.908057590300581
Logistic Regression F1 score -- > 0.9094977623073098
--------------------------------------------------


In [27]:
# Store the Logistic Regression predictions in the submission frame and
# save it as results.csv without the index column.
submission['sentiment'] = y_pred_sub
submission.to_csv(path_or_buf='results.csv', index=False)