In [4]:
import re
import pandas
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

nltk.download("stopwords")
nltk.download('omw-1.4')
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("word_tokenize")
nltk.download('averaged_perceptron_tagger')


def data_cleaning(review):
    review = review.lower()
    review = re.sub(r'\W', ' ', review)
    review = re.sub(r'\s+', ' ', review)
    review = re.sub(r'[0-9]+', ' ', review)
    words = nltk.word_tokenize(review)
    new_words = ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english'))
    return new_words


lemmatizer = WordNetLemmatizer()


train_data = pandas.read_table("train_file.dat", sep="\t", names = ['Point', 'Review'])
train_data['Review'].fillna('', inplace = True)
with open("1675140109_010778_1567602457_126649_test.dat") as file:
    test = file.read()
test_data = pandas.DataFrame(test.splitlines(), columns = ['Comment'])


train_data['Review'] = train_data['Review'].apply(data_cleaning)
test_data['Comment'] = test_data['Comment'].apply(data_cleaning)


x_train, x_test, y_train, y_test = train_test_split(train_data["Review"], train_data["Point"], test_size=0.25, random_state=0)


tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
count_vec = CountVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')


# with Count Vectorization
X_train_bow = count_vec.fit_transform(x_train)
#X_test_bow = count_vec.transform(x_test)
X_test_bow = count_vec.transform(test_data['Comment'])
model_lg = LogisticRegression()
model_lg.fit(X_train_bow, y_train)
predict = model_lg.predict(X_test_bow)
#print("Accuracy Using Count Vectorization: ", str(metrics.accuracy_score(y_test,predict)))


# with TF-IDF
X_train_bow = tfidf_vec.fit_transform(x_train)
#X_test_bow = tfidf_vec.transform(x_test)
X_test_bow = tfidf_vec.transform(test_data['Comment'])
model_lg = LogisticRegression()
model_lg.fit(X_train_bow, y_train)
predict = model_lg.predict(X_test_bow)
#print("Accuracy Using TF-IDF: ", str(metrics.accuracy_score(y_test,predict)))


final_predictions = pandas.DataFrame({"Ratings":predict})
final_predictions["Ratings"].to_csv("Final.csv", header = False, index = False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver optio