## 4.1.4 Logistic Regression Example with TF-IDF

### TF-IDF Feature Example

In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Directories: input CSVs are read from DATA_IN_PATH, results are written
# under DATA_OUT_PATH.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File name of the training CSV, relative to DATA_IN_PATH.
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Fraction of samples held out for evaluation, and the seed that makes the
# train/eval split repeatable.
TEST_SPLIT = 0.2
RANDOM_SEED = 42

In [None]:
# Load the training CSV into a DataFrame. os.path.join is used instead of
# string concatenation so the path stays valid even if DATA_IN_PATH is
# written without a trailing slash.
train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))

In [None]:
# Extract the two columns used for modelling as plain Python lists:
# the review text (model input) and the sentiment label (target).
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [None]:
# Character-level TF-IDF over 1- to 3-character n-grams, with sublinear
# term-frequency scaling and the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.0,
    sublinear_tf=True,
    max_features=5000,
)

# Learn the vocabulary from the training reviews and vectorise them in one
# pass; the labels become a NumPy array for scikit-learn.
X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [None]:
# Bare expression: as the last statement of a notebook cell this displays
# the repr of the TF-IDF feature matrix built above.
X

In [None]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so
# .tolist() keeps `features` a plain list of str as before. The fallback
# branch keeps the cell working on scikit-learn releases older than 1.0.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [None]:
# Hold out TEST_SPLIT of the samples for evaluation; the fixed seed keeps
# the partition identical from run to run.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [None]:
# Logistic-regression classifier; class_weight="balanced" weights each class
# inversely to its frequency in y_train.
lgs = LogisticRegression(class_weight="balanced")

# Fit on the training split (kept as its own statement so the notebook still
# shows the fitted estimator as the cell output).
lgs.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out evaluation split.
predicted = lgs.predict(X_eval)

In [None]:
# Mean accuracy of the fitted model on the held-out evaluation split.
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

In [None]:
# File name of the test CSV, relative to DATA_IN_PATH.
TEST_CLEAN_DATA = 'test_clean.csv'

# Load the test set. os.path.join is used instead of string concatenation so
# the path stays valid even if DATA_IN_PATH is written without a trailing
# slash.
test_data = pd.read_csv(os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA))

In [None]:
# Vectorise the test reviews with the already-fitted vectorizer (transform
# only, no refit), so the columns line up with the training features.
test_reviews = test_data['review']
testDataVecs = vectorizer.transform(test_reviews)

In [None]:
# Predict a sentiment label for every vectorised test review and print the
# resulting array as a quick sanity check.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

In [None]:
# Create the output directory if it is missing. exist_ok=True replaces the
# separate os.path.exists() check, which left a window in which another
# process could create the directory and make makedirs() raise.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test review: its id and the predicted sentiment label.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})

# quoting=3 is csv.QUOTE_NONE: fields are written exactly as they are, with
# no quote characters added by the writer.
answer_dataset.to_csv(
    os.path.join(DATA_OUT_PATH, 'lgs_tfidf_answer.csv'),
    index=False,
    quoting=3,
)