## 4.1.4 Logistic Regression Example with TF-IDF

### TF-IDF Feature Example

In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Reproducibility and evaluation settings.
RANDOM_SEED = 42  # seed passed to train_test_split
TEST_SPLIT = 0.2  # fraction of the labeled data held out for evaluation

# Filesystem locations, relative to the working directory.
DEFAULT_PATH = './data_in/'  # input .tsv files are read from here
DATA_OUT_PATH = './data_out/'  # the prediction CSV is written here

In [None]:
# Load the labeled training set: tab-separated, first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
train = pd.read_csv(
    DEFAULT_PATH + "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
# Pull the review texts and their labels out of the DataFrame as plain
# Python lists.
reviews = train['review'].tolist()
sentiments = train['sentiment'].tolist()

In [None]:
# TF-IDF over character n-grams of length 1 to 3, with sublinear
# term-frequency scaling and the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    sublinear_tf=True,
    min_df=0.0,
    max_features=5000,
)

# Learn the vocabulary from the reviews and vectorize them in one step.
X = vectorizer.fit_transform(reviews)
# Labels as a NumPy array, aligned row-for-row with X.
y = np.array(sentiments)

In [None]:
# Names of the learned TF-IDF features, as a list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns an ndarray, so it is
# converted back to a list. The fallback keeps older releases working.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [None]:
# Show the dimensions of the TF-IDF matrix.
print(X.shape)

In [None]:
# Hold out TEST_SPLIT of the samples for evaluation; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [None]:
# Logistic-regression classifier; class_weight='balanced' weights each class
# inversely to its frequency in the training labels.
lgs = LogisticRegression(class_weight='balanced') 
# Fit on the training portion of the split only.
lgs.fit(X_train, y_train) 

In [None]:
# Hard class predictions for the held-out split.
predicted = lgs.predict(X_test)

In [None]:
# ROC curve points from the predicted probability in column 1 of
# predict_proba (presumably the positive class, assuming labels are 0/1 --
# confirm against the dataset). The thresholds are not needed and discarded.
fpr, tpr, _ = metrics.roc_curve(
    y_test,
    lgs.predict_proba(X_test)[:, 1],
)

In [None]:
# Area under the ROC curve described by the (fpr, tpr) points.
auc = metrics.auc(fpr, tpr)

In [None]:
# Report evaluation metrics on the held-out split.
# Accuracy is computed from the existing `predicted` array rather than via
# lgs.score(), which would run a second prediction pass over X_test; this
# also keeps every metric below derived from the same predictions.
print("Accuracy: %f" % metrics.accuracy_score(y_test, predicted))
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)

In [None]:
# Load the test set: tab-separated, first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
test = pd.read_csv(
    DEFAULT_PATH + "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
# Vectorize the test reviews with the already-fitted vectorizer (transform
# only, no re-fit).
testDataVecs = vectorizer.transform(test['review'])

In [None]:
# Predicted sentiment label for every test review.
test_predicted = lgs.predict(testDataVecs)

In [None]:
# Create the output directory if it does not exist yet. exist_ok=True
# replaces the separate os.path.exists() check, which could race with another
# process creating the directory between the check and the makedirs call.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test review: its id and the predicted sentiment label.
answer_dataset = pd.DataFrame({'id': test['id'], 'sentiment': test_predicted})
# quoting=3 is csv.QUOTE_NONE: fields are written without added quote characters.
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)