# Goal 
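Extend the [previous sentiment analysis](https://github.com/MMaggieZhou/sentiment_analysis/blob/main/twitter_entity_sentiment_analysis.ipynb) (v1) by adding each tweet's `entity` as a model feature alongside the text, then compare the resulting scores with the v1 results.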

In [1]:
import joblib
import pandas as pd
import spacy
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

# Load, Clean and Preprocess Data (same as v1)

In [4]:
def load_data(file):
    """Read a header-less CSV of tweets into a DataFrame indexed by ``id``.

    Parameters
    ----------
    file : path or file-like accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity``, ``label`` and ``text``; the ``id`` column becomes
        the index.
    """
    column_names = ['id', 'entity', 'label', 'text']
    frame = pd.read_csv(file, names=column_names)
    return frame.set_index('id')

nlp = spacy.load("en_core_web_sm")
def preprocesss_text(text):
    """Return ``text`` with stop words removed and the rest lemmatized.

    The text is run through the spaCy pipeline held in ``nlp``. Every token
    not flagged as a stop word contributes its lemma, and the lemmas are
    joined with single spaces.
    """
    lemmas = [tok.lemma_ for tok in nlp(text) if not tok.is_stop]
    return ' '.join(lemmas)
    
label_encoder = LabelEncoder()
def preprocess(df, training):
    """Add numeric labels and cleaned text to a tweet DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column and a ``text`` column.
    training : bool
        If true, the shared ``label_encoder`` is (re)fitted on ``df['label']``.
        Otherwise the already-fitted encoder is reused, so a call with
        ``training=True`` must have happened first.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns, ``label_num`` (encoded label) and
        ``text_processed`` (output of ``preprocesss_text``), with duplicate
        rows and rows containing missing values removed.

    Notes
    -----
    The input frame is left untouched: the new columns are built on a copy
    via ``DataFrame.assign`` rather than written into ``df``. This keeps the
    caller's frame and the returned frame from silently diverging.
    """
    # Pick fit_transform vs. transform once, so the column construction below
    # is identical for the training and test paths.
    encode = label_encoder.fit_transform if training else label_encoder.transform
    enriched = df.assign(
        label_num=encode(df['label']),
        text_processed=df['text'].apply(preprocesss_text),
    )
    return enriched.drop_duplicates().dropna()

# Training data must be processed first: preprocess(training=True) fits the
# shared label encoder that the test split then reuses.
training_df = (
    load_data("~/sentiment_analysis/archive/twitter_training.csv")
    .drop_duplicates()
    .dropna()
    .pipe(preprocess, training=True)
)
test_df = (
    load_data("~/sentiment_analysis/archive/twitter_validation.csv")
    .drop_duplicates()
    .dropna()
    .pipe(preprocess, training=False)
)

In [5]:
# Persist the preprocessed frames so the spaCy pass does not have to be repeated.
training_df.to_csv(path_or_buf="~/sentiment_analysis/archive/training_preprocessed.csv")
test_df.to_csv(path_or_buf="~/sentiment_analysis/archive/test_preprocessed.csv")

In [6]:
# Print a summary of both preprocessed frames (entry counts, columns,
# non-null counts, dtypes and memory usage) as a sanity check.
training_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70958 entries, 2401 to 9200
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   entity          70958 non-null  object
 1   label           70958 non-null  object
 2   text            70958 non-null  object
 3   label_num       70958 non-null  int64 
 4   text_processed  70958 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.2+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 3364 to 6960
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   entity          1000 non-null   object
 1   label           1000 non-null   object
 2   text            1000 non-null   object
 3   label_num       1000 non-null   int64 
 4   text_processed  1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 46.9+ KB


In [7]:
# TF-IDF encode the cleaned text. The vocabulary is learned from the training
# split only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer()
training_text_encoded = vectorizer.fit_transform(
    raw_documents=training_df['text_processed']
)
test_text_encoded = vectorizer.transform(
    raw_documents=test_df['text_processed']
)

# Encode Entity

In [8]:
# One-hot encode the entity column (OneHotEncoder is imported in the top
# import cell). The encoder is fitted on the training entities only;
# handle_unknown='ignore' lets transform() accept entities that were not
# seen during fitting instead of raising.
oh_encoder = OneHotEncoder(handle_unknown='ignore')
# fit_transform replaces the separate fit() + transform() on the same data.
training_entity_encoded = oh_encoder.fit_transform(training_df[['entity']])
test_entity_encoded = oh_encoder.transform(test_df[['entity']])

# Combine Entity Encoding and Text Encoding

In [9]:
# Concatenate the text features and the entity features column-wise
# (hstack is imported in the top import cell).
# hstack returns COO format by default; CSR is requested explicitly so the
# matrices are not re-converted on every fit()/score() call downstream.
training_X = hstack([training_text_encoded, training_entity_encoded], format='csr')
test_X = hstack([test_text_encoded, test_entity_encoded], format='csr')

# Model Training and Evaluation

In [12]:
# Candidate classifiers, keyed by the display name used when reporting scores.
models = dict(
    # NaiveBayes=MultinomialNB(),  # left disabled, as in the original cell
    LogisticRegression=LogisticRegression(max_iter=1000),
    SVM=SVC(kernel='linear', C=1),
)

In [13]:
# Fit every candidate on the combined text + entity training features.
for model in models.values():
    model.fit(X=training_X, y=training_df['label_num'])

In [15]:
# Report each fitted model's score on both splits, scaled to a percentage.
# The printed "(bias)" / "(variance)" tags label the training and testing
# scores respectively.
for name, model in models.items():
    training_score, testing_score = (
        model.score(features, frame['label_num'])
        for features, frame in ((training_X, training_df), (test_X, test_df))
    )
    print(
        name,
        ' training score (bias): ', training_score * 100,
        '; testing score (variance): ', testing_score * 100,
    )

LogisticRegression  training score (bias):  86.36376448039685 ; testing score (variance):  90.10000000000001
SVM  training score (bias):  89.94757462160715 ; testing score (variance):  93.30000000000001


**Results from v1**

LogisticRegression  training score (bias):  85.8972913554497 ; testing score (variance):  90.5

SVM  training score (bias):  89.15132895515657 ; testing score (variance):  93.60000000000001