# Sentiment Analyzer

In [None]:
Based on [Practical Text Classification With Python and Keras](https://realpython.com/python-keras-text-classification/)

In [None]:
import pandas as pd

In [None]:
# One tab-separated file of (sentence, label) rows per review source.
filepath_dict = {
    'yelp': 'data/sentiment_analysis/yelp_labelled.txt',
    'amazon': 'data/sentiment_analysis/amazon_cells_labelled.txt',
    'imdb': 'data/sentiment_analysis/imdb_labelled.txt',
}

# Read every file and tag its rows with the source name, so the combined
# frame can later be filtered per data set.
df_list = []
for source, filepath in filepath_dict.items():
    df_list.append(
        pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
        .assign(source=source)
    )

df = pd.concat(df_list)
df.head()

## Text has to be "vectorized" (turned into numeric vectors) before a model can work with it efficiently.

In [None]:
# Two toy sentences used to illustrate how vectorization works.
sentences = ['John likes ice cream', 'John hates chocolate.']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the toy sentences; case is preserved
# (lowercase=False) and every term seen at least once is kept (min_df=1).
vectorizer = CountVectorizer(min_df=1, lowercase=False).fit(sentences)

# Mapping of each learned term to its column index in the feature matrix.
vectorizer.vocabulary_

In [None]:
# transform() returns a sparse matrix, so this cell shows a summary repr
# rather than the individual counts.
vectorizer.transform(sentences)

In [None]:
# Convert the sparse result to a dense array to see the actual counts:
# one row per sentence, one column per vocabulary term.
vectorizer.transform(sentences).toarray()

Let's write a small function in plain Python to see how the pieces fit together.

In [None]:
def report(vectorizer, sentences):
    """Fit ``vectorizer`` on ``sentences`` and print a per-sentence vocabulary report.

    For every sentence, each vocabulary term is listed in column-index order
    and marked "found" when its count in that sentence is non-zero, and
    "not found" otherwise.

    Args:
        vectorizer: Object exposing ``fit``, ``transform`` and a
            ``vocabulary_`` mapping of term -> column index (for example a
            ``CountVectorizer``). It is fitted in place.
        sentences: Sequence of strings to fit on and to report about.

    Returns:
        None. The report is written to standard output.
    """
    vectorizer.fit(sentences)

    # Invert term -> column into column -> term once, so each column is
    # resolved with a dict lookup instead of rescanning the whole vocabulary
    # for every column of every sentence.
    term_by_column = {
        column: term for term, column in vectorizer.vocabulary_.items()
    }

    counts = vectorizer.transform(sentences).toarray()

    for position, sentence in enumerate(sentences):
        print("\nSentence:", sentence, "\n")
        for column, count in enumerate(counts[position]):
            term = term_by_column.get(column)
            if term is None:
                # No vocabulary entry maps to this column; nothing to report.
                continue
            print("\t", term, ": found" if count else ": not found")
                        

# Run the report with a fresh CountVectorizer created with default settings.
report(CountVectorizer(), sentences)



As the vocabulary grows, far more of its words will be absent from any given sentence, so the vectors become mostly zeros (sparse).

## Let's move on to one of the full data sets

In [None]:
from sklearn.model_selection import train_test_split

# Restrict to the Yelp reviews and pull out the raw sentences and labels.
df_yelp = df.loc[df['source'] == 'yelp']

sentences, y = df_yelp['sentence'].values, df_yelp['label'].values

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

In [None]:
# Number of sentences in the training split.
len(sentences_train)

In [None]:
# Peek at the first four training sentences.
sentences_train[:4]

In [None]:
# Labels for the first four training sentences.
y_train[:4]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training sentences only, then encode both
# splits with that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train

In [None]:
# Test-set feature matrix, encoded with the vocabulary learned from the
# training sentences.
X_test

The vocabulary contains 1714 words, and each sentence uses only a few of them, so every sentence is encoded as a mostly-zero vector:

`sentence = [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...]`

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression is built on the S-shaped sigmoid curve:
#         ________
#        |
#        |
#  _______

# Fit on the training features, then measure accuracy on the held-out split.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

## Let's do all 3 data sets

In [None]:
# Repeat the split / vectorize / fit / score pipeline for each data source.
# NOTE: this loop rebinds sentences, y, the train/test splits, vectorizer,
# classifier and score, so after the cell runs they refer to the last source
# processed rather than to the Yelp-only values from the earlier cells.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # The vocabulary is learned from the training sentences only.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))