##### Your task is to create a baseline classification model by adapting the "Practical Text Classification With Python and Keras" tutorial (modifying it where needed) into a running Jupyter Notebook.

In [1]:
import pandas as pd

In [2]:
# Location of each labelled sentence file, keyed by the name of its source.
file_path_dict = dict(
    yelp='data/sentiment_analysis/yelp_labelled.txt',
    amazon='data/sentiment_analysis/amazon_cells_labelled.txt',
    imdb='data/sentiment_analysis/imdb_labelled.txt',
)

In [3]:
# Read every tab-separated file as (sentence, label) rows, tag each row with
# the source it was loaded from, and collect the per-source frames.
df_list = []
for source, filepath in file_path_dict.items():
    df_list.append(
        pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
          .assign(source=source)
    )

# Stack the per-source frames; each frame keeps its own row index, so index
# values repeat across sources.
df = pd.concat(df_list)
print(df)

                                              sentence  label source
0                             Wow... Loved this place.      1   yelp
1                                   Crust is not good.      0   yelp
2            Not tasty and the texture was just nasty.      0   yelp
3    Stopped by during the late May bank holiday of...      1   yelp
4    The selection on the menu was great and so wer...      1   yelp
..                                                 ...    ...    ...
743  I just got bored watching Jessice Lange take h...      0   imdb
744  Unfortunately, any virtue in this film's produ...      0   imdb
745                   In a word, it is embarrassing.        0   imdb
746                               Exceptionally bad!        0   imdb
747  All in all its an insult to one's intelligence...      0   imdb

[2748 rows x 3 columns]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Two toy sentences used only to illustrate how a bag-of-words vocabulary is built.
sentences = ['John likes ice cream', 'John hates chocolate.']

# CountVectorizer takes the words of each sentence and builds a vocabulary of
# all the unique words it finds.
# min_df must be an int >= 1 or a float in [0.0, 1.0]; newer scikit-learn
# releases reject the integer 0, so use 1, which still keeps every word
# because each vocabulary word occurs in at least one sentence.
# lowercase=False keeps the original casing of each word.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [5]:
# Encode the demo sentences with the fitted vocabulary and show the result as
# a dense array: one row per sentence, one column per vocabulary word.
(
    vectorizer
    .transform(sentences)
    .toarray()
)

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

### Defining a Baseline Model


##### Split the data into training and testing sets to evaluate the model's accuracy

In [6]:
from sklearn.model_selection import train_test_split

# Yelp only: keep the rows loaded from the yelp file and pull out the
# sentences and their labels as arrays.
df_yelp = df.loc[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

# Hold out 25% of the yelp sentences for testing; the fixed random_state
# makes the split repeatable.
(sentences_train,
 sentences_test,
 y_train,
 y_test) = train_test_split(sentences, y, random_state=1000, test_size=0.25)


# ------------------- imdb section


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training sentences only, then encode both
# splits with that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)
X_train, X_test = map(vectorizer.transform, (sentences_train, sentences_test))
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline on the vectorized training sentences and
# report its accuracy on the held-out test split.
# X_train and X_test are the matrices built by the vectorizer in the previous
# cell, so they are not recomputed here.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.796


In [9]:
# Repeat the baseline for every source in the combined frame:
# split, vectorize, fit, and print the test accuracy.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    # Same 25% hold-out and the same seed for every source.
    (sentences_train, sentences_test,
     y_train, y_test) = train_test_split(
        sentences, y, random_state=1000, test_size=0.25)

    # The vocabulary is learned from the training sentences only.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train, X_test = map(vectorizer.transform, (sentences_train, sentences_test))

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
