In [16]:
import pandas as pd

In [17]:
filepath_dict = {'yelp': 'data/yelp_labelled.txt',
                'amazon': 'data/amazon_cells_labelled.txt',
                'imdb': 'data/imdb_labelled.txt'}

In [18]:
filepath_dict['yelp']

'data/yelp_labelled.txt'

In [19]:
df_list = []

for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

In [20]:
df_list

[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label  source
 0    So there is no way for me to plug it in here i...      

In [21]:
df = pd.concat(df_list)

In [22]:
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
df_yelp = df[ df['source']=='yelp']

In [25]:
df_yelp.head(5)

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [27]:
X = df_yelp['sentence'].values

In [28]:
Y = df_yelp['label'].values

In [31]:
X[:5]

array(['Wow... Loved this place.', 'Crust is not good.',
       'Not tasty and the texture was just nasty.',
       'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
       'The selection on the menu was great and so were the prices.'],
      dtype=object)

In [32]:
Y[:5]

array([1, 0, 0, 1, 1])

In [33]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

In [35]:
x_train[:10]

array(['We will not be coming back.',
       'We waited for forty five minutes in vain.',
       "I could barely stomach the meal, but didn't complain because it was a business lunch.",
       "I'd rather eat airline food, seriously.",
       "Needless to say, I won't be going back anytime soon.",
       'For that price I can think of a few place I would have much rather gone.',
       'To my disbelief, each dish qualified as the worst version of these foods I have ever tasted.',
       "I promise they won't disappoint.",
       'The decor is nice, and the piano music soundtrack is pleasant.',
       'Will go back next trip out.'], dtype=object)

## Shape the data in an acceptable shape by Regression Model

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
vectorizer = CountVectorizer()

In [39]:
vectorizer.fit(x_train)

CountVectorizer()

In [40]:
transformed_x_train = vectorizer.transform(x_train)

In [41]:
transformed_x_test = vectorizer.transform(x_test)

In [58]:
transformed_x_train

<750x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 7422 stored elements in Compressed Sparse Row format>

## Train/classify the data using Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
# our model
classifier = LogisticRegression()

In [64]:
classifier.fit(transformed_x_train, y_train)

LogisticRegression()

In [65]:
score = classifier.score(transformed_x_test, y_test)

In [66]:
score

0.808

## Apply the above on All type of data (yelp, amazon, and imdb)