# Machine Learning

In [112]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_extraction.text import CountVectorizer

# Tab-separated "sentence<TAB>label" files, keyed by the site they came from.
filepath_dict = {
    'yelp': 'sentiment_labelled_sentences/yelp_labelled.txt',
    'amazon': 'sentiment_labelled_sentences/amazon_cells_labelled.txt',
    'imdb': 'sentiment_labelled_sentences/imdb_labelled.txt',
}

# Read each file and tag its rows with the source it was loaded from.
# NOTE(review): read_csv runs with its default quoting here; if any sentence
# contains a double quote, confirm that rows are not being merged.
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, sep='\t', names=['sentence', 'label']).assign(source=source)
    df_list.append(df)

# Stack the per-source frames; the index is not reset, so labels repeat.
df = pd.concat(df_list)
print(df.iloc[0])
df.head()

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [113]:
# Two toy sentences used to illustrate the bag-of-words vocabulary.
sentences = [
    'John likes ice cream',
    'John hates chocolate.',
]

In [114]:
# Build a case-sensitive vocabulary from the toy sentences.
# min_df=1 (the default) keeps every term that occurs in at least one document,
# which is the same vocabulary min_df=0 produced. Recent scikit-learn releases
# reject an integer min_df below 1 with InvalidParameterError.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [115]:
# Encode each sentence as one row of word counts; column j corresponds to the
# word whose vocabulary index is j.
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [116]:
# Restrict to the Yelp rows, then hold out 25% of them with a fixed seed.
df_yelp = df.loc[df['source'] == 'yelp']

# Raw text and labels as NumPy arrays.
sentences, y = df_yelp['sentence'].values, df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

In [117]:
# Learn the vocabulary from the training sentences only, then encode both
# splits with that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)

X_train, X_test = (
    vectorizer.transform(part) for part in (sentences_train, sentences_test)
)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [118]:


# Fit a logistic-regression baseline on the count features and report its
# accuracy on the held-out split.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.796


In [119]:
# Repeat the split / vectorize / fit / score pipeline once per source.
# Every name assigned here is notebook-level, so after the loop they hold the
# values from the last source processed.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # The vocabulary comes from this source's training split only.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train, X_test = (
        vectorizer.transform(part) for part in (sentences_train, sentences_test)
    )

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


# Stretch Goal

In [120]:
# Single hand-written sentence for the stretch-goal experiment.
# NOTE(review): 'each' looks like a typo for 'eat'; left untouched because the
# vocabulary shown below is built from this exact text.
sentences2 = [
    'I like Chocolate, But Dont each too much',
]

In [121]:
# Rebuild the vectorizer on the stretch-goal sentence, keeping letter case.
# min_df=1 (the default) keeps every term that occurs in at least one document,
# which is the same vocabulary min_df=0 produced. Recent scikit-learn releases
# reject an integer min_df below 1 with InvalidParameterError.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences2)
vectorizer.vocabulary_

{'like': 4,
 'Chocolate': 1,
 'But': 0,
 'Dont': 2,
 'each': 3,
 'too': 6,
 'much': 5}

In [122]:
# Dense count matrix for the stretch-goal sentence (one row per sentence).
a = vectorizer.transform(sentences2).toarray()
a

array([[1, 1, 1, 1, 1, 1, 1]])

In [123]:
# from collections import Counter
# import numpy as np
# # Counter(np.ndarray(a))
# Counter(a)

In [124]:
# Iterating the 2-D array yields its rows, so this is a list holding one
# row array rather than a flat list of counts.
list(a)

[array([1, 1, 1, 1, 1, 1, 1])]

In [125]:
def check(test):
    """Print 'good review' or 'bad review' for a vector of word counts.

    The decision compares how many entries equal 1 with how many equal 0:
    strictly more ones prints 'good review'; anything else, including a tie
    or an empty input, prints 'bad review'. Entries with any other value
    (for example a count of 2) are ignored by both tallies.

    Parameters
    ----------
    test : iterable of int
        One row of counts. Any one-dimensional iterable is accepted,
        including a NumPy row such as ``a[0]``, not only a ``list``.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # Materialise once so inputs that lack a .count method (NumPy arrays,
    # generators) are handled exactly like lists.
    counts = list(test)
    if counts.count(1) > counts.count(0):
        print('good review')
    else:
        print('bad review')

In [126]:
# a=vectorizer.transform(sentences2).toarray()
# Take the first row of the count matrix as a plain list and classify it.
li = list(a[0])
check(li)


good review


In [127]:
# Encode two new sentences with the vectorizer currently in scope and
# classify the first one.
# NOTE(review): only row 0 ('It was so BORING!') is passed to check(); the
# second sentence is transformed but never classified. Confirm this is intended.
sentences3 = [
    'It was so BORING!',
    'Horrible!',
]
test1 = vectorizer.transform(sentences3).toarray()
li2 = list(test1[0])
check(li2)

bad review
