# Text Classification

In [1]:
import pandas as pd

In [4]:
# Each review source maps to a tab-separated file with one
# "<sentence>\t<label>" row per line and no header row.
dict_filepath = {
    'yelp': 'sentiment_sentences/yelp_labelled.txt',
    'amazon': 'sentiment_sentences/amazon_cells_labelled.txt',
    'imdb': 'sentiment_sentences/imdb_labelled.txt',
}

# Read every file, tag its rows with the source name, and collect the frames.
df_list = []
for company, filepath in dict_filepath.items():
    df_list.append(
        pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
        .assign(company=company)
    )

# Stack the per-source frames into one. Each file keeps its own 0..n-1 row
# labels, so index labels repeat across sources in the combined frame.
df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
company                         yelp
Name: 0, dtype: object


In [5]:
# Preview the first rows of the combined frame (sentence, label, company).
df.head()

Unnamed: 0,sentence,label,company
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [6]:
# Summarise the combined frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2748 entries, 0 to 747
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  2748 non-null   object
 1   label     2748 non-null   int64 
 2   company   2748 non-null   object
dtypes: int64(1), object(2)
memory usage: 85.9+ KB


## How We Would Predict The Data


'''
A collection of texts is called a corpus.

In a feature vector, each dimension can be a numeric or categorical feature: for example, the height of a building, the price of a stock, or, in our case, the count of a word in a vocabulary. These feature vectors are a crucial piece of data science and machine learning, because the model you want to train depends on them.
'''

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Two toy sentences used to illustrate the bag-of-words representation.
sentences = ['John likes ice cream', 'John hates chocolate.']

# lowercase=False keeps tokens in their original case (e.g. "John").
# min_df is left at its default of 1: an integer min_df of 0 is rejected by the
# parameter validation in recent scikit-learn releases, and a threshold of one
# document keeps every token, which is what min_df=0 was meant to achieve.
vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(sentences)

# Mapping of each token to its column index in the feature vectors.
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [8]:
# Resulting feature vectors for each sentence, based on the previous
# vocabulary: one row per sentence, one column per vocabulary entry.
#
# For example, if you take a look at the first item, you can see that both
# vectors have a 1 there. This means that both sentences have one occurrence
# of John, which is in the first place in the vocabulary.
#
# The array expression is deliberately the last statement of the cell so that
# the notebook displays the array itself (a trailing string literal would be
# displayed instead).
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

## Defining a Baseline Model

Overfitting occurs when a model fits the training data too closely. You want to avoid overfitting, because it means the model has mostly just memorized the training data. The symptom is a high accuracy on the training data but a low accuracy on the testing data.

In [15]:
from sklearn.model_selection import train_test_split

# Restrict the baseline experiment to the Yelp reviews.
df_yelp = df[df['company'] == 'yelp']

# .values returns a NumPy array instead of a pandas Series object, which is
# easier to work with in this context.
sentences = df_yelp['sentence'].values

# Excerpt of array contents pasted from an earlier run (presumably the tail of
# `sentences`); kept as a comment instead of a stray string literal:
#    'the presentation of the food was awful.',
#    "I can't tell you how disappointed I was.",
#    'I think food should have flavor and texture and both were lacking.',
#    'Appetite instantly gone.',
#    'Overall I was not impressed and would not go back.',
#    "The whole experience was underwhelming, and I think we'll just go to Ninja Sushi next time.",
#    "Then, as if I hadn't wasted enough of my life there, they poured salt in the wound by drawing out the time it took to bring the check."]


In [16]:
# Labels for the Yelp rows as a NumPy array, taken from the same frame as
# `sentences` so the two arrays line up row for row.
y = df_yelp['label'].values

In [22]:
# Re-inspect the combined frame; this is the same call as the earlier
# df.info() cell and reports on `df`, not on the Yelp subset.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2748 entries, 0 to 747
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  2748 non-null   object
 1   label     2748 non-null   int64 
 2   company   2748 non-null   object
dtypes: int64(1), object(2)
memory usage: 85.9+ KB


In [19]:
# Hold out 25% of the sentences (and their labels) for evaluation; the fixed
# random_state makes the split reproducible.
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(
    sentences, y, random_state=1000, test_size=0.25)

### Vectorize and Train

In [20]:
# Learn the vocabulary from the training sentences only, then encode both
# splits with it so that train and test share the same feature columns.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Display the training matrix. It is kept as the last expression so the cell
# shows the sparse-matrix summary (previously recorded as a 750x1714 matrix
# with 7368 stored elements) rather than a trailing string literal.
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

 ## A SIMPLE MODEL

In [24]:
from sklearn.linear_model import LogisticRegression

In [27]:
    # Fit a logistic regression on the bag-of-words training features, then
    # report the score it achieves on the held-out test split.
    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print("Accuracy: ", score)


Accuracy:  0.796


In [28]:
# Repeat the baseline for every data source: split, vectorise, fit, score.
# NOTE: the loop reassigns the notebook-level names (sentences, y, X_train,
# X_test, vectorizer, classifier, ...), so after it finishes they hold the
# values from the last source processed.
for company in df['company'].unique():
    # Rows belonging to the current source, as NumPy arrays.
    df_company = df.loc[df['company'] == company]
    sentences = df_company['sentence'].values
    y = df_company['label'].values

    (sentences_train, sentences_test,
     y_train, y_test) = train_test_split(
        sentences, y, random_state=1000, test_size=0.25)

    # The vocabulary is learned from the training split only and then reused
    # to encode the test split.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(company, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


# A Primer on (Deep) Neural Networks