# Lab: Machine Learning Intro

## Load the data with pandas:



In [48]:
import pandas as pd

# Tab-separated "sentence<TAB>label" files, keyed by the review source.
filepath_dict = {
    'yelp': 'files/yelp_labelled.txt',
    'amazon': 'files/amazon_cells_labelled.txt',
    'imdb': 'files/imdb_labelled.txt',
}

# NOTE(review): read_csv runs with its default quote handling. The IMDB
# training matrix recorded further down has 561 rows (about 748 sentences
# before the 75/25 split) versus 750 for Amazon -- possibly quote characters
# in the text are merging lines. Consider quoting=csv.QUOTE_NONE and re-run
# to confirm.
df_list = []
for source, filepath in filepath_dict.items():
    frame = pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
    # Tag every row with the source it was loaded from.
    df_list.append(frame.assign(source=source))

# Stack the per-source frames; each frame keeps its own 0-based row labels.
df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [49]:
# Two toy sentences used to illustrate how a bag-of-words vocabulary is built.
sentences = [
    'John likes ice cream',
    'John hates chocolate.',
]

## Create a feature vector of the word counts:

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary from the toy sentences. lowercase=False keeps the
# original casing, so 'John' stays capitalised in the vocabulary.
# min_df is set to 1 (the default): an integer min_df of 0 is rejected by the
# parameter validation in recent scikit-learn releases, and a threshold of 1
# keeps every term exactly as 0 did, since each vocabulary term occurs in at
# least one sentence.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)

# Mapping of each token to its column index in the count matrix.
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

## With the CountVectorizer you get a vector representing the count of each word in each sentence:

In [51]:
# Encode each sentence as a row of word counts, then densify it for display.
bow_counts = vectorizer.transform(sentences)
bow_counts.toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

## Take the sentences and labels. The `.values` attribute returns a NumPy array instead of a pandas Series object:

In [52]:
from sklearn.model_selection import train_test_split
# Restrict the combined frame to the Amazon reviews.
df_amazon = df[df['source'] == 'amazon']
sentences, y = df_amazon['sentence'].values, df_amazon['label'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

## Create the feature vectors for each sentence of the training and testing sets:

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
# The vocabulary is learned from the training sentences only; the test
# sentences are encoded with that same vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Display the sparse training matrix summary.
X_train

<750x1546 sparse matrix of type '<class 'numpy.int64'>'
	with 6817 stored elements in Compressed Sparse Row format>

## Use the scikit-learn library again, which provides the `LogisticRegression` classifier:

In [54]:
from sklearn.linear_model import LogisticRegression
# Fit on the training counts, then report mean accuracy on the held-out set.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.796


## Perform and evaluate the whole process for each data set that we have:

In [55]:
# Repeat the split / vectorize / fit / score pipeline once per data source.
for source in df['source'].unique():
    # Keep only the rows that belong to the current source.
    df_source = df[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    # Same 75/25 split and seed for every source.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # Vocabulary comes from the training sentences only.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    # Fit the classifier and report accuracy on the held-out sentences.
    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


## Take the sentences and labels of the Amazon data again. The `.values` attribute returns a NumPy array instead of a pandas Series object:

In [56]:
from sklearn.model_selection import train_test_split
# Select the Amazon rows and pull out the raw text and label arrays.
df_amazon = df[df['source'] == 'amazon']
sentences, y = df_amazon['sentence'].values, df_amazon['label'].values

# 75/25 train/test split with a fixed seed so the split is repeatable.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

## Create the feature vectors for each sentence of the training and testing sets:

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training sentences, then encode both splits
# with that vocabulary.
vectorizer = CountVectorizer().fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Display the sparse training matrix summary.
X_train

<750x1546 sparse matrix of type '<class 'numpy.int64'>'
	with 6817 stored elements in Compressed Sparse Row format>

## Use the scikit-learn library again, which provides the `LogisticRegression` classifier:

In [58]:
from sklearn.linear_model import LogisticRegression
# Train on the Amazon training counts and score on the held-out counts.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.796


## Take the sentences and labels of the IMDB data. The `.values` attribute returns a NumPy array instead of a pandas Series object:

In [59]:
from sklearn.model_selection import train_test_split
# Restrict the combined frame to the IMDB reviews.
df_imdb = df[df['source'] == 'imdb']
sentences, y = df_imdb['sentence'].values, df_imdb['label'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

## Create the feature vectors for each sentence of the training and testing sets:

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
# Vocabulary is built from the IMDB training sentences only; both splits are
# then encoded with it.
vectorizer = CountVectorizer().fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Display the sparse training matrix summary.
X_train

<561x2505 sparse matrix of type '<class 'numpy.int64'>'
	with 8413 stored elements in Compressed Sparse Row format>

## Use the scikit-learn library again, which provides the `LogisticRegression` classifier:

In [61]:
from sklearn.linear_model import LogisticRegression
# Train on the IMDB training counts and score on the held-out counts.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.7486631016042781


## Summary:

  - You have learned how to work with text classification.
  - We have gone from a bag-of-words model with logistic regression to increasingly more advanced methods leading to convolutional neural networks.
  - You have seen how to use hyperparameter optimization to squeeze more performance out of your model.
  - You now have an understanding of a crucial cornerstone in natural language processing.
  - You can use these techniques for text classification of all sorts.
  - Sentiment analysis is the most prominent example of this.
  - You could also combine sentiment analysis or text classification with speech recognition, using the SpeechRecognition library in Python.