# Transfer Learning MNIST

* Train a simple convnet on the MNIST dataset the first 5 digits [0..4].
* Freeze convolutional layers and fine-tune dense layers for the classification of digits [5..9].

## 1. Import necessary libraries for the model

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [48]:
import datetime
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

Using TensorFlow backend.


In [0]:
now = datetime.datetime.now

batch_size = 128
num_classes = 5
epochs = 5

# input image dimensions
img_rows, img_cols = 28, 28

# number of convolutional filters to use
filters = 32

# size of pooling area for max pooling
pool_size = 2

# convolution kernel size
kernel_size = 3

In [0]:
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

## 2. Import MNIST data and create 2 datasets with one dataset having digits from 0 to 4 and other from 5 to 9 

In [51]:
# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


In [0]:
# datasets one with digits below 5 and one with 5 and above
x_train_lt5 = x_train[y_train < 5]
y_train_lt5 = y_train[y_train < 5]
x_test_lt5 = x_test[y_test < 5]
y_test_lt5 = y_test[y_test < 5]

x_train_gte5 = x_train[y_train >= 5]
y_train_gte5 = y_train[y_train >= 5] - 5
x_test_gte5 = x_test[y_test >= 5]
y_test_gte5 = y_test[y_test >= 5] - 5

## 3. Print x_train, y_train, x_test and y_test for both the datasets

In [54]:
print('x_train shape:', x_train.shape)
print(x_train.shape[0], ' train samples')
print(x_test.shape[0], ' test samples')

x_train shape: (60000, 28, 28)
60000  train samples
10000  test samples


In [55]:
print('y_train shape:', y_train.shape)
print(y_train.shape[0], ' train samples')
print(y_test.shape[0], ' test samples')

y_train shape: (60000,)
60000  train samples
10000  test samples


## ** 4. Let us take only the dataset (x_train, y_train, x_test, y_test) for Integers 0 to 4 in MNIST **
## Reshape x_train and x_test to a 4 Dimensional array (channel = 1) to pass it into a Conv2D layer

In [0]:
Xtrain_lt5 = x_train_lt5.reshape((x_train_lt5.shape[0],) + input_shape)
Xtest_lt5 = x_test_lt5.reshape((x_test_lt5.shape[0],) + input_shape)

## 5. Normalize x_train and x_test by dividing it by 255

In [0]:

Xtrain_lt5 = Xtrain_lt5.astype('float32')
Xtest_lt5 = Xtest_lt5.astype('float32')

In [0]:
Xtrain_lt5 /= 255
Xtest_lt5 /= 255

## 6. Use One-hot encoding to divide y_train and y_test into required no of output classes

In [0]:
# convert class vectors to binary class matrices
Ytrain_lt5 = keras.utils.to_categorical(y_train_lt5, num_classes)
Ytest_lt5 = keras.utils.to_categorical(y_test_lt5, num_classes)

## 7. Build a sequential model with 2 Convolutional layers with 32 kernels of size (3,3) followed by a Max pooling layer of size (2,2) followed by a drop out layer to be trained for classification of digits 0-4  

## 8. Post that flatten the data and add 2 Dense layers with 128 neurons and neurons = output classes with activation = 'relu' and 'softmax' respectively. Add dropout layer inbetween if necessary  

In [60]:
# two groups of layers: feature (convolutions) and classification (dense)
feature_layers = [
    Conv2D(filters, kernel_size, padding='valid', input_shape=input_shape),
    Activation('relu'),
    Conv2D(filters, kernel_size),
    Activation('relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),
    Flatten(),
]

classification_layers = [
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax')
]

W0616 12:38:10.631964 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [61]:
# create complete model
model = Sequential(feature_layers + classification_layers)

W0616 12:39:03.681552 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0616 12:39:03.693982 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0616 12:39:03.740486 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0616 12:39:03.742978 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0616 12:39:03.752383 140539759208320 de

In [62]:
model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])

W0616 12:39:16.729054 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0616 12:39:16.757362 140539759208320 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [63]:
time_stamp_lt5 = now()

model.fit(Xtrain_lt5, Ytrain_lt5, 
          batch_size = batch_size, 
          epochs = epochs, 
          verbose = 1, 
          validation_data = (Xtest_lt5, Ytest_lt5))

W0616 12:39:26.195399 140539759208320 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 30596 samples, validate on 5139 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd1a9ba3198>

## 9. Print the training and test accuracy

In [64]:
print('Training time digits below 5: %s' % (now() - time_stamp_lt5))

score_lt5 = model.evaluate(Xtest_lt5, Ytest_lt5, verbose=0)
print(('Test score digits below 5: {0:.2f}%').format(score_lt5[0]*100))
print(('Test accuracy digits below 5: {0:.2f}%').format(score_lt5[1]*100))

Training time digits below 5: 0:00:24.526665
Test score digits below 5: 0.58%
Test accuracy digits below 5: 99.73%


## 10. Make only the dense layers to be trainable and convolutional layers to be non-trainable

In [0]:
# freeze feature layers and rebuild model
for i in feature_layers:
    i.trainable = False

## 11. Use the model trained on 0 to 4 digit classification and train it on the dataset which has digits 5 to 9  (Using Transfer learning keeping only the dense layers to be trainable)

In [0]:
Xtrain_gte5 = x_train_gte5.reshape((x_train_gte5.shape[0],) + input_shape)
Xtest_gte5 = x_test_gte5.reshape((x_test_gte5.shape[0],) + input_shape)

In [0]:
Xtrain_gte5 = Xtrain_gte5.astype('float32')
Xtest_gte5 = Xtest_gte5.astype('float32')

Xtrain_gte5 /= 255
Xtest_gte5 /= 255

In [0]:
Ytrain_gte5 = keras.utils.to_categorical(y_train_gte5, num_classes)
Ytest_gte5 = keras.utils.to_categorical(y_test_gte5, num_classes)

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])

In [70]:
time_stamp_gte5 = now()

model.fit(Xtrain_gte5, Ytrain_gte5, 
          batch_size = batch_size, 
          epochs = epochs, 
          verbose = 1, 
          validation_data = (Xtest_gte5, Ytest_gte5))

Train on 29404 samples, validate on 4861 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd19c080f60>

## 12. Print the accuracy for classification of digits 5 to 9

In [71]:
print('Training time digits one with 5 and above: %s' % (now() - time_stamp_gte5))

score_gte5 = model.evaluate(Xtest_gte5, Ytest_gte5, verbose=0)
print(('Test score digits one with 5 and above: {0:.2f}%').format(score_gte5[0]*100))
print(('Test accuracy digits one with 5 and above: {0:.2f}%').format(score_gte5[1]*100))

Training time digits one with 5 and above: 0:00:14.383868
Test score digits one with 5 and above: 2.17%
Test accuracy digits one with 5 and above: 99.30%


## Sentiment analysis <br> 

The objective of the second problem is to perform Sentiment analysis from the tweets data collected from the users targeted at various mobile devices.
Based on the tweet posted by a user (text), we will classify if the sentiment of the user targeted at a particular mobile device is positive or not.

### 13. Read the dataset (tweets.csv) and drop the NA's while reading the dataset

In [0]:
from google.colab import drive

In [3]:
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [0]:
data =  pd.read_csv("/content/drive/My Drive/Colab Notebooks/Recidency_8/tweets.csv",encoding = "ISO-8859-1",engine='python')

In [74]:
data.shape

(9093, 3)

In [0]:
dat_1 = data.dropna(axis = 0)

In [76]:
dat_1.shape

(3291, 3)

In [77]:
dat_1.columns

Index(['tweet_text', 'emotion_in_tweet_is_directed_at',
       'is_there_an_emotion_directed_at_a_brand_or_product'],
      dtype='object')

In [78]:
#comparing sizes of data frames 
print("Old data frame length:", len(data), 
      "\nNew data frame length:", len(dat_1), 
      "\nNumber of rows with at least 1 NA value: ", (len(data)-len(dat_1)))

Old data frame length: 9093 
New data frame length: 3291 
Number of rows with at least 1 NA value:  5802


In [0]:
dat_1.rename(columns = {'tweet_text':'tweettext',
                          'emotion_in_tweet_is_directed_at':'source',
                          'is_there_an_emotion_directed_at_a_brand_or_product':'sentiment'}, 
               inplace = True)

### 14. Preprocess the text and add the preprocessed text in a column with name `text` in the dataframe.

In [0]:
import re

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [0]:
def preprocess(text):
    try:
        return remove_special_characters(text, remove_digits=True) #text.decode('ascii')
    except Exception as e:
        return ""

In [0]:
dat_1['text'] = [preprocess(text) for text in dat_1.tweettext]

In [91]:
dat_1.head(5)

Unnamed: 0,tweettext,source,sentiment,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley I have a G iPhone After hrs tweeting a...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee Know about fludapp Awesome iPadiPhon...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin Can not wait for iPad also They sh...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw I hope this years festival isnt as crashy...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff on Fri SXSW Marissa Maye...


In [92]:
dat_1.sentiment.value_counts()

Positive emotion                      2672
Negative emotion                       519
No emotion toward brand or product      91
I can't tell                             9
Name: sentiment, dtype: int64

### 15. Consider only rows having Positive emotion and Negative emotion and remove other rows from the dataframe.

In [0]:
Positive_emotion = dat_1[dat_1.sentiment == 'Positive emotion']

In [0]:
Negative_emotion = dat_1[dat_1.sentiment == 'Negative emotion']

In [0]:
data = Positive_emotion.append(Negative_emotion,ignore_index=True)

In [96]:
data.shape

(3191, 4)

### 16. Represent text as numerical data using `CountVectorizer` and get the document term frequency matrix

#### Use `vect` as the variable name for initialising CountVectorizer.

In [0]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1, 1))

In [98]:
vect.fit(data['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### 17. Find number of different words in vocabulary

In [99]:
# examine the fitted vocabulary
len(vect.get_feature_names())

5906

#### Tip: To see all available functions for an Object use dir

In [100]:
dir(vect)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_sort_features',
 '_stop_words_id',
 '_validate_custom_analyzer',
 '_validate_params',
 '_validate_vocabulary',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',
 'fit_transform',
 'fixed_vocabulary_',
 'get_feature_names',
 'get_params',
 'get_stop_words',
 'input',
 'inverse_transf

### 18. Find out how many Positive and Negative emotions are there.

Hint: Use value_counts on that column

In [101]:
data.sentiment.value_counts()

Positive emotion    2672
Negative emotion     519
Name: sentiment, dtype: int64

### 19. Change the labels for Positive and Negative emotions as 1 and 0 respectively and store in a different column in the same dataframe named 'Label'

Hint: use map on that column and give labels

In [102]:
data["target"] = data.sentiment.map(lambda x: 1 if x == 'Positive emotion' else 0)
data["target"].value_counts()

1    2672
0     519
Name: target, dtype: int64

### 20. Define the feature set (independent variable or X) to be `text` column and `labels` as target (or dependent variable)  and divide into train and test datasets

In [0]:
data_dtm = vect.transform(data['text'])

In [104]:
# examine the vocabulary and document-term matrix together
data_dtm_1 = pd.DataFrame(data_dtm.toarray(), columns=vect.get_feature_names())
data_dtm_1.head()

Unnamed: 0,__,______,_______,_______quot,aapl,abacus,abandoned,aber,able,about,abroad,absolute,absolutely,abt,abuzz,academy,acc,acceptable,access,accessibility,accessible,accessories,accessory,accesssxsw,accommodate,according,accordion,account,acerbic,achieve,acknowledge,aclu,aclugoogle,aclus,acquired,across,acrosse,action,actionquot,actions,...,york,you,you_,youd,youll,youneedthis,youquot,your,youre,yours,yourself,youtube,youve,yowza,yr,yrs,yrsday,yummy,yup,zaarly,zaarlyiscoming,zagg,zaggle,zappos,zazzle,zazzlesxsw,zazzlsxsw,ze,zelda,zeldman,zero,zimride,zip,zite,zms,zombies,zomg,zone,zoom,zzzs
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 21. **Predicting the sentiment:**


### Use Naive Bayes and Logistic Regression and their accuracy scores for predicting the sentiment of the given text

In [105]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_dtm_1, data['target'], test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(2552, 5906) (2552,)
(639, 5906) (639,)


In [0]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [107]:
# train the model using X_train_dtm
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test)

In [109]:
# calculate accuracy of class predictions
from sklearn import metrics
print(('Accuracy: {0:.2f}%').format(metrics.accuracy_score(y_test, y_pred_class)*100))

Accuracy: 87.32%


In [110]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 40,  50],
       [ 31, 518]])

In [0]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)

In [112]:
y_pred_class = clf.predict(X_test)
print(('Accuracy: {0:.2f}%').format(metrics.accuracy_score(y_test, y_pred_class)*100))

Accuracy: 89.20%


In [113]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 44,  46],
       [ 23, 526]])

## 22. Create a function called `tokenize_predict` which can take count vectorizer object as input and prints the accuracy for x (text) and y (labels)

In [0]:
def tokenize_test(vect):
    x_train_dtm = vect.fit_transform(x_train)
    print('Features: ', x_train_dtm.shape[1])
    x_test_dtm = vect.transform(x_test)
    nb = MultinomialNB()
    nb.fit(x_train_dtm, y_train)
    y_pred_class = nb.predict(x_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)*100)

### Create a count vectorizer function which includes n_grams = 1,2  and pass it to tokenize_predict function to print the accuracy score

In [115]:

x_train, x_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2)
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  25657
Accuracy:  87.79342723004694


### Create a count vectorizer function with stopwords = 'english'  and pass it to tokenize_predict function to print the accuracy score

In [116]:
vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
tokenize_test(vect)

Features:  19821
Accuracy:  87.79342723004694


### Create a count vectorizer function with stopwords = 'english' and max_features =300  and pass it to tokenize_predict function to print the accuracy score

In [117]:
vect = CountVectorizer(ngram_range=(1, 2),stop_words='english',max_features =300)
tokenize_test(vect)

Features:  300
Accuracy:  78.87323943661971


### Create a count vectorizer function with n_grams = 1,2  and max_features = 15000  and pass it to tokenize_predict function to print the accuracy score

In [118]:
vect = CountVectorizer(ngram_range=(1, 2),stop_words='english',max_features =15000)
tokenize_test(vect)

Features:  15000
Accuracy:  87.63693270735524


### Create a count vectorizer function with n_grams = 1,2  and include terms that appear at least 2 times (min_df = 2)  and pass it to tokenize_predict function to print the accuracy score

In [119]:
vect = CountVectorizer(ngram_range=(1, 2),stop_words='english',max_features =15000,min_df=2)
tokenize_test(vect)

Features:  5780
Accuracy:  87.01095461658842
