In [0]:
from google.colab import drive
drive.mount('/content/drive')

# CV and Transfer Learning

Q1. Import tensorflow (2.x Mandatory)

*   Import other required libraries

In [2]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import cifar10
import os

TensorFlow 2.x selected.


Q2. Load CIFAR10 dataset from keras and split into train and test
*    Identify shape of x_train and y_train

In [0]:
(trainX, trainY), (testX, testY) = cifar10.load_data()

In [0]:
batch_size = 32
epochs = 20
num_classes = 10
model_name = 'keras_cifar10_trained_model.h5'

Q3.

*   Transform x_train and x_test on scale of 0-1
*   Transform y_train and y_test to categories



In [0]:
trainX = trainX.astype('float32')
testX = testX.astype('float32')
trainX /= 255
testX /= 255

# Convert class vectors to binary class matrices.
y_train = to_categorical(trainY, num_classes=num_classes)
y_test = to_categorical(testY, num_classes=num_classes)

Q4. Import necessary packages required for Model building
*   Conv2D, Dense, Flatten, Dropout, MaxPooling2D etc.

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Conv2D, Flatten, Dense, Dropout, Reshape, MaxPool2D, Activation, MaxPooling2D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator

Q5. Prepare a CNN
 
*   Which will include above layers
*   Freely create your own Architecture and Arguments
*   Print Model Summary

In [0]:
model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=trainX.shape[1:]))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(128, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(128, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 32, 32, 32)        896       
_________________________________________________________________
activation (Activation)      (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 30, 30, 32)        9248      
_________________________________________________________________
activation_1 (Activation)    (None, 30, 30, 32)        0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 15, 15, 32)        0         
_________________________________________________________________
dropout (Dropout)            (None, 15, 15, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 15, 15, 64)        1

In [0]:
datagen = ImageDataGenerator(
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        # randomly shift images horizontally (fraction of total width)
        width_shift_range=0.1,
        # randomly shift images vertically (fraction of total height)
        height_shift_range=0.1,
        fill_mode='nearest',
        horizontal_flip=True,  # randomly flip images
        vertical_flip=True,  # randomly flip images
        # fraction of images reserved for validation (strictly between 0 and 1)
        validation_split=0.0)

datagen.fit(trainX)

Q6. Train the CNN

*   Compile the model
*   Fit the model (10 epochs, 32 batch size)
*   Evaluate Model Performance

In [14]:
# Fit the model on the batches generated by datagen.flow().
model.fit(datagen.flow(trainX, y_train,
                    batch_size=batch_size),
                    epochs=epochs,
                    validation_data=(testX, y_test),
                    workers=4)

  ...
    to  
  ['...']
Train for 1563 steps, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2cd811c6d8>

In [16]:
save_dir = os.path.join(os.getcwd(), 'saved_models')
# Save model and weights
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

Saved trained model at /content/saved_models/keras_cifar10_trained_model.h5 


In [0]:
# metrics.classification_report(model.predict(testX), y_test)

Q7. Import packages required for VGG16

*   `tf.keras.application`
> VGG16, preprocess_input, decode_predictions
*   `tf.keras.preprocessing`
> load_img, img_to_array



In [0]:
from tensorflow.keras.applications import vgg16
from tensorflow.keras.preprocessing.image import load_img, img_to_array

Q8. Load image


*   Mount Google Drive
*   Navigate to image location (use `os`)
*   Load image and assign a variable (use `load_img`)

In [0]:
image_dir = os.path.join(os.getcwd(), '/content/drive/My Drive/Colab Notebooks/Lab/Datasets/images/')

In [32]:
os.listdir(image_dir)

['49436743043_2441587ab9_c.jpg',
 '49438170746_8378201627_c.jpg',
 '49441887332_107afa786d_c.jpg',
 '4994221690_d070e8a355_c.jpg']

Q9. Preprocess the image


*   Convert image into array (use `img_to_array`)
*   Check shape of image
*   Reshape image into 4 dimensional format (use `reshape`)
*   Prepare the image for VGG16 (Use `preprocess_input()`)

In [0]:
images_array = []
for image_path in os.listdir(image_dir):
    images_array.append(load_img(image_dir + image_path))

In [0]:
for image_index in range(len(images_array)):
    model_input = images_array[image_index].resize((224,224))
    model_input = img_to_array(model_input)
    #Image array should be normalized in same way as was done for VGG training
    model_input = vgg16.preprocess_input(model_input)
    #Add a dimension to input data to make it a 4D input as required by model
    model_input = np.expand_dims(model_input, axis=0)
    images_array[image_index] = model_input

Q10. Predict the Class of image


*   Use `predict()` to calculate probabilities (Assign a variable)
*   Convert the probabilities to class labels (Use `decode_predictions`)(Assign a variable)
*   Print the classification results


> Use 
>*  label = label[0][0]
>*   print('%s (%.2f%%)' % (label[1], label[2]*100))
>*(where label is variable assigned for `decode_predictions` )



In [0]:
#Load VGG Model
model = vgg16.VGG16(include_top=True, #Should we include classification Layers
                                        weights='imagenet', #Load imagenet weights, 'None' will load random weights
                                        input_shape=(224,224,3)) #Input image size

In [0]:
from tensorflow.keras.applications.vgg16 import decode_predictions

In [58]:
#Predict
for index in range(4):
    prediction = model.predict(images_array[index])
    print(decode_predictions(prediction, top=2)[0])

[('n01833805', 'hummingbird', 0.36529157), ('n01828970', 'bee_eater', 0.3073225)]
[('n03393912', 'freight_car', 0.3433729), ('n04467665', 'trailer_truck', 0.23375992)]
[('n02279972', 'monarch', 0.99720985), ('n02281406', 'sulphur_butterfly', 0.0020955447)]
[('n03063599', 'coffee_mug', 0.7589643), ('n07930864', 'cup', 0.07653624)]


In [45]:
#Prediction Class
np.argmax(prediction[0])

94

#NLP

## Question 1

Read file 'tweets.csv'

In [0]:
import pandas as pd


In [0]:
tweets_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Lab/Datasets/tweets.csv', encoding='mac_roman')

In [80]:
tweets_df.sample(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
5994,RT @mention Hey @mention why not roll a tracto...,iPad,Positive emotion
2291,@mention Brian - Actually Austin doesn't have ...,,No emotion toward brand or product
6112,RT @mention If you're in a room full of people...,,No emotion toward brand or product
7064,#iPad 2 Pop-up Store. #SXSW {link},,No emotion toward brand or product
8180,Me too. RT @mention Love it. #sxsw: &quot;appl...,Apple,Positive emotion
2220,"Marissa Mayer, VP of Search at Google is on li...",,No emotion toward brand or product
7313,"Gearing up to make a splash, @mention to Launc...",,No emotion toward brand or product
3073,Brutal question served up to Marissa Mayer abo...,,No emotion toward brand or product
2073,#bettersearch #sxsw Google Hotpot--looks like ...,,No emotion toward brand or product
5731,RT @mention Following the Bing/Google SEO sess...,,No emotion toward brand or product


In [81]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
tweet_text                                            9092 non-null object
emotion_in_tweet_is_directed_at                       3291 non-null object
is_there_an_emotion_directed_at_a_brand_or_product    9093 non-null object
dtypes: object(3)
memory usage: 213.2+ KB


**Drop null values**

*   Drop all the rows with null values




In [82]:
tweets_df.isna().any().describe()

count        3
unique       2
top       True
freq         2
dtype: object

In [83]:
tweets_df.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

**Print the dataframe**
*   print initial 5 rows of the data


In [0]:
tweets_df.dropna(inplace=True)

In [85]:
tweets_df.isna().any().describe()

count         3
unique        1
top       False
freq          3
dtype: object

In [86]:
tweets_df.isna().sum()

tweet_text                                            0
emotion_in_tweet_is_directed_at                       0
is_there_an_emotion_directed_at_a_brand_or_product    0
dtype: int64

In [87]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3291 entries, 0 to 9088
Data columns (total 3 columns):
tweet_text                                            3291 non-null object
emotion_in_tweet_is_directed_at                       3291 non-null object
is_there_an_emotion_directed_at_a_brand_or_product    3291 non-null object
dtypes: object(3)
memory usage: 102.8+ KB


In [88]:
tweets_df.sample(5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
1588,Yay! RT â€œ@mention Flipboard's next platform ...,iPhone,Positive emotion
472,"Before It Even Begins, Apple Wins #SXSW {link}...",Apple,Positive emotion
4717,"&quot;I know I'm right, I own an Apple&quot; #...",Apple,Positive emotion
5815,RT @mention Google denies it will launch major...,Other Google product or service,Positive emotion
4737,In the Consumerist's &quot;Worst Company in Am...,Apple,Negative emotion


##Question 2

**Preprocess data**


*   convert all text to lowercase - use .lower()
*   select only numbers, alphabets, and #+_ from text - use re.sub()
*   strip all the text - use .strip() [To remove extra spaces]

In [0]:
import re

In [0]:
tweets_df['tweet_text'] = tweets_df['tweet_text'].apply(lambda s: re.sub('[^0-9a-z #+_]','',s))
tweets_df['tweet_text'] = tweets_df['tweet_text'].apply(lambda s: s.lower())

In [0]:
tweets_df['tweet_text'] = tweets_df['tweet_text'].str.strip()

## Question 3

**Preprocess data**


*   in column "is_there_an_emotion_directed_at_a_brand_or_product"
select only those rows where value equal to "positive emotion" or "negative emotion"
*   find the value counts of "positive emotion" and "negative emotion"





In [92]:
tweets_df['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [94]:
tweets_df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion                      2672
Negative emotion                       519
No emotion toward brand or product      91
I can't tell                             9
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [113]:
df_positive_negative = tweets_df[(tweets_df['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Positive emotion') |
          (tweets_df['is_there_an_emotion_directed_at_a_brand_or_product'] == 'Negative emotion')]

df_positive_negative.head(5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,wesley83 have a 3 ihone fter 3 hrs tweeting a...,iPhone,Negative emotion
1,jessedee now about fludapp wesome iadihone ap...,iPad or iPhone App,Positive emotion
2,swonderlin an not wait for #iad 2 also hey sho...,iPad,Positive emotion
3,sxsw hope this years festival isnt as crashy ...,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff on ri # arissa ayer oogl...,Google,Positive emotion


In [114]:
df_positive_negative['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion    2672
Negative emotion     519
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

##Question 4

### Encode labels
- in column "is_there_an_emotion_directed_at_a_brand_or_product"
    - change "positive emotion" to 1
    - change "negative emotion" to 0
- use map function to replace values

In [0]:
# from sklearn.preprocessing import LabelEncoder
# LE = LabelEncoder()
# df_positive_negative['is_there_an_emotion_directed_at_a_brand_or_product'] = LE.fit_transform(df_positive_negative['is_there_an_emotion_directed_at_a_brand_or_product'])

In [120]:
df_positive_negative['is_there_an_emotion_directed_at_a_brand_or_product'] = df_positive_negative['is_there_an_emotion_directed_at_a_brand_or_product'].map({
    'Positive emotion': 1, 
    'Negative emotion': 0
    })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [121]:
df_positive_negative.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,wesley83 have a 3 ihone fter 3 hrs tweeting a...,iPhone,0
1,jessedee now about fludapp wesome iadihone ap...,iPad or iPhone App,1
2,swonderlin an not wait for #iad 2 also hey sho...,iPad,1
3,sxsw hope this years festival isnt as crashy ...,iPad or iPhone App,0
4,sxtxstate great stuff on ri # arissa ayer oogl...,Google,1


## Question 5

### Get feature and label
- get column "tweet_text" as feature
- get column "is_there_an_emotion_directed_at_a_brand_or_product" as label

In [0]:
# feature
X = df_positive_negative['tweet_text']

# target
y = df_positive_negative['is_there_an_emotion_directed_at_a_brand_or_product']

### Create train and test data
- use train_test_split to get train and test set
- set a random_state
- test_size: 0.25

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

## Question 6

### Vectorize data
- create document-term matrix
- use CountVectorizer()
    - ngram_range: (1, 2)
    - stop_words: 'english'
    - min_df: 2   
- do fit_transform on X_train
- do transform on X_test

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer(ngram_range=(1,2), stop_words='english', min_df=2)

In [0]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [0]:
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [133]:
# examine the document-term matrix
X_train_dtm

<2393x5489 sparse matrix of type '<class 'numpy.int64'>'
	with 33095 stored elements in Compressed Sparse Row format>

In [134]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<798x5489 sparse matrix of type '<class 'numpy.int64'>'
	with 9131 stored elements in Compressed Sparse Row format>

## Question 7

### Select classifier logistic regression
- use logistic regression for predicting sentiment of the given tweet
- initialize classifier

In [0]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

### Fit the classifer
- fit logistic regression classifier

In [157]:
# train the model using X_train_dtm
logreg.fit(X_train_dtm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Question 8

### Select classifier naive bayes
- use naive bayes for predicting sentiment of the given tweet
- initialize classifier
- use MultinomialNB

In [0]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

### Fit the classifer
- fit naive bayes classifier

In [142]:
# train the model using X_train_dtm
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Question 9

### Make predictions on logistic regression
- use your trained logistic regression model to make predictions on X_test

In [0]:
# make class predictions for X_test_dtm
y_pred_class_log = logreg.predict(X_test_dtm)

### Make predictions on naive bayes
- use your trained naive bayes model to make predictions on X_test
- use a different variable name to store predictions so that they are kept separately

In [0]:
# make class predictions for X_test_dtm
y_pred_class_naive = nb.predict(X_test_dtm)

## Question 10

### Calculate accuracy of logistic regression
- check accuracy of logistic regression classifer
- use sklearn.metrics.accuracy_score

In [0]:
# calculate accuracy of class predictions
from sklearn import metrics

In [159]:
# Score logistic model
metrics.accuracy_score(y_test, y_pred_class_log)

0.8734335839598998

### Calculate accuracy of naive bayes
- check accuracy of naive bayes classifer
- use sklearn.metrics.accuracy_score

In [147]:
# Score naive model
metrics.accuracy_score(y_test, y_pred_class_naive)

0.8784461152882206

In [155]:
# Trying some experiment with logistic model
logreg_weighted = LogisticRegression(class_weight={
    0: 5, 1: 1
})

logreg_weighted.fit(X_train_dtm, y_train)

# make class predictions for X_test_dtm
y_pred_class_log_weighted = logreg_weighted.predict(X_test_dtm)

# Score logistic weighted model
metrics.accuracy_score(y_test, y_pred_class_log_weighted)

0.849624060150376

In [162]:
print(metrics.classification_report(y_test, y_pred_class_log_weighted))

              precision    recall  f1-score   support

           0       0.54      0.58      0.56       130
           1       0.92      0.90      0.91       668

    accuracy                           0.85       798
   macro avg       0.73      0.74      0.73       798
weighted avg       0.85      0.85      0.85       798



In [161]:
print(metrics.classification_report(y_test, y_pred_class_log))

              precision    recall  f1-score   support

           0       0.77      0.32      0.45       130
           1       0.88      0.98      0.93       668

    accuracy                           0.87       798
   macro avg       0.83      0.65      0.69       798
weighted avg       0.86      0.87      0.85       798



Observation from two different verion of logistic regression:
- Simple logistic model was performing low on the negative values
- after providing the weights to class having less number of records model starts predicting the negative values more better