Mounting Google Drive in Colab

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset is read from under
# this mount point further down.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Importing necessary Libraries

In [None]:
import io
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

Reading data from the CSV file into a list

In [None]:
# Load every data row of the CSV (header skipped) into a list of rows, then
# shuffle with a fixed seed so the later train/validation split is reproducible.
#
# newline='' is required by the csv module so that line breaks embedded inside
# quoted fields (the article text contains them) are handled by the csv parser
# rather than by the file object. The encoding is pinned explicitly instead of
# being left to the platform default.
with open("/content/drive/MyDrive/English_Dataset.csv", 'r',
          newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the header row
    data = list(reader)

random.seed(0)
random.shuffle(data)

# Show one raw row as a sanity check
print(data[0])

["One of the main ways that Facebook addicts users is games, and now Google+ is following Facebook into gaming.\n\nGoogle began introducing games on its social network Thursday, and while it is starting with just a few options, they include big-name games like Angry Birds from Rovio, Bejeweled Blitz from PopCap Games and, most notably, Zynga Poker.\n\nZynga has built the vast majority of its business on Facebook, and that business is thriving - Zynga has filed to go public and says it earned $90 million in profit on sales of $597 million last year. But analysts have also criticized it for relying too much on one platform.\n\nThat is about to change. Google+ has been growing remarkably quickly, and already it rivals existing social networks. Though it does not have Zynga's most well-known games, FarmVille and Mafia Wars, the two companies already have a partnership because Google has invested in Zynga.\n\nGoogle+ users will see a Games page at the top of their news feeds and can click o

Parsing data from the list and converting upper-case letters to lower case

In [None]:
# Column 0 of each row is the text, column 1 is its label.
# The text is whitespace-normalised (any run of whitespace, including the
# embedded newlines, collapses to one space) and lower-cased.
sentences = [" ".join(row[0].split()).lower() for row in data]
labels = [row[1] for row in data]

print(sentences[0])
print("\n\n the label of this sentence is: ", labels[0])

one of the main ways that facebook addicts users is games, and now google+ is following facebook into gaming. google began introducing games on its social network thursday, and while it is starting with just a few options, they include big-name games like angry birds from rovio, bejeweled blitz from popcap games and, most notably, zynga poker. zynga has built the vast majority of its business on facebook, and that business is thriving - zynga has filed to go public and says it earned $90 million in profit on sales of $597 million last year. but analysts have also criticized it for relying too much on one platform. that is about to change. google+ has been growing remarkably quickly, and already it rivals existing social networks. though it does not have zynga's most well-known games, farmville and mafia wars, the two companies already have a partnership because google has invested in zynga. google+ users will see a games page at the top of their news feeds and can click on it to play g

**Separating train/test data**

In [None]:
# Fraction of the samples that goes to the training split
split_size = 0.8
train_size = int(split_size * len(sentences))

# The first train_size items form the training split, the remainder the
# validation split; sentences and labels are cut at the same index so they
# stay aligned.
train_sentences, validation_sentences = sentences[:train_size], sentences[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

**Defining global parameters**

In [None]:
# --- Tokenizer settings ---
NUM_WORDS: int = 1000   # vocabulary cap, passed to the Tokenizer as num_words
NON_VOCAB: str = "<>"   # placeholder token substituted for out-of-vocabulary words

# --- Sequence shaping ---
MAXLEN: int = 120       # length every sequence is padded/truncated to
PADDING: str = 'post'   # padding strategy handed to pad_sequences

# --- Embedding ---
# Dimension of the dense embedding; not referenced by the cells visible in
# this part of the notebook.
EMBEDDING_DIM: int = 16

**Tokenizing data**

In [None]:
# The vocabulary is learned from the training split only; words the tokenizer
# does not keep are replaced by the NON_VOCAB token when texts are encoded.
tokenizer = Tokenizer(oov_token=NON_VOCAB, num_words=NUM_WORDS)
tokenizer.fit_on_texts(train_sentences)


**Creating Padded Sequences**

In [None]:
# Both splits must be padded identically, so the padding options are defined
# once. Only maxlen and padding are set; the truncation side is left at
# pad_sequences' default.
pad_options = {'maxlen': MAXLEN, 'padding': PADDING}

# Text -> lists of integer token ids
train_sequences = tokenizer.texts_to_sequences(train_sentences)
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)

# Token-id lists -> fixed-width 2-D arrays
train_padded_sequences = pad_sequences(train_sequences, **pad_options)
validation_padded_sequences = pad_sequences(validation_sequences, **pad_options)

In [None]:
# Sanity check: one row per training sentence, MAXLEN columns
print(train_padded_sequences.shape)

(1739, 120)


**Converting labels to numpy array**

In [None]:
# The labels are still strings at this point; convert each one with float()
# and keep the result as a 1-D array (no reshape to a column vector).
t_labels = np.array(list(map(float, train_labels)))
print(t_labels)
print(t_labels.shape)

# Same conversion for the validation split
v_labels = np.array(list(map(float, validation_labels)))

[1. 0. 0. ... 1. 1. 1.]
(1739,)


**Creating SVM classifier**

In [None]:
# Fit a linear-kernel SVM on the padded training sequences.
# NOTE(review): the features are the raw padded token ids, unscaled — confirm
# this is the intended representation for a linear model.
from sklearn.svm import SVC

svm_options = {'kernel': 'linear', 'random_state': 0, 'verbose': True}
classifier = SVC(**svm_options)
classifier.fit(train_padded_sequences, t_labels)

[LibSVM]

**Predicting the Test set results**

In [None]:
# Predict a label for every validation sample with the fitted SVM
y_pred = classifier.predict(validation_padded_sequences)

**Making the Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix

# Compare validation labels against the SVM predictions.
# In scikit-learn's layout, rows are the true labels and columns the
# predicted labels.
cm = confusion_matrix(v_labels, y_pred)

# Display the matrix; previously it was computed but never shown.
print(cm)

In [None]:
# Visualising the Training set results
#
# The SVM was fitted on full-width padded sequences, so it cannot score a
# two-column grid directly, and a 0.01-step grid over raw token-id ranges is
# far too large to allocate. Instead, the samples and the decision regions are
# drawn on the plane spanned by the first two principal components of the
# training data: each grid point is mapped back to the original feature space
# with inverse_transform before being passed to the classifier.
# Caveat: samples are shown as projections onto that plane, so a point can
# appear on the "wrong" side of the drawn boundary.
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

X_set, y_set = train_padded_sequences, t_labels

# The projection is always fitted on the training data
pca = PCA(n_components=2, random_state=0).fit(train_padded_sequences)
X_proj = pca.transform(X_set)

# Fixed-resolution grid (GRID_STEPS x GRID_STEPS) with a 5% margin around the data
GRID_STEPS = 200
margin = 0.05 * (X_proj.max(axis=0) - X_proj.min(axis=0))
lower, upper = X_proj.min(axis=0) - margin, X_proj.max(axis=0) + margin
X1, X2 = np.meshgrid(np.linspace(lower[0], upper[0], GRID_STEPS),
                     np.linspace(lower[1], upper[1], GRID_STEPS))

# Classify every grid point after lifting it back to the original feature space
grid_points = np.column_stack([X1.ravel(), X2.ravel()])
grid_pred = classifier.predict(pca.inverse_transform(grid_points)).reshape(X1.shape)

plt.contourf(X1, X2, grid_pred,
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` (not `c=`) because a single RGBA tuple is passed for the whole group
    plt.scatter(X_proj[y_set == j, 0], X_proj[y_set == j, 1],
                color = ListedColormap(('blue', 'black'))(i), label = j)
plt.title('SVM (Training set)')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.legend()
plt.show()

In [None]:
# Visualising the Test set results
#
# The SVM was fitted on full-width padded sequences, so it cannot score a
# two-column grid directly, and a 0.01-step grid over raw token-id ranges is
# far too large to allocate. Instead, the validation samples and the decision
# regions are drawn on the plane spanned by the first two principal components
# of the *training* data: each grid point is mapped back to the original
# feature space with inverse_transform before being passed to the classifier.
# Caveat: samples are shown as projections onto that plane, so a point can
# appear on the "wrong" side of the drawn boundary.
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

X_set, y_set = validation_padded_sequences, v_labels

# The projection is fitted on the training data only, then applied to the
# validation samples
pca = PCA(n_components=2, random_state=0).fit(train_padded_sequences)
X_proj = pca.transform(X_set)

# Fixed-resolution grid (GRID_STEPS x GRID_STEPS) with a 5% margin around the data
GRID_STEPS = 200
margin = 0.05 * (X_proj.max(axis=0) - X_proj.min(axis=0))
lower, upper = X_proj.min(axis=0) - margin, X_proj.max(axis=0) + margin
X1, X2 = np.meshgrid(np.linspace(lower[0], upper[0], GRID_STEPS),
                     np.linspace(lower[1], upper[1], GRID_STEPS))

# Classify every grid point after lifting it back to the original feature space
grid_points = np.column_stack([X1.ravel(), X2.ravel()])
grid_pred = classifier.predict(pca.inverse_transform(grid_points)).reshape(X1.shape)

# Region colours differ from the point colours so the samples stay visible
plt.contourf(X1, X2, grid_pred,
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` (not `c=`) because a single RGBA tuple is passed for the whole group
    plt.scatter(X_proj[y_set == j, 0], X_proj[y_set == j, 1],
                color = ListedColormap(('blue', 'black'))(i), label = j)
plt.title('SVM (Test set)')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.legend()
plt.show()