In [None]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the sentence/emotion dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='dataset_2.csv')

Happy, Sad, Angry, Fear, Normal, Surprised

In [None]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Sentence,Emotion
0,"""It is very unpleasant, I am afraid of the pol...",3.0
1,"Pickles nearly had a fit, he barked and he bar...",2.0
2,He shut the door in Nutkin's face.,2.0
3,Old Mr. Brown turned up his eyes in disgust at...,2.0
4,"And to this day, if you meet Nutkin up a tree ...",2.0


**DATA PREPROCESSING**

In [None]:
# Number of missing values in each column.
df.isna().sum()

Sentence      0
Emotion     523
dtype: int64

In [None]:
# Give rows without an emotion label the label 1. Only the label column is
# filled (a blanket fillna would also overwrite gaps in any other column), and
# the frame is rebound rather than mutated in place.
# NOTE(review): every unlabelled row becomes class 1 — confirm that imputing is
# intended rather than dropping those rows.
df = df.fillna({'Emotion': 1})

In [None]:
# Re-check missing values per column after filling.
df.isna().sum()

Sentence    0
Emotion     0
dtype: int64

In [None]:
# (rows, columns) of the dataset.
df.shape

(2011, 2)

In [None]:
# How many sentences carry each emotion label.
df.Emotion.value_counts()

4.0    587
1.0    523
6.0    319
2.0    243
3.0    187
7.0    152
Name: Emotion, dtype: int64

In [None]:
# Encoder class, its matching tokenizer class, and the checkpoint name that
# both are loaded from.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

In [None]:
# Instantiate the pretrained encoder and its tokenizer from the checkpoint name.
model = model_class.from_pretrained(pretrained_weights)
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Encode each sentence as a list of token ids, special tokens included.
tokenized = df['Sentence'].map(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [None]:
# Length of the longest token sequence (0 if there are no sequences).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every sequence with id 0 so all rows have max_len entries.
padded = np.array(
    [token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized.values]
)

In [None]:
# (number of sentences, padded sequence length)
padded.shape

(2011, 116)

In [None]:
# 1 where a position holds a non-zero token id, 0 where it holds padding (id 0).
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2011, 116)

In [None]:
# Convert the mask and the padded ids to tensors for the encoder.
attention_mask = torch.tensor(attention_mask)
input_ids = torch.tensor(padded)

# Inference only: run the forward pass without gradient tracking.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Mount Google Drive at /content/drive; later cells read pickled models and
# embeddings from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# First element of the model output (per-token hidden states) as a NumPy array.
ftf = last_hidden_states[0].numpy()

In [None]:
# Shape of the per-token hidden-state array.
print(ftf.shape)

(2011, 116, 768)


In [None]:
# Keep only the hidden state at position 0 of every sequence (the [CLS] token
# for BERT); this is the fixed-size sentence representation.
features = last_hidden_states[0][:, 0].numpy()

In [None]:
# Shape of the feature matrix, then of a single sentence vector; the matrix
# itself is displayed as the cell result.
print(features.shape, features[0].shape, sep='\n')
features

(2011, 768)
(768,)


array([[ 0.1915593 ,  0.2802196 , -0.07284648, ...,  0.28744635,
         0.63377905,  0.59608954],
       [ 0.06012429,  0.0102344 , -0.3863501 , ..., -0.35002953,
         0.6891032 ,  0.32826984],
       [-0.13783091,  0.54226774, -0.22812462, ..., -0.18520956,
         0.4878332 ,  0.54929984],
       ...,
       [-0.00589724, -0.06791338, -0.42092624, ..., -0.44061896,
         0.50576574,  0.03719684],
       [ 0.183854  ,  0.08476199, -0.13364014, ..., -0.37431908,
         0.7551605 ,  0.12780835],
       [ 0.10534973,  0.1682531 ,  0.25186956, ..., -0.5198934 ,
         0.7853205 ,  0.11401182]], dtype=float32)

In [None]:
# Target vector: the emotion label of each sentence.
labels = df.Emotion

In [None]:
# Hold out a test set (scikit-learn's default split size). The fixed seed makes
# the split, and every score computed from it, reproducible across kernel
# restarts; stratifying keeps the class proportions equal in both parts.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42, stratify=labels
)

In [None]:
# parameters = {'C': np.linspace(0.0001, 100, 20)}
# grid_search = GridSearchCV(LogisticRegression(), parameters)
# grid_search.fit(train_features, train_labels)

# print('best parameters: ', grid_search.best_params_)
# print('best scores: ', grid_search.best_score_)

In [None]:
# Inverse regularisation strength. The value is one of the points of
# np.linspace(0.0001, 100, 20), the grid used in the commented-out search.
LOGREG_C = 5.263252631578947
lr_clf = LogisticRegression(C=LOGREG_C)

In [None]:
# Fit the classifier on the training split.
lr_clf.fit(X=train_features, y=train_labels)

LogisticRegression(C=5.263252631578947)

In [None]:
# Mean accuracy on the held-out split.
lr_clf.score(X=test_features, y=test_labels)

0.6620278330019881

In [None]:
# Predicted emotion label for every held-out sentence.
predictions = lr_clf.predict(X=test_features)

In [None]:
# Show only the first few predictions; dumping the whole array floods the
# cell output without adding information.
predictions[:20]

array([2., 4., 4., 1., 4., 7., 3., 1., 7., 2., 4., 3., 1., 2., 4., 1., 1.,
       1., 2., 2., 2., 6., 4., 2., 3., 1., 1., 4., 2., 6., 6., 7., 1., 6.,
       4., 6., 6., 2., 6., 4., 4., 4., 4., 1., 2., 1., 7., 1., 1., 4., 4.,
       1., 4., 4., 4., 1., 4., 4., 7., 1., 2., 3., 7., 1., 4., 2., 6., 4.,
       2., 1., 3., 4., 4., 7., 4., 6., 4., 4., 4., 6., 4., 2., 2., 1., 6.,
       4., 1., 4., 2., 4., 4., 1., 6., 4., 6., 2., 6., 3., 1., 1., 1., 1.,
       1., 4., 4., 6., 4., 1., 6., 4., 1., 1., 1., 4., 2., 1., 6., 6., 4.,
       1., 7., 4., 2., 6., 4., 4., 1., 1., 2., 7., 1., 4., 2., 4., 1., 1.,
       6., 4., 6., 1., 6., 4., 7., 6., 4., 7., 2., 1., 7., 4., 1., 4., 6.,
       1., 3., 2., 3., 3., 4., 4., 6., 1., 7., 2., 2., 1., 3., 7., 1., 2.,
       2., 6., 6., 4., 4., 6., 1., 4., 1., 1., 3., 4., 6., 3., 1., 4., 6.,
       4., 6., 6., 4., 3., 4., 6., 1., 1., 6., 2., 6., 6., 1., 2., 4., 4.,
       6., 2., 1., 1., 6., 1., 6., 3., 6., 4., 6., 4., 4., 1., 4., 4., 6.,
       1., 1., 6., 4., 4.

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out sentences whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(test_accuracy)

0.6620278330019881


In [None]:
def transform(df, max_length=512):
  """Convert sentences into fixed-size BERT feature vectors.

  Relies on the notebook-level `tokenizer` and `model` objects.

  Args:
    df: DataFrame with a 'Sentence' column holding the texts to encode.
    max_length: Upper bound on token ids per sentence (special tokens
      included). Longer sentences are truncated so they never exceed the
      encoder's position limit, which is 512 for bert-base-uncased.

  Returns:
    NumPy array with one row per sentence: the hidden state at position 0
    (the [CLS] token for BERT) taken from the first element of the model output.

  Raises:
    ValueError: If `df` contains no rows.
  """
  # Fail early with a clear message instead of an obscure error inside the model.
  if len(df) == 0:
    raise ValueError("transform() needs at least one sentence")

  tokenized = df['Sentence'].apply(
      lambda x: tokenizer.encode(
          x, add_special_tokens=True, truncation=True, max_length=max_length))
  max_len = max(len(ids) for ids in tokenized.values)

  # Right-pad with id 0 and mask the padded positions out of attention.
  padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
  attention_mask = np.where(padded != 0, 1, 0)
  input_ids = torch.tensor(padded)
  attention_mask = torch.tensor(attention_mask)

  # Inference only: no gradient tracking needed.
  with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
  features = last_hidden_states[0][:, 0, :].numpy()
  return features

In [None]:
import pickle

# Persist the fitted classifier to a file in the working directory.
with open('lr_clf_model', mode='wb') as model_file:
    pickle.dump(lr_clf, model_file)

In [None]:
import pickle

# Load a pickled classifier from Google Drive.
# NOTE(review): the classifier is saved as 'lr_clf_model' in the working
# directory, but this reads a copy under Drive — confirm the Drive file is the
# model you expect. pickle.load can execute arbitrary code, so only load files
# you trust.
with open('/content/drive/MyDrive/ED/Pretrained models/lr_clf_model', mode='rb') as model_file:
    lr_clf_2 = pickle.load(model_file)

In [None]:
# Embedding file name for each emotion label; the list is indexed with the
# integer value of the predicted label. Positions 0 and 5 hold a placeholder.
embed_array = [
    '__',             # 0
    'normal_embed',   # 1
    'angry_embed',    # 2
    'fear_embed',     # 3
    'happy_embed',    # 4
    '__',             # 5
    'sad_embed',      # 6
    'suprise_embed',  # 7
]

In [None]:
%tensorflow_version 2.x
import os
from os.path import exists, join, basename, splitext
git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  # clone and install
  !git clone -q --recursive {git_repo_url}
  # install dependencies
  !cd {project_name} && pip install -q -r requirements.txt
  !pip install -q gdown
  !apt-get install -qq libportaudio2
  !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

import sys
sys.path.append(project_name)

from IPython.display import display, Audio, clear_output
from IPython.utils import io
import ipywidgets as widgets
import numpy as np
from dl_colab_notebooks.audio import record_audio, upload_audio

from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path

encoder.load_model(Path("/content/drive/MyDrive/ED/Pretrained models/encoder.pt"))
synthesizer = Synthesizer(Path("/content/drive/MyDrive/ED/Pretrained models/synthesizer.pt"))
vocoder.load_model(Path("/content/drive/MyDrive/ED/Pretrained models/vocoder.pt"))

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
Loaded encoder "encoder.pt" trained to step 1564501
Synthesizer using device: cpu
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at /content/drive/MyDrive/ED/Pretrained models/vocoder.pt


In [None]:
def synthesize(embed, text):
  """Speak `text` with the voice described by `embed` and play it inline.

  Uses the notebook-level `synthesizer` to produce a spectrogram and
  `vocoder` to turn it into a waveform, then clears the cell output and
  displays an autoplaying audio widget. Returns None.
  """
  print("Synthesizing new audio...")
  spectrogram = synthesizer.synthesize_spectrograms([text], [embed])[0]
  waveform = vocoder.infer_waveform(spectrogram)
  # Append `sample_rate` zero samples, i.e. one second of trailing silence.
  waveform = np.pad(waveform, (0, synthesizer.sample_rate), mode="constant")
  clear_output()
  player = Audio(waveform, rate=synthesizer.sample_rate, autoplay=True)
  display(player)


In [None]:
# One example sentence, wrapped in a DataFrame so it can go through the same
# feature extraction as the training data.
Sentence_list = ["I AM happy but climate is bad"]
values = dict(Sentence=Sentence_list)
test_data = pd.DataFrame(data=values)
test_feature = transform(test_data)

In [None]:
# Classify the example sentence with the classifier loaded from Drive.
prediction = lr_clf_2.predict(X=test_feature)

In [None]:
# Predicted emotion label for the example sentence.
prediction

array([3.])

In [None]:
# Path of the embedding file that matches the predicted emotion label.
embed = Path("/content/drive/MyDrive/ED/Pretrained models") / embed_array[int(prediction[0])]

In [None]:
# Load the pickled embedding for the predicted emotion.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(embed, mode='rb') as embed_file:
    embedding_emo = pickle.load(embed_file)

In [None]:
# Speak the example sentence using the embedding chosen for its predicted emotion.
synthesize(embedding_emo, Sentence_list[0])

In [None]:
print("================= SVM =================")
from sklearn import svm
from sklearn.metrics import accuracy_score

# Linear-kernel SVM trained on the same features. It is stored under its own
# name so the logistic-regression model held in `lr_clf` is not overwritten by
# a different kind of estimator.
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(train_features, train_labels)

# Predict once and score the predictions; the former extra .score() call
# recomputed the same accuracy and discarded it.
predictions = svm_clf.predict(test_features)
print(predictions)
print(accuracy_score(test_labels, predictions))




[7. 2. 4. 1. 4. 7. 3. 1. 7. 7. 4. 6. 1. 2. 4. 1. 1. 1. 2. 2. 2. 6. 4. 2.
 3. 1. 1. 4. 2. 2. 6. 4. 1. 6. 4. 6. 6. 2. 6. 4. 4. 4. 4. 1. 2. 1. 7. 1.
 1. 4. 4. 1. 4. 1. 4. 1. 4. 4. 7. 1. 2. 7. 7. 4. 4. 2. 6. 4. 2. 1. 3. 7.
 4. 7. 4. 2. 4. 4. 4. 6. 4. 2. 2. 1. 2. 4. 1. 6. 2. 4. 4. 1. 6. 4. 6. 3.
 6. 3. 1. 1. 1. 1. 1. 2. 4. 6. 4. 1. 6. 4. 1. 1. 1. 4. 2. 1. 2. 6. 4. 1.
 7. 4. 2. 6. 4. 3. 1. 1. 2. 1. 1. 4. 4. 4. 1. 1. 6. 4. 6. 1. 6. 4. 7. 6.
 1. 7. 2. 1. 3. 4. 1. 4. 1. 1. 2. 2. 1. 3. 4. 1. 6. 1. 7. 1. 2. 1. 3. 7.
 6. 2. 2. 6. 6. 4. 4. 4. 1. 4. 1. 1. 3. 4. 6. 1. 1. 4. 6. 4. 6. 4. 4. 3.
 4. 6. 1. 1. 1. 2. 1. 1. 4. 2. 4. 4. 3. 1. 1. 2. 6. 1. 6. 3. 7. 4. 4. 4.
 4. 1. 4. 4. 6. 1. 1. 4. 4. 4. 3. 4. 4. 2. 1. 6. 1. 6. 6. 1. 1. 1. 3. 1.
 4. 3. 4. 1. 1. 6. 3. 3. 1. 1. 2. 7. 4. 1. 1. 4. 7. 7. 1. 2. 1. 2. 2. 6.
 1. 1. 6. 1. 1. 1. 4. 3. 4. 4. 1. 4. 1. 1. 4. 6. 4. 7. 4. 1. 2. 1. 1. 1.
 1. 6. 3. 4. 1. 6. 4. 4. 4. 1. 4. 4. 3. 3. 4. 4. 3. 4. 2. 1. 4. 1. 6. 4.
 4. 3. 4. 4. 6. 6. 6. 4. 1. 1. 1. 6. 4. 4. 4. 1. 2.

In [None]:
print("=================RandomForestClassifier =================")
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score  # imported here so the cell does not depend on an earlier one

# Random forest trained on the same features. It is kept under its own name
# instead of rebinding `lr_clf`, and seeded so the reported accuracy is
# reproducible.
rf_clf = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=42)
rf_clf.fit(train_features, train_labels)

# Predict once and score the predictions; the former extra .score() call
# recomputed the same accuracy and discarded it.
predictions = rf_clf.predict(test_features)
print(predictions)
print(accuracy_score(test_labels, predictions))

[6. 4. 4. 1. 1. 1. 3. 1. 1. 2. 4. 6. 1. 3. 4. 4. 1. 1. 1. 2. 3. 6. 6. 2.
 4. 1. 1. 1. 6. 4. 6. 1. 1. 6. 4. 6. 1. 2. 4. 4. 4. 4. 1. 1. 4. 1. 4. 1.
 1. 1. 4. 1. 4. 4. 4. 1. 4. 4. 4. 1. 1. 6. 2. 1. 4. 4. 4. 4. 4. 1. 3. 4.
 4. 4. 4. 6. 4. 4. 4. 4. 4. 4. 2. 1. 6. 4. 1. 4. 4. 4. 6. 1. 4. 4. 6. 4.
 6. 4. 1. 1. 1. 1. 1. 4. 1. 1. 4. 1. 6. 4. 1. 1. 1. 1. 2. 1. 6. 6. 4. 1.
 4. 4. 4. 6. 4. 4. 1. 1. 4. 1. 1. 4. 2. 4. 1. 1. 4. 1. 6. 1. 4. 4. 1. 4.
 1. 1. 4. 1. 1. 4. 1. 4. 1. 1. 6. 1. 1. 1. 4. 4. 6. 1. 1. 1. 6. 1. 4. 1.
 1. 2. 2. 1. 6. 1. 4. 4. 1. 4. 1. 1. 6. 4. 4. 1. 1. 4. 4. 4. 6. 4. 4. 2.
 4. 4. 1. 1. 1. 4. 1. 4. 1. 2. 4. 4. 2. 1. 1. 1. 4. 1. 6. 4. 6. 4. 1. 4.
 4. 1. 4. 4. 6. 1. 1. 6. 4. 4. 4. 4. 4. 1. 1. 6. 1. 1. 6. 4. 1. 1. 4. 1.
 6. 4. 4. 6. 1. 1. 4. 3. 1. 1. 2. 2. 4. 1. 4. 1. 2. 4. 1. 4. 1. 2. 2. 6.
 1. 2. 4. 1. 1. 1. 4. 1. 4. 4. 1. 4. 1. 1. 4. 6. 4. 1. 4. 1. 4. 1. 1. 1.
 1. 6. 4. 4. 1. 6. 1. 4. 4. 1. 4. 4. 4. 1. 4. 4. 4. 4. 2. 1. 4. 1. 4. 1.
 4. 4. 4. 6. 6. 4. 4. 1. 1. 1. 1. 6. 4. 4. 4. 1. 2.