In [None]:
import pandas as pd
import numpy as np

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# **Reading the csv**

In [None]:
data= pd.read_csv("/content/tweet_emotions.csv")
print(data)

         tweet_id  ...                                            content
0      1956967341  ...  @tiffanylue i know  i was listenin to bad habi...
1      1956967666  ...  Layin n bed with a headache  ughhhh...waitin o...
2      1956967696  ...                Funeral ceremony...gloomy friday...
3      1956967789  ...               wants to hang out with friends SOON!
4      1956968416  ...  @dannycastillo We want to trade with someone w...
...           ...  ...                                                ...
39995  1753918954  ...                                   @JohnLloydTaylor
39996  1753919001  ...                     Happy Mothers Day  All my love
39997  1753919005  ...  Happy Mother's Day to all the mommies out ther...
39998  1753919043  ...  @niariley WASSUP BEAUTIFUL!!! FOLLOW ME!!  PEE...
39999  1753919049  ...  @mopedronin bullet train from tokyo    the gf ...

[40000 rows x 3 columns]


In [None]:
data.columns

Index(['tweet_id', 'sentiment', 'content'], dtype='object')

In [None]:
y_total = list(data.sentiment)
print((set(y_total)))

{'anger', 'fun', 'neutral', 'boredom', 'worry', 'surprise', 'relief', 'love', 'hate', 'empty', 'happiness', 'sadness', 'enthusiasm'}


In [None]:

print(len(data[data.sentiment == 'surprise']))
print(len(data[data.sentiment == 'neutral']))
print(len(data[data.sentiment == 'happiness']))
print(len(data[data.sentiment == 'relief']))
print(len(data[data.sentiment == 'sadness']))
print(len(data[data.sentiment == 'worry']))
print(len(data[data.sentiment == 'fun']))
print(len(data[data.sentiment == 'empty']))
print(len(data[data.sentiment == 'anger']))
print(len(data[data.sentiment == 'love']))
print(len(data[data.sentiment == 'boredom']))
print(len(data[data.sentiment == 'enthusiasm']))
print(len(data[data.sentiment == 'hate']))

2187
8638
5209
1526
5165
8459
1776
827
110
3842
179
759
1323


In [None]:
# Dropping rows with other emotion labels
data = data.drop(data[data.sentiment == 'anger'].index)
data = data.drop(data[data.sentiment == 'enthusiasm'].index)
data = data.drop(data[data.sentiment == 'boredom'].index)
data = data.drop(data[data.sentiment == 'empty'].index)
data = data.drop(data[data.sentiment == 'hate'].index)

In [None]:
data['sentiment'] = data['sentiment'].replace({'anger': 'negative'})
data['sentiment'] = data['sentiment'].replace({'fun': 'positive'})
data['sentiment'] = data['sentiment'].replace({'boredom': 'negative'})
data['sentiment'] = data['sentiment'].replace({'worry': 'negative'})
data['sentiment'] = data['sentiment'].replace({'surprise': 'positive'})
data['sentiment'] = data['sentiment'].replace({'relief': 'positive'})
data['sentiment'] = data['sentiment'].replace({'love': 'positive'})
data['sentiment'] = data['sentiment'].replace({'hate': 'negative'})
data['sentiment'] = data['sentiment'].replace({'empty': 'neutral'})
data['sentiment'] = data['sentiment'].replace({'happiness': 'positive'})
data['sentiment'] = data['sentiment'].replace({'sadness': 'negative'})
data['sentiment'] = data['sentiment'].replace({'enthusiasm': 'neutral'})

  
# # writing into the file
# df.to_csv("AllDetails.csv", index=False)
  
y_total = list(data.sentiment)
print((set(y_total)))

{'neutral', 'positive', 'negative'}


# **Converting text to lowercase**

In [None]:
data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))
print(data['content'])

1        layin n bed with a headache ughhhh...waitin on...
2                      funeral ceremony...gloomy friday...
4        @dannycastillo we want to trade with someone w...
5        re-pinging @ghostridah14: why didn't you go to...
6        i should be sleep, but im not! thinking about ...
                               ...                        
39995                                     @johnlloydtaylor
39996                        happy mothers day all my love
39997    happy mother's day to all the mommies out ther...
39998    @niariley wassup beautiful!!! follow me!! peep...
39999    @mopedronin bullet train from tokyo the gf and...
Name: content, Length: 36802, dtype: object


# **Removing punctuations and symbols**

In [None]:
import string
data['content'] = data['content'].apply(lambda x: " ".join(x.translate(str.maketrans('','',string.punctuation)) for x in x.split()))

In [None]:
print(data['content'])

1        layin n bed with a headache ughhhhwaitin on yo...
2                            funeral ceremonygloomy friday
4        dannycastillo we want to trade with someone wh...
5        repinging ghostridah14 why didnt you go to pro...
6        i should be sleep but im not thinking about an...
                               ...                        
39995                                      johnlloydtaylor
39996                        happy mothers day all my love
39997    happy mothers day to all the mommies out there...
39998    niariley wassup beautiful follow me peep out m...
39999    mopedronin bullet train from tokyo the gf and ...
Name: content, Length: 36802, dtype: object


# **Removing stop words**

In [None]:
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
              "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
              "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these",
              "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
              "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
              "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
              "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
              "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
              "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
              "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

In [None]:
print(data['content'])

1                   layin n bed headache ughhhhwaitin call
2                            funeral ceremonygloomy friday
4        dannycastillo want trade someone houston ticke...
5        repinging ghostridah14 didnt go prom bc bf did...
6        sleep im thinking old friend want hes married ...
                               ...                        
39995                                      johnlloydtaylor
39996                               happy mothers day love
39997    happy mothers day mommies woman man long youre...
39998    niariley wassup beautiful follow peep new hit ...
39999    mopedronin bullet train tokyo gf visiting japa...
Name: content, Length: 36802, dtype: object


# **Lemmatisation**

In [None]:
from textblob import Word

data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [None]:
print(data['content'])

1                   layin n bed headache ughhhhwaitin call
2                            funeral ceremonygloomy friday
4        dannycastillo want trade someone houston ticke...
5        repinging ghostridah14 didnt go prom bc bf did...
6        sleep im thinking old friend want he married d...
                               ...                        
39995                                      johnlloydtaylor
39996                                happy mother day love
39997    happy mother day mommy woman man long youre mo...
39998    niariley wassup beautiful follow peep new hit ...
39999    mopedronin bullet train tokyo gf visiting japa...
Name: content, Length: 36802, dtype: object


# **Stemming**

In [None]:
ps = PorterStemmer()
 
data['content'] = data['content'].apply(lambda x: " ".join([ps.stem(word) for word in x.split()]))

In [None]:
print(data['content'])

1                    layin n bed headach ughhhhwaitin call
2                              funer ceremonygloomi friday
4        dannycastillo want trade someon houston ticket...
5        reping ghostridah14 didnt go prom bc bf didnt ...
6        sleep im think old friend want he marri damn a...
                               ...                        
39995                                      johnlloydtaylor
39996                                happi mother day love
39997    happi mother day mommi woman man long your mom...
39998    niariley wassup beauti follow peep new hit sin...
39999    mopedronin bullet train tokyo gf visit japan s...
Name: content, Length: 36802, dtype: object


# **Removing 10,000 rare words**

In [None]:
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

In [None]:
freq = list(freq.index)
print(freq)

['slurpe', 'phoneit', 'zekemurphi', 'stooopit', 'rgambarini', 'goofin', 'greencapt', 'himquot', 'mattmarquess', 'eeeperschoic', 'weidl', 'wantplz', 'krissylin', 'wholl', 'matthalveland', 'alanajoy', 'quoteat', 'cairn', 'pushit', 'poorest', 'somepleas', 'wrongand', 'httptwitpiccom67318', 'indiepixi', 'andytaylorson', 'sesion', 'usquotwhat', 'littlewel', 'vieriu', 'ryland', 'httptwitpiccom4wn29', 'paulamackay', 'werecat1', 'minequot', 'wrigley', 'woodywoodford', 'trackk', 'acousticalt', 'zoita', 'bhaaji', 'janey', 'humira', 'tapeworm', 'tinocochino', 'ilsedelang', 'amytcathi', 'cokebabyy', 'tci', 'emzo2k9', 'outnow', 'tantrum', 'mercenari', 'parikh', 'wolfkitten', 'cookoff', 'selfvalid', 'everblu', 'churchsunday', 'gard', 'yd', 'minneapolisst', 'serendip', 'overlap', 'pkuer', 'atravï¿½', 'niiight', 'nttn', 'befibeez', 'caaaaant', 'purrtti', 'girland', 'guyquot', 'manthigh81', 'amandagelso', 'hahahaa', 'xl', 'quotyummyquot', 'quickpwn', 'basment', 'sikaflex', 'ambitiousmurphi', 'worldtrav

In [None]:
data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [None]:
print(data['content'])

1                    layin n bed headach ughhhhwaitin call
2                              funer ceremonygloomi friday
4                     want trade someon houston ticket one
5        reping ghostridah14 didnt go prom bc bf didnt ...
6        sleep im think old friend want he marri damn a...
                               ...                        
39995                                      johnlloydtaylor
39996                                happi mother day love
39997    happi mother day mommi woman man long your mom...
39998    niariley wassup beauti follow peep new hit sin...
39999    mopedronin bullet train tokyo gf visit japan s...
Name: content, Length: 36802, dtype: object


# **Label encoding**

In [None]:
label_encoding = preprocessing.LabelEncoder()
y = label_encoding.fit_transform(data.sentiment.values)

In [None]:
for i in range(len(y)):
  if y[i] == 0:
    print(y[i],'-',data.sentiment.values[i])
    break
for i in range(len(y)):
  if y[i] == 1:
    print(y[i],'-',data.sentiment.values[i])
    break
for i in range(len(y)):
  if y[i] == 2:
    print(y[i],'-',data.sentiment.values[i])
    break

0 - negative
1 - neutral
2 - positive


# **Train test split**

In [None]:
Test_size = 3000

In [None]:
X_train, X_val, y_train, y_val = train_test_split(data.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

In [None]:

X_train_ = X_train[:Test_size]
y_train_ = y_train[:Test_size]

X_test = np.append(X_val, X_train_)
y_test = np.append(y_val, y_train_)


# **Extracting TF-IDF parameters**

In [None]:
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_test)

# **Extracting Count Vectors Parameters**

In [None]:
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data['content'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_test)

# **Building models using count vectors feature**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Model 1: Multinomial Naive Bayes Classifier
print('Using TF-IDF as feature vector, accuracy for')
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes = %s' % accuracy_score(y_pred, y_test))


# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('Linear SVM = %s' % accuracy_score(y_pred, y_test))

# Model 3: logistic regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('logistic regression = %s' % accuracy_score(y_pred, y_test))


# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)
print('random forest = %s' % accuracy_score(y_pred, y_test))


Using TF-IDF as feature vector, accuracy for
naive bayes = 0.40428079628798086
Linear SVM = 0.41206406226612785


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


logistic regression = 0.39589881754228406
random forest = 0.3955994611585092


# **Building models using count vectors feature**

In [None]:

print('Using Count Vectors as feature vector, accuracy for')
# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes = accuracy %s' % accuracy_score(y_pred, y_test))


# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('Linear SVM = accuracy %s' % accuracy_score(y_pred, y_test))


# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('logistic regression = accuracy %s' % accuracy_score(y_pred, y_test))


Using Count Vectors as feature vector, accuracy for
naive bayes = accuracy 0.6561891932345457
Linear SVM = accuracy 0.6301451878461308
logistic regression = accuracy 0.7088759167789254


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
di = {0:"negative",1:"neutral",2:"positive"}

# **Demo for some random statements**

In [None]:
#Below are some random statements. The first 4 depict happiness. The last 4 depict sadness
#0 - negative
#1 - neutral
#2 - positive
tweets = pd.DataFrame(['This is a nice project', 
                       '', 
                       'There will be coffee, sweets and dance', 
                       'Oh, the chocolate will be wonderful', 
                       'I am so sad as I am missing home', 
                       'At home alone with not much to do',
                       'I want to enjoy with my friends',
                       'It will be a lot more fun in college',
                       'I will kill you',
                       'I am positive',
                       'load beard papa disappear uk',
                       'I will fail the test'])

# Doing some preprocessing on these tweets as done before
tweets[0] = tweets[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
from textblob import Word
tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Extracting Count Vectors feature from our tweets
tweet_count = count_vect.transform(tweets[0])

#Predicting the emotion of the tweet using our already trained logistic regression
tweet_pred = logreg.predict(tweet_count)
print(tweet_pred)


[2 1 2 0 0 1 2 2 0 1 1 0]


# **Evaluation metrics**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
print("CONFUSION MATRIX - ")
print(confusion_matrix(y_pred, y_test))
print()
print("ACCURACY SCORE","-",accuracy_score(y_pred, y_test))
print()
print("CLASSIFICATION REPORT"," - ")
print(classification_report(y_pred, y_test))

CONFUSION MATRIX - 
[[1848  324  362]
 [ 257  867  255]
 [ 380  367 2021]]

ACCURACY SCORE - 0.7088759167789254

CLASSIFICATION REPORT  - 
              precision    recall  f1-score   support

           0       0.74      0.73      0.74      2534
           1       0.56      0.63      0.59      1379
           2       0.77      0.73      0.75      2768

    accuracy                           0.71      6681
   macro avg       0.69      0.70      0.69      6681
weighted avg       0.71      0.71      0.71      6681



# **Speech to text**

In [None]:
!pip install pydub
!pip install SpeechRecognition
import speech_recognition as sr
import os
from pydub import AudioSegment

def speech_to_text(path):
  r = sr.Recognizer()

  sound = AudioSegment.from_ogg(path)
  sound.export("/content/live_demo.wav", format="wav")
  path = path[:-3] + "wav"
  with sr.AudioFile(path) as source:
    audio = r.listen(source)
  
  try:
    print(r.recognize_google(audio))
  except:
    pass



In [None]:
voice_1 = speech_to_text("/content/live_demo.wav")

this is a nice project


# **Speech emotion detection**

In [None]:
tweets = pd.DataFrame([str(voice_1)])
# Doing some preprocessing on these tweets as done before
tweets[0] = tweets[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
from textblob import Word
tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Extracting Count Vectors feature from our tweets
tweet_count = count_vect.transform(tweets[0])

#Predicting the emotion of the tweet using our already trained linear SVM
tweet_pred = logreg.predict(tweet_count)
print(tweet_pred[0],'-',di[tweet_pred[0]])


2 - positive
