In [1]:
# Text Mining Project

#### Spring 2022
#### Opinion Mining Evaluation Forum

#### Students m20200246 Pedro Costa & m2019XXXX Ana Bernardes

In [2]:
### Data import

In [3]:
# Loading file
import pandas as pd

# Tab-separated corpus analysed by this notebook. Point DATA_PATH at
# 'training_set.txt' to work on the training split instead of the
# development split.
DATA_PATH = 'dev_set.txt'
df = pd.read_csv(DATA_PATH, sep='\t')

In [4]:
len(df)

1000

In [5]:
df.head()

Unnamed: 0,sentence,emotion
0,What happens to the gold in our safe ?,4
1,Natural to get cold feet .,8
2,"Not very lucky , is he ?",7
3,I'm just a little anxious to get up there and ...,2
4,Did you think we don't know about your affair ...,1


In [6]:
# Rebuild the frame restricted to the 'sentence' and 'emotion' columns, which
# are the columns the following cells read; any other column is dropped.
df = pd.DataFrame(data=df, columns=['sentence', 'emotion'])

In [7]:
df[100:400]

Unnamed: 0,sentence,emotion
100,We've got to get out of here .,4
101,I could have you shot .,1
102,"As a so called one percenter motorcycle club ,...",3
103,"With her , it would be marriage or nothing , a...",1
104,"Your mom wasn't ... well , and she needed time...",6
...,...,...
395,"Judge , you said we would be entitled to some ...",8
396,"Come on , Dad .",2
397,I can't do this .,4
398,We haven't anything to discuss with you .,1


In [8]:
# Check for empty cells
# TODO: assess whether empty sentences also need to be filtered out.
df.isnull().sum()

sentence    0
emotion     0
dtype: int64

In [9]:
# Check emotion frequency
df.emotion.value_counts()

1    211
2    170
8    158
4    104
5     97
7     96
6     87
3     77
Name: emotion, dtype: int64

In [10]:
from tqdm import tqdm_notebook as tqdm
def label_counter(df, field):
    """
    Count how many rows carry each distinct value of one column.

    df: DataFrame holding the data.
    field: label of the column whose values are tallied.

    Returns whatever ``value_counts()`` yields for that column
    (distinct values as index, occurrence counts as values).
    """
    column = df[field]
    return column.value_counts()

In [11]:
label_counter(df, "emotion")

1    211
2    170
8    158
4    104
5     97
7     96
6     87
3     77
Name: emotion, dtype: int64

In [12]:
# Word Count
def word_count(text):
    """
    Tally whitespace-separated tokens across a collection of strings.

    text: iterable of strings (e.g. a list or a Series of sentences).

    Returns a Series indexed by token whose values are the number of
    occurrences, as produced by ``value_counts()``.
    """
    # Glue all strings together with single spaces, then cut on whitespace
    tokens = str.join(" ", text).split()
    return pd.Series(tokens).value_counts()

In [13]:
word_count(list(df['sentence']))[:25]

.           734
,           380
I           238
you         234
to          223
?           211
the         187
a           160
!           120
[PERSON]    101
of          101
it           99
me           76
You          75
and          75
in           72
that         71
is           67
for          61
be           56
do           53
I'm          49
this         47
with         45
not          44
dtype: int64

In [14]:
#### Preprocessing

In [15]:
import re
import string

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# `tqdm.tqdm_notebook` is deprecated and warns on every call;
# `tqdm.notebook.tqdm` is the replacement named by that warning.
from tqdm.notebook import tqdm

# Corpora needed by the lemmatizer and by the stopword list below
nltk.download('wordnet')
nltk.download('stopwords')

# Shared preprocessing resources used by clean()
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
stem = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\migue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\migue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def clean(text_in, lemmatize, stemmer):
    """
    Normalise a collection of raw sentences.

    Every sentence goes through the following steps, in order:
    - strip HTML/XML markup (BeautifulSoup)
    - make everything lowercased
    - replace every character that is not an ASCII letter with a space
    - remove English stopwords (module-level ``stop`` set)
    - optionally replace words with the corresponding lemma
      (module-level ``lemma``)
    - optionally replace words with their stem (module-level ``stem``)

    text_in: iterable of strings, e.g. a list or a pandas Series. A Series
        may carry any index; only its values are read.
    lemmatize: when truthy, lemmatize every remaining word.
    stemmer: when truthy, stem every remaining word (applied after
        lemmatization when both flags are set).

    Returns a list with one cleaned string per input sentence, in input
    order. A sentence made only of stopwords/symbols yields ''.
    """
    updates = []
    # Iterate over the values themselves. Looking entries up as text_in[j]
    # for j in 0..n-1 is a *label* lookup on a Series and fails (or picks the
    # wrong rows) as soon as the index is not the default 0..n-1 range.
    for text in tqdm(text_in):

        # strip markup first: once the non-letters are blanked out below
        # there are no tags left for BeautifulSoup to recognise
        text = BeautifulSoup(text, "html.parser").get_text()

        # make everything lowercased
        text = text.lower()

        # removing all symbols that are not letters
        text = re.sub("[^a-zA-Z]", ' ', text)

        # remove stopwords
        words = [word for word in text.split() if word not in stop]

        # replace words with the corresponding lemma
        if lemmatize:
            words = [lemma.lemmatize(word) for word in words]

        # replace words with the corresponding stem; the SnowballStemmer
        # method is .stem()
        if stemmer:
            words = [stem.stem(word) for word in words]

        updates.append(' '.join(words))

    return updates

In [17]:
def update_df(dataframe, list_updated):
    """
    Overwrite the 'sentence' column of ``dataframe`` in place.

    dataframe: DataFrame with a 'sentence' column; it is modified in place.
    list_updated: new sentence values, one per row of ``dataframe`` and in
        row order.

    Returns None.
    Raises ValueError (from the DataFrame constructor) when the number of
    values does not match the number of rows.
    """
    # DataFrame.update aligns on index labels, so the replacement must carry
    # the target's own index. With a default 0..n-1 index the update would
    # silently miss (or hit the wrong) rows of a frame indexed otherwise,
    # e.g. one produced by a train/test split.
    # list(...) keeps the assignment positional even if a Series is passed.
    replacement = pd.DataFrame({"sentence": list(list_updated)},
                               index=dataframe.index)
    dataframe.update(replacement)

In [18]:
updates = clean(df["sentence"], lemmatize = True, stemmer = False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for j in tqdm(range(len(text_in))):


  0%|          | 0/1000 [00:00<?, ?it/s]

In [19]:
update_df(df, updates)
df

Unnamed: 0,sentence,emotion
0,happens gold safe,4
1,natural get cold foot,8
2,lucky,7
3,little anxious get whoop et as,2
4,think know affair government official,1
...,...,...
995,ask careful,4
996,like jazz pal,7
997,put,1
998,ever imagine person spot like,7


In [20]:
word_count(df['sentence'])[:20]

person      113
know         43
get          38
got          36
like         35
want         35
look         31
come         27
right        25
go           24
well         24
think        22
time         22
good         21
see          21
tell         20
one          19
u            19
location     18
gonna        18
dtype: int64

# -------------------------------------------------------------------
## Linear Regression (LR)

In [None]:
#### linear regression
# Baseline: TF-IDF features of the cleaned sentences fed to an ordinary
# least-squares regression on the numeric emotion code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# English stopwords are dropped and terms present in more than 80% of the
# sentences are ignored (max_df=0.8).
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.8)
X = vectorizer.fit_transform(df["sentence"])  
Y = df["emotion"]

# NOTE(review): 'emotion' takes 8 distinct integer codes in this data set and
# looks like a class label; a regression treats the codes as ordered
# quantities -- confirm that a classifier was not intended here.
regr = LinearRegression()
regr.fit(X, Y)

# Score (R^2 for a regressor) on the very rows the model was fitted on, so
# this is a training score, not an estimate of generalisation.
regr.score(X, Y)

In [None]:
df["emotion predicted"] = regr.predict(X)

In [None]:
df

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 95% / 5% hold-out split. The seed is fixed so that the same rows
# land in each part on every Restart & Run All, keeping scores comparable.
train, test = train_test_split(df, test_size=0.05, shuffle=True, random_state=42)

In [None]:
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.8)
X_train = vectorizer.fit_transform(train["sentence"])  
Y_train = train["emotion"]

In [None]:
regr = LinearRegression()
regr.fit(X_train, Y_train)

In [None]:
regr.score(X_train, Y_train)

In [None]:
X_test = vectorizer.transform(test["sentence"])

In [None]:
# Ground-truth labels of the held-out rows. Predictions live in their own
# variable so they are never scored against themselves.
Y_test = test["emotion"]
Y_pred = regr.predict(X_test)

# Score of the model fitted on the training split, measured on the held-out
# rows. The model is deliberately NOT refitted on the test data: fitting on
# (X_test, predictions) and scoring on the same pair says nothing about how
# well the model generalises.
regr.score(X_test, Y_test)

# -------------------------------------------------------------------
## SVC, LSTM and KNN classifiers

In [None]:
# Environment report: interpreter and library versions, plus GPU visibility
import sys

import pandas as pd
import sklearn as sk
import tensorflow as tf
import tensorflow.keras

print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
# A non-empty device list means TensorFlow can see at least one GPU
gpu = bool(tf.config.list_physical_devices('GPU'))
gpu_status = "available" if gpu else "NOT AVAILABLE"
print("GPU is", gpu_status)

In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Activation, Flatten, Input, concatenate, Conv1D, GlobalMaxPooling1D, MaxPooling1D

In [26]:
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding

In [36]:
from tensorflow.keras.preprocessing import sequence

In [21]:
df_1 = df.copy()
df_1.head()

Unnamed: 0,sentence,emotion
0,happens gold safe,4
1,natural get cold foot,8
2,lucky,7
3,little anxious get whoop et as,2
4,think know affair government official,1


In [None]:
from sklearn.model_selection import train_test_split

# 80% / 20% hold-out split of the cleaned frame; seeded so the split is the
# same after every kernel restart.
df_train, df_test = train_test_split(df_1, test_size=0.2, random_state=42)

In [None]:
# Fit the TF-IDF vocabulary on the training sentences only and turn them into
# the training feature matrix; the labels are the matching 'emotion' values.
# NOTE(review): X_train / Y_train are reassigned by the later
# train_test_split(X, Y, ...) cell, which replaces these features with raw
# sentences -- confirm which of the two is meant to feed the models below.
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.8)
X_train = vectorizer.fit_transform(df_train["sentence"])  
Y_train = df_train["emotion"]

In [32]:
X = df_1.sentence
Y = df_1.emotion

In [33]:
# 85% / 15% hold-out split of the sentences and their labels; seeded so that
# every model below is trained and evaluated on the same rows on each run.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15,random_state=42)

In [37]:
# Vocabulary cap handed to the Tokenizer (num_words) and reused as the
# input dimension of the Embedding layer in RNN()
max_words = 1000
# Fixed length every sequence is padded to (maxlen of pad_sequences); also
# the input length declared in RNN()
max_len = 150
tok = Tokenizer(num_words=max_words)
# The word index is learned from the training sentences only
tok.fit_on_texts(X_train)
# Sentences -> lists of integer word ids -> one padded matrix
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [38]:
def RNN(n_outputs=1, output_activation='sigmoid'):
    """
    Build (without compiling) an Embedding -> LSTM -> Dense classifier.

    The network reads integer sequences of length ``max_len`` whose ids are
    below ``max_words`` (both module-level settings), embeds them in 50
    dimensions, runs a 64-unit LSTM, then a 256-unit ReLU layer with 50%
    dropout, and ends with the output layer.

    n_outputs: number of units of the output layer. The default (1) keeps
        the original single-unit head.
    output_activation: activation applied to the output layer. The default
        ('sigmoid') keeps the original head.

    NOTE(review): the 'emotion' target of this notebook has 8 distinct
    values; a single sigmoid unit can only separate two classes, so a
    multi-class head (e.g. ``RNN(8, 'softmax')`` with a matching categorical
    loss and 0-based labels) is presumably what is needed -- confirm.

    Returns the uncompiled Keras ``Model``.
    """
    # Only `Sequential` is imported at file level, so the functional-API
    # `Model` class has to be brought into scope here.
    from tensorflow.keras.models import Model

    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(n_outputs,name='out_layer')(layer)
    layer = Activation(output_activation)(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [39]:
# RMSprop is not among the optimizers imported earlier in the notebook (only
# Adam is), so it is imported where it is used.
from tensorflow.keras.optimizers import RMSprop

model = RNN()
model.summary()
# NOTE(review): binary cross-entropy matches the single sigmoid output of
# RNN(), but the 'emotion' target has 8 distinct values -- confirm whether a
# multi-class loss/head was intended.
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

NotImplementedError: Cannot convert a symbolic Tensor (lstm/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [40]:
import numpy; print(numpy.__version__)

1.21.5


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# X_train / X_test hold raw sentences, which an SVC cannot consume directly.
# The pipeline turns them into TF-IDF features first, learning the vocabulary
# from the training sentences only.
# class_weight={1: 10} makes errors on emotion 1 cost ten times more than on
# the other classes, which keep the default weight of 1.
wclf = make_pipeline(
    TfidfVectorizer(stop_words="english", max_df=0.8),
    SVC(kernel='linear', C= 1, class_weight={1: 10}),
)
# Fit on the training split only: fitting on the full (X, Y) would let the
# model see the rows it is evaluated on below.
wclf.fit(X_train, Y_train)
weighted_prediction = wclf.predict(X_test)
print('Accuracy:', accuracy_score(Y_test, weighted_prediction))
print('F1 score:', f1_score(Y_test, weighted_prediction, average='weighted'))
print('Recall:', recall_score(Y_test, weighted_prediction, average='weighted'))
print('Precision:', precision_score(Y_test, weighted_prediction, average='weighted'))
print('\n clasification report:\n', classification_report(Y_test, weighted_prediction))
print('\n confussion matrix:\n',confusion_matrix(Y_test, weighted_prediction))

In [None]:
### KNN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Create KNN classifier. X_train holds raw sentences, so the classifier is
# wrapped in a pipeline that converts them to TF-IDF features first; the same
# transformation is then applied automatically at prediction time.
knn = make_pipeline(
    TfidfVectorizer(stop_words="english", max_df=0.8),
    KNeighborsClassifier(n_neighbors = 5),
)

# Fit the classifier to the data
knn.fit(X_train,Y_train)

In [None]:
#show first 5 model predictions on the test data
knn.predict(X_test)[:5]