In [1]:
import pandas as pd
import numpy as np

import nltk
import re
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('tagsets')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [2]:
# Load the training, development and test splits from disk
# (tab-separated files whose first row holds the column names).
sentence, dev_set, test_set = (
    pd.read_csv(split_path, sep='\t', header=0)
    for split_path in (
        "Data-20220517/training_set.txt",
        "Data-20220517/dev_set.txt",
        "Data-20220517/test_set.txt",
    )
)
# Preview the first rows of the training split.
sentence.head()

Unnamed: 0,sentence,emotion
0,I'm too old to be traded in .,6
1,Mother said you could always tell a lady by he...,8
2,I always said I'd leave off when the time came .,6
3,He'll be safe with me .,2
4,Lay off .,1


In [3]:
#define preprocessing function that will be used in the TfidfVectorizer
def preprocessing_tok(text, POS=False,baseline=False):
    '''
        Pre-process one text and return its tokens. Intended to be passed as the
        ``tokenizer`` callable of a TfidfVectorizer.

        Steps: lowercase, replace unwanted characters with spaces, tokenise
        (optionally interleaving part-of-speech tags), drop English stop words
        (except "not") and lemmatise what is left.

        Parameters
        ----------
        text : str
            The raw text to process.
        POS : bool, default False
            When True the text is tokenised with ``nltk.word_tokenize`` and tagged
            with ``nltk.pos_tag``; every word is followed by its tag in the result.
            When False the text is simply split on whitespace.
        baseline : bool, default False
            When True the ``[location]`` / ``[person]`` placeholders and every
            non-letter character are replaced by a space. When False every
            character other than letters, ``?`` and ``!`` is replaced by a space.

        Returns
        -------
        list of str
            The remaining tokens after stop-word removal and lemmatisation.

        Example
        -------
        Input: 'I'm too old to be traded in .'
        Output: ['old', 'traded']
    '''

    stop_words = set(stopwords.words("english"))
    # "not" is deliberately kept as a token; discard() (unlike remove()) does not
    # raise if the stop-word list happens not to contain it.
    stop_words.discard('not')

    # Convert to lowercase (the placeholder patterns below rely on this)
    text = text.lower()

    # Remove punctuation; raw strings keep the regex escapes valid Python literals
    if baseline:
        text = re.sub(r'(\[location\]|\[person\]|[^a-zA-Z])', ' ', text)
    else:
        text = re.sub(r'[^a-zA-Z?!]', ' ', text)

    if POS:
        # POS tagging: flatten the (word, tag) pairs into word, tag, word, tag, ...
        # The result must stay a list; joining it back into a string would make
        # the comprehension below iterate over single characters.
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        tokens = [item for pair in tagged for item in pair]
    else:
        # Convert to list from string
        tokens = text.split()

    # Lemmatization of every token that is not a stop word
    lem = WordNetLemmatizer()
    return [lem.lemmatize(word) for word in tokens if word not in stop_words]

## Creating Classifier

In [4]:
# Transform a corpus into a TF-IDF matrix of shape (n, m), where n is the number
# of texts and m the number of retained n-gram features.
TfidfVec = TfidfVectorizer(
            max_df=0.9,
            max_features=10000, 
            ngram_range=(1,3),
            tokenizer=preprocessing_tok,
            # token_pattern is ignored when a tokenizer is given; None silences
            # the "token_pattern will not be used" warning from scikit-learn.
            token_pattern=None
            )

# TF-IDF matrix of the training set (the vocabulary is fitted here only)
Tfidf_matrix = TfidfVec.fit_transform(sentence.sentence.values)
# TF-IDF matrix of the validation set
Tfidf_matrix_val = TfidfVec.transform(dev_set.sentence.values)
# TF-IDF matrix of the test set
Tfidf_matrix_test = TfidfVec.transform(test_set.sentence.values)

clf = SVC(kernel='linear')

# Training arrays
X= Tfidf_matrix.toarray()
y = sentence.emotion.values
clf.fit(X,y)

# Validation arrays
x_val = Tfidf_matrix_val.toarray()
y_val = dev_set.emotion.values
y_val_pred = clf.predict(x_val)

# Score the predictions against the gold labels of the validation set
print(classification_report(y_val, y_val_pred))

# Test arrays
x_tes = Tfidf_matrix_test.toarray()
y_tes_pred = clf.predict(x_tes)

# Export the predictions.
# The dev predictions are written from a copy so the gold labels in dev_set are
# not overwritten: re-running this cell would otherwise read the predictions
# back as y_val.
test_set['results'] = y_tes_pred
dev_set.assign(emotion=y_val_pred).to_csv('dev_results.txt', sep='\t', index=False)

test_set.to_csv('test_results.txt', sep='\t', index=False)