In [1]:
import pandas as pd
import numpy as np
import re
import os
import re
import emoji
from nltk.corpus import stopwords

In [2]:
# Read the training set, the test set and the sample submission, in that order.
df, test, sample = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sample_submission.csv'),
)

In [3]:
# Competition metric
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    appearing in either string.

    Parameters
    ----------
    str1, str2 : str
        Strings to compare.

    Returns
    -------
    float
        Value in [0.0, 1.0]. Two strings that both contain no words are
        treated as identical and score 1.0 (previously this case raised
        ZeroDivisionError).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both token sets are empty: the ratio is 0/0, so define it as a
        # perfect match instead of crashing.
        return 1.0
    return float(len(a & b)) / union_size

In [28]:
# Show the first rows of the training frame using the notebook's rich display.
display(df.head())
# Print the distinct values found in the 'sentiment' column.
print(df['sentiment'].unique())
# Print (rows, columns); this runs before the dropna below, so it is the pre-drop shape.
print(df.shape)
# Remove every row that contains a missing value; df itself is modified in place.
df.dropna(inplace=True)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


['neutral' 'negative' 'positive']
(27480, 4)


# Additional task*
___________
Let's predict the sentiment of these sentences.

In [30]:
def text_preprocessing(text):
    """Normalize a raw string for tokenization.

    Steps, in order:
      1. lower-case the text;
      2. insert a space before each of ``. , ! ?`` so punctuation becomes
         its own whitespace-separated token;
      3. replace every run of characters that are neither ASCII letters nor
         one of ``. , ! ?`` (including runs of whitespace) with one space.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The normalized text. Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    punctuation_spaced = re.sub(r"([.,!?])", r" \1", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", punctuation_spaced)

In [52]:
# Run text_preprocessing on every value of the 'text' column in both frames.
# The results are written back to the same column, replacing the raw text.
df['text'] = df['text'].apply(text_preprocessing)
test['text'] = test['text'].apply(text_preprocessing)

## Vocabulary

In [1]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Keeps a ``token -> index`` dict and the matching ``index -> token`` dict
    in sync, optionally reserving an entry for an "unknown" token that is
    returned when looking up a token that was never added.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Parameters
        ----------
        token_to_idx : dict, optional
            Existing ``token -> index`` mapping to start from. It is copied,
            so the caller's dict is never modified. New tokens receive
            ``len(mapping)`` as their index, which assumes the supplied
            indices are the contiguous range ``0..n-1``.
        add_unk : bool
            Whether to register ``unk_token`` in the vocabulary.
        unk_token : str
            Token used to represent unknown words.
        """

        if token_to_idx is None:
            token_to_idx = {}
        # Copy so that vocabularies built from another vocabulary's
        # serialized state do not share (and corrupt) the same dict.
        self._token_to_idx = dict(token_to_idx)

        # Reverse mapping, kept under its own attribute.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 signals "no unknown token registered".
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """Return the constructor arguments needed to rebuild this vocabulary.

        Returns
        -------
        dict
            Keys ``token_to_idx``, ``add_unk`` and ``unk_token``; the mapping
            is a copy of the internal one.
        """

        return {'token_to_idx': dict(self._token_to_idx),
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """Create a Vocabulary from the dict produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Add ``token`` to the vocabulary if it is not already present.

        Parameters
        ----------
        token : str
            Token to add.

        Returns
        -------
        index : int
            The token's index (the existing one if it was already known).
        """

        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index of ``token``.

        Parameters
        ----------
        token : str
            Token to search for.

        Returns
        -------
        index : int
            The token's index, or ``unk_index`` when the token is unknown and
            the vocabulary was created with ``add_unk=True``.

        Raises
        ------
        KeyError
            If the token is unknown and ``add_unk`` is False.
        """

        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``.

        Raises
        ------
        KeyError
            If ``index`` is not in the vocabulary.
        """

        if index not in self._idx_to_token:
            raise KeyError('the index (%d) is not in the Vocabulary' % index)
        return self._idx_to_token[index]

    def __str__(self):
        return '<Vocabulary(size=%d)>' % len(self)

    def __len__(self):
        return len(self._token_to_idx)

# Vectorizer

In [None]:
class Vectorizer(object):
    """Holds the two vocabularies used for vectorization: one for the input
    text and one for the target.
    """

    def __init__(self, text_vocab, target_vocab):
        """Store both vocabularies on the instance.

        Parameters
        ----------
        text_vocab : (Vocabulary)
            vocabulary intended for converting text to indices
        target_vocab : (Vocabulary)
            vocabulary intended for converting the target to indices
        """
        self.text_vocab, self.target_vocab = text_vocab, target_vocab

    def vectorize(self, text):
        """Placeholder for building a vector from ``text``.

        Parameters
        ----------
        text :
            the text to vectorize (presumably a string - TODO confirm once
            the method is implemented)

        Returns
        -------
        None
            The method has no implementation yet, so nothing is computed.
        """
        return None

# DataLoader