In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
lemmatizer = WordNetLemmatizer()
import pandas as pd
import numpy as np
import nltk
import re
import string
ps = PorterStemmer()
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Reference https://www.kaggle.com/parulpandey/eda-and-preprocessing-for-bert

def clean(tweet):
    tweet = str(tweet)

    tweet=tweet.lower()

    #Remove html tags
    tweet=re.sub('<.*?>','',tweet)

    #Remove text in square brackets
    tweet=re.sub('\[.*?\]','',tweet)

    #Remove hyperlinks
    tweet=re.sub('https?://\S+|www\.\S+','',tweet)
    
    #Remove puntuation
    tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)

    return tweet

In [3]:
train = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
target = train['sentiment']

In [4]:
train_clean = train.copy()

train_clean['text'] = train_clean['text'].apply(lambda x:clean(x))
train_clean['selected_text'] = train_clean['selected_text'].apply(lambda x:clean(x))

In [5]:
train_clean

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,id have responded if i were going,neutral
1,549e992a42,sooo sad i will miss you here in san diego,sooo sad,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,sons of why couldnt they put them on the rel...,sons of,negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,d lost,negative
27477,4f4c4fc327,ive wondered about rake to the client has ma...,dont force,negative
27478,f67aae2310,yay good for both of you enjoy the break you...,yay good for both of you,positive
27479,ed167662a5,but it was worth it,but it was worth it,positive


In [6]:
train_clean = train_clean.dropna()

In [7]:
X_train, X_val = train_test_split(train_clean, test_size = 0.3, random_state = 17)

In [8]:
pos_train = train_clean[train_clean['sentiment'] == 'positive']
neutral_train = train_clean[train_clean['sentiment'] == 'neutral']
neg_train = train_clean[train_clean['sentiment'] == 'negative']

In [9]:
# https://www.kaggle.com/nkoprowicz/a-simple-solution-using-only-word-counts/notebook
# Se usa CountVectorizer para encontrar el conteo de palabras (Bag of words)

cv = CountVectorizer(max_df=0.95, min_df=2,
                                     max_features=10000,
                                     stop_words='english')

X_train_cv = cv.fit_transform(train_clean['text'])

X_pos = cv.transform(pos_train['text'])
X_neutral = cv.transform(neutral_train['text'])
X_neg = cv.transform(neg_train['text'])

pos_count_df = pd.DataFrame(X_pos.toarray(), columns=cv.get_feature_names())
neutral_count_df = pd.DataFrame(X_neutral.toarray(), columns=cv.get_feature_names())
neg_count_df = pd.DataFrame(X_neg.toarray(), columns=cv.get_feature_names())

# Se crea un diccionario por sentimiento con el valor de cada palabras 

pos_words = {}
neutral_words = {}
neg_words = {}

for k in cv.get_feature_names():
    pos = pos_count_df[k].sum()
    neutral = neutral_count_df[k].sum()
    neg = neg_count_df[k].sum()
    
    pos_words[k] = pos/pos_train.shape[0]
    neutral_words[k] = neutral/neutral_train.shape[0]
    neg_words[k] = neg/neg_train.shape[0]

neg_words_adj = {}
pos_words_adj = {}
neutral_words_adj = {}

for key, value in neg_words.items():
    neg_words_adj[key] = neg_words[key] - (neutral_words[key] + pos_words[key])
for key, value in pos_words.items():
    pos_words_adj[key] = pos_words[key] - (neutral_words[key] + neg_words[key])
for key, value in neutral_words.items():
    neutral_words_adj[key] = neutral_words[key] - (neg_words[key] + pos_words[key])

In [10]:
def calculate_selected_text(df_row, tol = 0):
    
    tweet = df_row['text']
    sentiment = df_row['sentiment']
    
    if(sentiment == 'neutral'):
        return tweet
    
    elif(sentiment == 'positive'):
        dict_to_use = pos_words_adj
    elif(sentiment == 'negative'):
        dict_to_use = neg_words_adj
        
    words = tweet.split()
    words_len = len(words)
    subsets = [words[i:j+1] for i in range(words_len) for j in range(i,words_len)]
    
    score = 0
    selection_str = ''
    lst = sorted(subsets, key = len)
    
    
    for i in range(len(subsets)):
        
        new_sum = 0 
        
        for p in range(len(lst[i])):
            if(lst[i][p] in dict_to_use.keys()):
                new_sum += dict_to_use[lst[i][p].translate(str.maketrans('','',string.punctuation))]
            
        if(new_sum > score + tol):
            score = new_sum
            selection_str = lst[i]

    if(len(selection_str) == 0):
        selection_str = words
        
    return ' '.join(selection_str)

In [11]:
tol = 0.001

X_train['predicted_selection'] = ''

for index, row in X_train.iterrows():
    
    selected_text = calculate_selected_text(row, tol)
    
    X_train.loc[X_train['textID'] == row['textID'], ['predicted_selection']] = selected_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [12]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    if ((len(a) + len(b) - len(c)) == 0):
        print(str1)
        print(str2)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [13]:
X_train = X_train[X_train['selected_text'] != '']

In [14]:
X_train['jaccard'] = X_train[X_train['selected_text'] != ''].apply(lambda x: jaccard(x['selected_text'], x['predicted_selection']), axis = 1)

print('The jaccard score for the train set is:', np.mean(X_train['jaccard']))

The jaccard score for the train set is: 0.6697657991769344


In [15]:
tol = 0.001

X_train['predicted_selection'] = ''

for index, row in X_val.iterrows():
    
    selected_text = calculate_selected_text(row, tol)
    
    X_val.loc[X_val['textID'] == row['textID'], ['predicted_selection']] = selected_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [16]:
X_val['jaccard'] = X_val[X_val['selected_text'] != ''].apply(lambda x: jaccard(x['selected_text'], x['predicted_selection']), axis = 1)
print('The jaccard score for the test set is:', np.mean(X_val['jaccard']))

The jaccard score for the test set is: 0.6682965308314733


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
