# NLP Bootcamp Graded Datathon

## Train Data

In [1]:
import pandas as pd
import numpy as np
#Import svm model
from sklearn import svm

In [2]:
### Data
df=pd.read_csv('Train_Dataset.csv')
df_test_final=pd.read_csv('Test_Dataset.csv')

In [3]:
df_sample=df.copy()

In [4]:
from sklearn.model_selection import train_test_split
df_train, df_test, y_train, y_test = train_test_split(df_sample['headline'], df_sample['is_sarcastic'],random_state=42)

In [5]:
### Funcion que hara el pre-processing
import nltk
import contractions
import re

### Usamos la dada

# remove some stopwords to capture negation in n-grams if possible
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
stop_words.remove('but')

# load up a simple porter stemmer - nothing fancy
ps = nltk.porter.PorterStemmer()

def simple_text_preprocessor(document): 
    # lower case
    document = str(document).lower()
    
    # expand contractions
    document = contractions.fix(document)
    
    # remove unnecessary characters
    document = re.sub(r'[^a-zA-Z]',r' ', document)
    document = re.sub(r'nbsp', r'', document)
    document = re.sub(' +', ' ', document)
    
    # simple porter stemming
    document = ' '.join([ps.stem(word) for word in document.split()])
    
    # stopwords removal
    document = ' '.join([word for word in document.split() if word not in stop_words])
    
    return document

stp = np.vectorize(simple_text_preprocessor)

In [6]:
df_train=pd.DataFrame(df_train)
df_test=pd.DataFrame(df_test)
## Tokenizamos y medimos sentimiento
import string
df_train['char_count'] = df_train['headline'].apply(len)
df_train['word_count'] = df_train['headline'].apply(lambda x: len(x.split()))
df_train['word_density'] = df_train['char_count'] / (df_train['word_count']+1)
df_train['punctuation_count'] = df_train['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df_train['title_word_count'] = df_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df_train['upper_case_word_count'] = df_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))


## Test

df_test['char_count'] = df_test['headline'].apply(len)
df_test['word_count'] = df_test['headline'].apply(lambda x: len(x.split()))
df_test['word_density'] = df_test['char_count'] / (df_test['word_count']+1)
df_test['punctuation_count'] = df_test['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df_test['title_word_count'] = df_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df_test['upper_case_word_count'] = df_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [7]:
## Incorporamos
df_train['clean_headline']=stp(df_train['headline'].values)
df_train.head()
## Incorporamos
df_test['clean_headline']=stp(df_test['headline'].values)
df_test.head()

Unnamed: 0,headline,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,clean_headline
12782,north dakota not heard from in 48 hours,39,8,4.333333,0,0,0,north dakota not heard hour
42915,report: it going to take way more than an inco...,106,19,5.3,1,0,0,report go take way inconceiv act violenc count...
33043,states' rights rancher ryan bundy to run for n...,60,10,5.454545,1,0,0,state right rancher ryan bundi run nevada gove...
1121,watching thousands march in his honor unlocks ...,85,13,6.071429,2,0,0,watch thousand march hi honor unlock deeper da...
38782,"during the debate, these two did the unthinkab...",71,12,5.461538,1,0,0,dure debat two unthink unit countri


In [8]:
### textblob
import textblob
### Agregamos polarizacion
x_train_snt_obj = df_train['clean_headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
df_train['Polarity'] = [obj.polarity for obj in x_train_snt_obj.values]
df_train['Subjectivity'] = [obj.subjectivity for obj in x_train_snt_obj.values]

x_test_snt_obj = df_test['clean_headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
df_test['Polarity'] = [obj.polarity for obj in x_test_snt_obj.values]
df_test['Subjectivity'] = [obj.subjectivity for obj in x_test_snt_obj.values]

In [9]:
import nltk
from gensim.models import word2vec

from gensim.models.fasttext import FastText

# Set values for various parameters
feature_size =40  # Word vector dimensionality  
window_context = 20  # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3        # Downsample setting for frequent words
sg = 1               # skip-gram model
tokenized_corpus_train = [nltk.word_tokenize(doc) for doc in df_train.clean_headline]
tokenized_corpus_test = [nltk.word_tokenize(doc) for doc in df_test.clean_headline]
print('es aqui')
ft_model_train = FastText(tokenized_corpus_train, vector_size=feature_size, 
                     window=window_context, min_count = min_word_count,
                     sg=sg, sample=sample, epochs=1000)
print('llego')
ft_model_test = FastText(tokenized_corpus_test, vector_size=feature_size, 
                     window=window_context, min_count = min_word_count,
                     sg=sg, sample=sample, epochs=1000)
ft_model_test

def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key )
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# get document level embeddings
ft_doc_features_train = averaged_word_vectorizer(corpus=tokenized_corpus_train, model=ft_model_train,
                                             num_features=feature_size)
ft_doc_features_test = averaged_word_vectorizer(corpus=tokenized_corpus_test, model=ft_model_test,
                                             num_features=feature_size)
pd.DataFrame(ft_doc_features_train)

es aqui
llego


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.303810,0.774554,-0.673557,-0.268684,-0.260771,0.066795,0.172779,-0.403031,0.408120,0.226385,...,-0.231208,-0.388830,-0.059136,0.295165,0.101622,0.135062,0.355925,0.145863,0.089161,-0.193855
1,0.044506,0.293990,-0.488525,-0.092189,-0.205447,0.203497,0.295079,-0.203258,0.111602,0.042084,...,0.045776,-0.155665,0.240640,0.177495,0.147358,-0.313848,-0.032647,-0.324034,-0.253415,0.042611
2,-0.655577,0.168032,-1.016943,-0.021515,-0.078975,-0.326722,0.105642,-0.360011,0.084657,-0.277446,...,-0.133649,0.288467,0.378292,0.163612,-0.320457,0.301191,-0.567193,0.421744,0.052206,-0.327750
3,-0.237299,1.083139,-0.817316,0.221413,-0.772712,-0.274603,-0.158052,-0.393026,-0.237888,-0.431125,...,-0.120174,-0.102359,0.137160,-0.616942,-0.298536,-1.019961,-0.418451,-0.285799,0.009022,-0.156521
4,0.015390,-0.026583,-0.642687,-0.007902,-0.234826,-0.231822,-0.292112,-0.082536,0.147594,0.041181,...,-0.023186,0.054781,0.116169,0.133509,-0.127743,0.074932,-0.102889,-0.311851,-0.086333,-0.034309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33191,-0.528557,0.055446,-0.511212,-0.099616,-0.415503,-0.467024,0.317173,-0.806385,0.248347,-0.324204,...,0.257286,-0.155096,0.151183,0.623935,-0.903026,0.988929,-0.605757,0.098954,0.221164,-0.674599
33192,-0.465899,0.216865,-1.110556,0.454970,-0.016452,0.106102,0.675830,-0.342329,0.184576,-0.214454,...,-0.065440,0.452247,0.540340,-0.089920,0.150541,0.441495,-0.046136,0.021386,0.946875,-0.043862
33193,-0.313827,0.272641,-0.463410,-0.125754,-0.501690,-0.360684,0.383413,-0.404789,-0.313446,-0.120449,...,0.174827,0.002461,0.131856,0.422402,-0.740867,-0.422372,-0.188971,0.102957,0.146369,-0.117667
33194,-0.327128,0.200882,-0.033271,-0.310661,-0.472285,0.071983,0.132854,-0.363859,0.036887,-0.286393,...,-0.077401,-0.002415,0.465520,0.070317,0.089489,-0.036740,-0.211541,0.014745,-0.327120,-0.198710


In [10]:
df_train.loc[:,pd.DataFrame(ft_doc_features_train).columns] = ft_doc_features_train
df_test.loc[:,pd.DataFrame(ft_doc_features_test).columns] = ft_doc_features_test


In [11]:
#### Train model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=42, solver='liblinear')


In [12]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(df_train.drop(['clean_headline','headline'], axis=1).fillna(0), y_train)

In [13]:
from sklearn.metrics import confusion_matrix, classification_report

In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=15, random_state=0)
clf.fit(df_train.drop(['clean_headline','headline'], axis=1).fillna(0), y_train)

RandomForestClassifier(max_depth=15, random_state=0)

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda=LinearDiscriminantAnalysis()
lda.fit(df_train.drop(['clean_headline','headline'], axis=1).fillna(0), y_train)


LinearDiscriminantAnalysis()

In [16]:
#Create a svm Classifier
svm_clf = svm.SVC(kernel='linear') # Linear Kernel
svm_clf.fit(df_train.drop(['clean_headline','headline'], axis=1).fillna(0), y_train)

SVC(kernel='linear')

In [17]:
lr.fit(df_train.drop(['clean_headline','headline'], axis=1).fillna(0), y_train)
predictions = lr.predict(df_test.drop(['clean_headline','headline'], axis=1))
predictions2 = clf.predict(df_test.drop(['clean_headline','headline'], axis=1))
predictions3 = lda.predict(df_test.drop(['clean_headline','headline'], axis=1))
predictions4 = svm_clf.predict(df_test.drop(['clean_headline','headline'], axis=1))
predictions5 = gb.predict(df_test.drop(['clean_headline','headline'], axis=1))
print('Logistic: \n',classification_report(y_test, predictions))
print('Random Forest: \n',classification_report(y_test, predictions2))
print('LDA: \n',classification_report(y_test, predictions3))
print('SVM: \n',classification_report(y_test, predictions4))
print('GB: \n',classification_report(y_test, predictions5))

Logistic: 
               precision    recall  f1-score   support

           0       0.66      0.73      0.69      5947
           1       0.64      0.56      0.60      5119

    accuracy                           0.65     11066
   macro avg       0.65      0.64      0.64     11066
weighted avg       0.65      0.65      0.65     11066

Random Forest: 
               precision    recall  f1-score   support

           0       0.66      0.70      0.68      5947
           1       0.63      0.57      0.60      5119

    accuracy                           0.64     11066
   macro avg       0.64      0.64      0.64     11066
weighted avg       0.64      0.64      0.64     11066

LDA: 
               precision    recall  f1-score   support

           0       0.66      0.72      0.69      5947
           1       0.64      0.57      0.60      5119

    accuracy                           0.65     11066
   macro avg       0.65      0.64      0.65     11066
weighted avg       0.65      0.65     

In [18]:
## Neural Network
X=df_train.drop(['clean_headline','headline'], axis=1).fillna(0).to_numpy()
Y= y_train.to_numpy()
# modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# for modeling
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# build a model
model = Sequential()
model.add(Dense(16, input_shape=(X.shape[1],), activation='relu')) # Add an input shape! (features,)
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 

# compile the model
model.compile(optimizer='Adam', 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# early stopping callback
# This callback will stop the training when there is no improvement in  
# the validation loss for 10 consecutive epochs.  
es = EarlyStopping(monitor='val_accuracy', 
                                   mode='max', # don't minimize the accuracy!
                                   patience=10,
                                   restore_best_weights=True)

# now we just update our model fit call
history = model.fit(X,
                    Y,
                    callbacks=[es],
                    epochs=1000, # you can set this to a big number!
                    batch_size=10,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

X_test=df_test.drop(['clean_headline','headline'],axis=1).to_numpy()
# see how these are numbers between 0 and 1? 
model.predict(X_test) # prob of successes (survival)



# so we need to round to a whole number (0 or 1),
# or the confusion matrix won't work!
preds = np.round(model.predict(X),0)

# confusion matrix
print(confusion_matrix(Y, preds)) # order matters! (actual, predicted)


print(classification_report(Y, preds))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                784       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,073
Trainable params: 1,073
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch

## Test Data

In [19]:
### Paso 1

### Funcion que hara el pre-processing
import nltk
import contractions
import re

### Usamos la dada

# remove some stopwords to capture negation in n-grams if possible
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
stop_words.remove('but')

# load up a simple porter stemmer - nothing fancy
ps = nltk.porter.PorterStemmer()

def simple_text_preprocessor(document): 
    # lower case
    document = str(document).lower()
    
    # expand contractions
    document = contractions.fix(document)
    
    # remove unnecessary characters
    document = re.sub(r'[^a-zA-Z]',r' ', document)
    document = re.sub(r'nbsp', r'', document)
    document = re.sub(' +', ' ', document)
    
    # simple porter stemming
    document = ' '.join([ps.stem(word) for word in document.split()])
    
    # stopwords removal
    document = ' '.join([word for word in document.split() if word not in stop_words])
    
    return document

stp = np.vectorize(simple_text_preprocessor)

## Paso2

df_train=pd.DataFrame(df) ### Estrenamos todo el trainning set 
df_test=pd.DataFrame(df_test_final)
## Tokenizamos y medimos sentimiento
import string
df_train['char_count'] = df_train['headline'].apply(len)
df_train['word_count'] = df_train['headline'].apply(lambda x: len(x.split()))
df_train['word_density'] = df_train['char_count'] / (df_train['word_count']+1)
df_train['punctuation_count'] = df_train['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df_train['title_word_count'] = df_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df_train['upper_case_word_count'] = df_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))


## Test

df_test['char_count'] = df_test['headline'].apply(len)
df_test['word_count'] = df_test['headline'].apply(lambda x: len(x.split()))
df_test['word_density'] = df_test['char_count'] / (df_test['word_count']+1)
df_test['punctuation_count'] = df_test['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df_test['title_word_count'] = df_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df_test['upper_case_word_count'] = df_test['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

## Incorporamos
df_train['clean_headline']=stp(df_train['headline'].values)
df_train.head()
## Incorporamos
df_test['clean_headline']=stp(df_test['headline'].values)
df_test.head()

## Incorporamos
df_train['clean_headline']=stp(df_train['headline'].values)
df_train.head()
## Incorporamos
df_test['clean_headline']=stp(df_test['headline'].values)
df_test.head()

df_train.loc[:,pd.DataFrame(ft_doc_features_train).columns] = ft_doc_features_train
df_test.loc[:,pd.DataFrame(ft_doc_features_test).columns] = ft_doc_features_test

ValueError: Length of values (33196) does not match length of index (44262)

In [None]:
## resultado

## Neural Network
X=df_train.drop(['clean_headline','headline'], axis=1).fillna(0).to_numpy()
Y= y_train.to_numpy()
# modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# for modeling
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# build a model
model = Sequential()
model.add(Dense(16, input_shape=(X.shape[1],), activation='relu')) # Add an input shape! (features,)
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 

# compile the model
model.compile(optimizer='Adam', 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# early stopping callback
# This callback will stop the training when there is no improvement in  
# the validation loss for 10 consecutive epochs.  
es = EarlyStopping(monitor='val_accuracy', 
                                   mode='max', # don't minimize the accuracy!
                                   patience=10,
                                   restore_best_weights=True)

# now we just update our model fit call
history = model.fit(X,
                    Y,
                    callbacks=[es],
                    epochs=1000, # you can set this to a big number!
                    batch_size=10,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

X_test=df_test.drop(['clean_headline','headline'],axis=1).to_numpy()
# see how these are numbers between 0 and 1? 
model.predict(X_test) # prob of successes (survival)



# so we need to round to a whole number (0 or 1),
# or the confusion matrix won't work!
preds = np.round(model.predict(X),0)

# confusion matrix
print(confusion_matrix(Y, preds)) # order matters! (actual, predicted)


print(classification_report(Y, preds))


In [None]:
df_test_final['predicted']=preds