# NLP - Preparing The Data/Feature Engineering

For convenience, the NLP Cleaning code is copied here:

In [None]:
# Reading in the data

import pandas as pd
# Load the training set into a DataFrame.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
train = pd.read_csv('/Users/helenabelloff/Desktop/NLP/train.csv')
# Not the last expression of the cell, so this preview is evaluated but never displayed.
train.head()

# Remove Hyperlinks

def hyperlink_remove(text):
    """Strip hyperlinks from a pandas Series of strings.

    Parameters
    ----------
    text : pandas.Series
        String column to clean.

    Returns
    -------
    pandas.Series
        New Series in which every match of the URL pattern is replaced by ''.
    """
    # Work on the argument rather than the global ``train`` frame so the
    # function cleans whatever Series it is given. ``regex=True`` is explicit
    # because newer pandas treats the pattern as a literal string by default,
    # and the raw string avoids the invalid '\S' escape.
    return text.str.replace(r'http\S+|http.\S+', '', case=False, regex=True)

# Store the result in its own column; the raw 'text' column is left as it was.
train['hyperlink_remove'] = hyperlink_remove(train['text'])

# Removing punctuation

import string
# Bare expression in the middle of a cell: evaluated, but nothing is displayed.
string.punctuation

def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` dropped."""
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

# Drop punctuation tweet by tweet, then strip every digit from the result

import string

train['text_no_punct'] = (
    train['hyperlink_remove']
    .apply(remove_punct)
    .str.replace(r"\d", "", regex=True)
)

# Text to lower

def text_lower(text):
    """Return the lower-cased form of ``text`` (delegates to ``text.lower()``)."""
    return text.lower()
train['text_no_punct_lower'] = train['text_no_punct'].apply(text_lower)

# Getting rid of accents etc.

import unicodedata
# NFKD normalisation first, then an ASCII encode that ignores anything it
# cannot represent, then back to str
normalized = train['text_no_punct_lower'].str.normalize('NFKD')
ascii_only = normalized.str.encode('ascii', errors='ignore')
train['text_no_punct_lower'] = ascii_only.str.decode('utf-8')

# Trying to standardize words

from itertools import product
import itertools

# Cap every run of one repeated character at two characters
train['text_no_punct_lower'] = train['text_no_punct_lower'].apply(
    lambda tweet: ''.join(''.join(run)[:2] for _char, run in itertools.groupby(tweet))
)

# Tokenizing

import re

def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The non-empty pieces between separators; ``[]`` for empty text.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    # re.split returns '' when the text starts or ends with a separator (or is
    # empty); those are not words, so they are filtered out.
    return [token for token in re.split(r'\W+', text) if token]

# Tokenize the cleaned text. The extra x.lower() repeats the lower-casing that
# text_lower already applied to this column.
train['tokenized_text'] = train['text_no_punct_lower'].apply(lambda x: tokenize(x.lower()))

# Hand-written stop-word set used by remove_stopwords and the masked word clouds.
# The final row adds extra tokens ('amp', 'wa', 'im', 'leh', ...) - presumably
# corpus-specific noise; TODO confirm. 'am' is listed twice, which is harmless in a set.
stopword = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 
            'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 
            'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 
            'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 
            'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 
            'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 
            'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 
            'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 
            'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 
            'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 
            'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 
            'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 
            'was', 'here', 'than', 'amp', 'wa', 'am', 'pm', 'im', 'leh', 'ind', 'inciweb', 'ina'}

def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not in the module-level ``stopword`` set."""
    kept = []
    for word in tokenized_list:
        if word in stopword:
            continue
        kept.append(word)
    return kept

train['tokenized_nostopwords'] = train['tokenized_text'].apply(remove_stopwords)

# Stemming: reduce each token to its stem (e.g. strip endings like "ing" and "ly")

import nltk

port = nltk.PorterStemmer()

def stemming(tokenized_text):
    """Return a list with ``port.stem`` applied to every token of ``tokenized_text``."""
    return list(map(port.stem, tokenized_text))

train['tokenized_destemmed'] = train['tokenized_nostopwords'].apply(stemming)

# Lemmatizing

import nltk

lemmatizer = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    """Return a list with ``lemmatizer.lemmatize`` applied to every token of ``tokenized_text``."""
    return list(map(lemmatizer.lemmatize, tokenized_text))

# Lemmatize the stop-word-free tokens (the stemmed column is not used here)
train['token_lemmatized'] = train['tokenized_nostopwords'].apply(lemmatizing)

# Joining Text

def join_text(text):
    """Concatenate the items of ``text`` into one string, separated by single spaces."""
    return ' '.join(text)
# Replace each token list by a single space-separated string
train['token_lemmatized'] = train['token_lemmatized'].apply(join_text)



**Preparing the Data**

In [None]:
# Define body

# Model input: the lemmatized tweets, already joined back into one string per row
x = train['token_lemmatized']
print(x.shape)

In [None]:
# Define labels

# Labels: the 'target' column (the plots below label 1 as real and 0 as fake disaster tweets)
y = train['target']
print(y.shape)

In [None]:
# Bag of Words using Tfidf

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import ShuffleSplit
import pandas as pd
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
import numpy as np


def _vocabulary_terms(vectorizer):
    """Return the fitted vocabulary in column order on old and new scikit-learn."""
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and fall back only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


cv = CountVectorizer()

# Raw term counts: one row per tweet, one column per vocabulary term
word_count = cv.fit_transform(x)

# Now, compute IDF values
tfidf_trans = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_trans.fit(word_count)

# IDF weight of every term, sorted ascending (the sorted frame is kept;
# previously the result of sort_values was thrown away)
idf_df = pd.DataFrame(tfidf_trans.idf_, index = _vocabulary_terms(cv), columns = ["idf_weights"])
idf_df = idf_df.sort_values(by = ['idf_weights'])

count_vector = cv.transform(x)

# tf-idf scores
# Computing the tf * idf  multiplication where term frequency is weighted by its IDF values
tfidf_vector = tfidf_trans.transform(count_vector)

tfidf = TfidfVectorizer()

X1 = cv.fit_transform(x)
X = tfidf.fit_transform(x)
print(X.shape)

# Take the column labels from the vectorizer that actually produced X, so the
# names and the matrix columns cannot drift apart.
feature_names = _vocabulary_terms(tfidf)

# NOTE(review): toarray() materialises the whole sparse matrix as dense floats.
train_df = pd.DataFrame(columns = feature_names, data = X.toarray())
print(train_df.shape)
print(train_df.head(10))

# Splitting the data into train and test (validation)
# NOTE(review): the vectorizer was fitted on every row before this split, so
# the validation rows contributed to the vocabulary and IDF weights.
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42)

In [None]:
# Creating Final Data Frame

train = train.reset_index(drop = True)

# Raw text, label and the intermediate cleaning columns are left out of the
# model frame. They are NOT dropped from `train` itself, because the
# feature-engineering, histogram and word-cloud cells further down still read
# train['text'], train['target'] and train['text_no_punct']; dropping them in
# place made those cells fail with KeyError and made this cell fail on re-run.
helper_columns = ['target','text', 'text_no_punct', 'tokenized_text', 'hyperlink_remove', 'tokenized_nostopwords',
                  'tokenized_destemmed', 'token_lemmatized', 'text_no_punct_lower']

# Remaining metadata columns, with missing keywords filled as 'NA'
metadata = train.drop(columns = helper_columns).assign(
    keyword = lambda frame: frame['keyword'].fillna('NA')
)

# NOTE(review): a vocabulary term spelled like a metadata column would yield a
# duplicated column name in the concatenated frame.
train_final = pd.concat([metadata, train_df], axis = 1)
print(train_final.shape)
train_final.head(10)

In [None]:
# Missing keywords become the literal string 'NA'
train['keyword'] = train['keyword'].fillna('NA')

**Feature Engineering**

In [None]:
# Let's create a feature for the length of the tweet and the percentage of punctuation in the tweet

import string

# Tweet length = number of characters that are not a plain space
train['tweet_length'] = train['text'].apply(lambda tweet: len(tweet.replace(" ", "")))
train.head(10)

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Parameters
    ----------
    text : str
        Text to measure.

    Returns
    -------
    float
        ``100 * punctuation / non-space characters``, with the ratio rounded to
        three decimals before scaling; ``0.0`` when ``text`` has no non-space
        characters.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Empty or all-space text would otherwise raise ZeroDivisionError.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count/non_space, 3)*100

# Punctuation share of every raw tweet, in percent
train['tweet_punct_percent'] = train['text'].apply(count_punct)
train.head(10)

In [None]:
# Now let's plot these features

import numpy as np
import matplotlib.pyplot as plt

# 40 bin edges between 0 and 200 characters
bins = np.linspace(0, 200, 40)

# density=True normalises each histogram to unit area; it replaces the
# `normed` keyword, which current matplotlib releases no longer accept.
plt.hist(train[train['target']==1]['tweet_length'], bins, alpha = 0.5, density = True, label = 'Real Disaster Tweet')
plt.hist(train[train['target']==0]['tweet_length'], bins, alpha = 0.5, density = True, label = 'Fake Disaster Tweet')
plt.legend(loc = 'upper left')
plt.ylabel('Frequency')
plt.xlabel('Tweet Length')
plt.suptitle('Histogram of Tweet Length')
# Call show(); the bare attribute reference `plt.show` never rendered anything
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 40 bin edges between 0 % and 50 % punctuation
bins = np.linspace(0, 50, 40)

# density=True normalises each histogram to unit area; it replaces the
# `normed` keyword, which current matplotlib releases no longer accept.
plt.hist(train[train['target']==1]['tweet_punct_percent'], bins, alpha = 0.5, density = True, label = 'Real Disaster Tweet')
plt.hist(train[train['target']==0]['tweet_punct_percent'], bins, alpha = 0.5, density = True, label = 'Fake Disaster Tweet')
plt.legend(loc = 'upper right')
plt.ylabel('Frequency')
plt.xlabel('% of Punctuation in Tweet')
plt.suptitle('Histogram of the Percent of Punctuation in Tweet')
# Call show(); the bare attribute reference `plt.show` never rendered anything
plt.show()

**Word Clouds**

In [None]:
#git clone https://github.com/amueller/word_cloud.git
#cd word_cloud
#pip install .

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline

# Combining all texts for the Real Disaster Tweets only
text = " ".join(review for review in train[train['target']==1]['text_no_punct'])
print ("There are {} words in real Disaster Tweets.".format(len(train[train['target']==1]['text_no_punct'])))

wordcloud = WordCloud(background_color="white").generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline

# Combining all texts for the Fake Disaster Tweets only
text = " ".join(review for review in train[train['target']==0]['text_no_punct'])
print ("There are {} words in Fake Disaster Tweets.".format(len(train[train['target']==0]['text_no_punct'])))

wordcloud = WordCloud(background_color="white").generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Let's make these in the shape of the twitter logo !

# Will be all 0's
# Load the logo image as a NumPy array of pixel values.
# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
twitter_mask = np.array(Image.open("/Users/helenabelloff/Desktop/Twitter.png"))
# Last expression of the cell, so Jupyter displays the array
twitter_mask

In [None]:
def transform_twitter(val):
    """Map a pixel value of 0 to 255 and return any other value unchanged."""
    return 255 if val == 0 else val

In [None]:
# Alias fixed: the original line read `import numpy as no`, while the code uses `np`
import numpy as np

# Array-wide equivalent of transform_twitter: 0 -> 255, every other value kept.
# np.where builds a fully initialised result in one step (np.ndarray(...) starts
# from uninitialised memory) and also copes with multi-channel images, where
# the per-row map() raised on `val == 0` for an array-valued pixel.
transformed_twitter_mask = np.where(twitter_mask == 0, 255, twitter_mask).astype(np.int32)

In [None]:
# Check that it's now all 255's

# Last expression of the cell, so Jupyter displays the transformed array for inspection
transformed_twitter_mask

In [None]:
# Now we redo for our shape!

import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline

# Combining all texts for the Real Disaster Tweets only
text = " ".join(review for review in train[train['target']==1]['text_no_punct'])
print ("There are {} words in Real Disaster Tweets.".format(len(train[train['target']==1]['text_no_punct'])))

wordcloud = WordCloud(stopwords = stopword, background_color = "white", mask = transformed_twitter_mask).generate(text)

plt.figure(figsize=[20,10])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline

# Combining all texts for the Fake Disaster Tweets only
text = " ".join(review for review in train[train['target']==0]['text_no_punct'])
print ("There are {} words in Fake Disaster Tweets.".format(len(train[train['target']==0]['text_no_punct'])))

wordcloud = WordCloud(stopwords = stopword, background_color = "white", mask = transformed_twitter_mask).generate(text)

plt.figure(figsize=[20,10])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()