# A Visual for before and after text cleaning

using unigram, bigram and trigram
- This is used for 15 k training set data

# 1)- Import key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings('ignore')

In [2]:
#https://textblob.readthedocs.io/en/dev/install.html
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

In [3]:
# for noise removal

import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hassan.sherwani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
import re    # for regular expressions 
import nltk  # for text manipulation 
#For other text data
from collections import Counter
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm
nlp = spacy.load('en_core_web_sm')

import string 
import numpy as np 
import pickle
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [5]:
# for dynamic visuals
import plotly as py
import cufflinks as cf

In [6]:
# for offline mode in notebook
py.offline.init_notebook_mode(connected=True)
cf.go_offline()

In [7]:
%reload_ext version_information
%version_information pandas,numpy, nltk, seaborn, matplotlib

Software,Version
Python,3.7.7 64bit [MSC v.1916 64 bit (AMD64)]
IPython,7.13.0
OS,Windows 10 10.0.17763 SP0
pandas,1.0.3
numpy,1.18.1
nltk,3.5
seaborn,0.10.1
matplotlib,3.1.3
Thu Jun 25 19:37:54 2020 W. Europe Daylight Time,Thu Jun 25 19:37:54 2020 W. Europe Daylight Time


# 2)- Load Data

In [8]:
import pickle
#Loading from firm cleaned file
data = pd.read_pickle('model.pkl')
data.shape

(15000, 2)

In [9]:
data.head()

Unnamed: 0,TEXT,Sent_type
143880,"""Over the years, the MLK estate has also licen...",Benefit
87455,Porsche celebrates its racing history with the...,Benefit
103986,"Hmm, could that be the Mercedes? It made a ver...",Risky
261437,The Full Carbon Audi RS6 is simply Amazing pi...,Benefit
205444,Vijay Mayadas explores #blockchain potential i...,Benefit


In [10]:
data.Sent_type.value_counts()

Neutral    6513
Benefit    5912
Risky      2575
Name: Sent_type, dtype: int64

# 3)- Data clean

### 3.1.make patterns

In [11]:
# STOPWORDS

##Creating a list of stop words and adding custom stopwords
stop_words = set(stopwords.words("english"))
##Creating a list of custom stopwords
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)

### 3.2.make function

In [12]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http', 'https','r8','pictwittercom','ifttt','1pictwittercom']) # extend stopwords; rt means re-tweet
STOPWORDS = set(STOPWORDS)

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([word for word in text.split() if word not in STOPWORDS]) # delete stopwords from text
    text = text.strip()
    return text

### 3.3. apply to main text

In [13]:
data['clean']=[text_prepare(x) for x in data['TEXT']]

### 3.4. Before and after cleaning

In [14]:
data['TEXT'].apply(lambda x: len(x.split(' '))).sum()

274652

In [15]:
data['clean'].apply(lambda x: len(x.split(' '))).sum()

202813

In [16]:
# difference 
274652-202866

71786

### 3.1)- characteristic terms and their associations

In [17]:
corpus = st.CorpusFromPandas(data, category_col='Sent_type', text_col='clean', nlp=nlp).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))

['selfdriving', 'bitly', 'mercedesbenz', 'buffly', 'owly', 'dlvrit', 'fbme', 'youtube', 'twittercom', 'wwwinstagramcom']


Above are the terms that differentiate the text messages from a general English corpus

### 3.2)- Association of Benefit Sentiment with corpus

In [18]:
data.Sent_type.unique()

array(['Benefit', 'Risky', 'Neutral'], dtype=object)

In [19]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['benefit_score'] = corpus.get_scaled_f_scores('Benefit')
term_freq_df.sort_values(by='benefit_score', ascending=False)[:10]

Unnamed: 0_level_0,Benefit freq,Risky freq,Neutral freq,benefit_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
winner,49,0,0,1.0
good morning,35,0,0,1.0
congratulations,33,0,0,0.999999
sharing,32,0,0,0.999998
exciting,25,0,0,0.999817
intelligence,23,0,0,0.999473
champion,21,0,0,0.9986
truly,21,0,0,0.9986
beautiful,74,1,0,0.998491
amazing,64,1,0,0.998254


In [20]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['benefit_score'] = corpus.get_scaled_f_scores('Benefit')
pprint(list(term_freq_df.sort_values(by='benefit_score', ascending=False).index[:10]))

['winner',
 'good morning',
 'congratulations',
 'sharing',
 'exciting',
 'intelligence',
 'champion',
 'truly',
 'beautiful',
 'amazing']


**Above are the terms in cleane text that are most associated with the Benefit sentiment**

### 3.3)- Association of Neutral Sentiment with corpus

In [21]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['neutral_score'] = corpus.get_scaled_f_scores('Neutral')
term_freq_df.sort_values(by='neutral_score', ascending=False)[:10]

Unnamed: 0_level_0,Benefit freq,Risky freq,Neutral freq,neutral_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
auctions,2,0,30,1.0
live bat,2,0,29,0.999749
bat auctions,2,0,29,0.999749
bringatrailercom,3,1,38,0.996104
bringatrailercom listing,3,1,38,0.996104
bat,2,2,29,0.992633
corvette,5,0,32,0.990721
biilionaires status,1,1,19,0.990644
biilionaires,1,1,19,0.990644
chevrolet corvette,1,0,17,0.989331


In [22]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['neutral_score'] = corpus.get_scaled_f_scores('Neutral')
pprint(list(term_freq_df.sort_values(by='neutral_score', ascending=False).index[:10]))

['auctions',
 'live bat',
 'bat auctions',
 'bringatrailercom',
 'bringatrailercom listing',
 'bat',
 'corvette',
 'biilionaires status',
 'biilionaires',
 'chevrolet corvette']


### 3.4)- Association of Risk Sentiment with corpus

In [23]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['risky_score'] = corpus.get_scaled_f_scores('Risky')
term_freq_df.sort_values(by='risky_score', ascending=False)[:10]

Unnamed: 0_level_0,Benefit freq,Risky freq,Neutral freq,risky_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fatal,0,26,0,1.0
kills,0,15,0,0.999996
crisis,0,14,0,0.999984
killing,0,14,0,0.999984
fatal crash,0,13,0,0.999942
fired,0,12,0,0.99981
struck,0,12,0,0.99981
emissions cheating,0,12,0,0.99981
blame,0,12,0,0.99981
injured,0,11,0,0.999426


In [24]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['risky_score'] = corpus.get_scaled_f_scores('Risky')
pprint(list(term_freq_df.sort_values(by='risky_score', ascending=False).index[:10]))

['fatal',
 'kills',
 'crisis',
 'killing',
 'fatal crash',
 'fired',
 'struck',
 'emissions cheating',
 'blame',
 'injured']


# 4)-Visualize text data before cleanup

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

### 4.a.plot unigram using plotly

In [26]:
def freq_unigram(x,n):
        vec = CountVectorizer(). fit(x)
        bow = vec.transform(x)
        sum_words = bow.sum(axis=0)
        words_freq = [(word , sum_words[0 , idx]) for word , idx in vec.vocabulary_.items()]
        words_freq = sorted (words_freq , key=lambda x:x[1], reverse=True)
        return words_freq[:n]

In [27]:
freqwords=freq_unigram(data['TEXT'], 20)
df_unigram=pd.DataFrame(freqwords, columns=['freq_unigrams','count'])
df_unigram= df_unigram.set_index('freq_unigrams')
df_unigram

Unnamed: 0_level_0,count
freq_unigrams,Unnamed: 1_level_1
com,11054
the,9561
twitter,8663
to,5751
pic,5680
http,5191
in,4807
driving,4204
self,4007
and,3873


In [28]:
# ploting 
df_unigram.iplot(kind='bar',xTitle='Number of Unigram', yTitle='Count',
                title='Top 20 unigram words before data cleaning')

### 4.b. plot using Bigram

In [29]:
 def freq_bigram(x,n):
        vec = CountVectorizer(ngram_range=(2,2)). fit(x)
        bow = vec.transform(x)
        sum_words = bow.sum(axis=0)
        words_freq = [(word , sum_words[0 , idx]) for word , idx in vec.vocabulary_.items()]
        words_freq = sorted (words_freq , key=lambda x:x[1], reverse=True)
        return words_freq[:n]

In [30]:
freq_bigram=freq_bigram(data['TEXT'], 20)
df_bigram=pd.DataFrame(freq_bigram, columns=['freq_bigrams','count'])
df_bigram= df_bigram.set_index('freq_bigrams')
df_bigram

Unnamed: 0_level_0,count
freq_bigrams,Unnamed: 1_level_1
twitter com,8425
pic twitter,5653
self driving,3638
driving cars,1455
general motors,1211
bit ly,983
http bit,929
http www,901
in the,891
https www,834


In [31]:
# ploting 
df_bigram.iplot(kind='bar',xTitle='Number of bigram', yTitle='Count',
                title='Top 20 bigram words before data cleaning')

#### 4.c. plot using Trigram

In [32]:
 def freq_trigram(x,n):
        vec = CountVectorizer(ngram_range=(3,3)). fit(x)
        bow = vec.transform(x)
        sum_words = bow.sum(axis=0)
        words_freq = [(word , sum_words[0 , idx]) for word , idx in vec.vocabulary_.items()]
        words_freq = sorted (words_freq , key=lambda x:x[1], reverse=True)
        return words_freq[:n]

In [33]:
freq_trigram_wrds=freq_trigram(data['TEXT'], 20)
df_trigram=pd.DataFrame(freq_trigram_wrds, columns=['freq_trigrams','count'])
df_trigram= df_trigram.set_index('freq_trigrams')
df_trigram

Unnamed: 0_level_0,count
freq_trigrams,Unnamed: 1_level_1
pic twitter com,5653
self driving cars,1453
http bit ly,929
self driving car,799
http ow ly,310
for self driving,223
http buff ly,209
of self driving,193
http dlvr it,179
driving cars http,164


In [34]:
# ploting 
df_trigram.iplot(kind='bar',xTitle='Number of trigram', yTitle='Count',
                title='Top 20 trigram words before data cleaning')

# 5)- Visualization after cleanup

### 5.a.Unigram Plot

In [35]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(data['clean'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['Unigram' , 'count'])
df2.groupby('Unigram').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',xTitle='Number of Unigrams', yTitle='count', title='Top 20 Unigram words after data cleaning')

selfdriving 2738
bmw 2368
porsche 2179
cars 1896
car 1756
motors 1693
mercedes 1606
audi 1459
new 1150
general 1123
bitly 983
driving 970
tesla 773
vw 758
benz 664
self 643
ford 554
volkswagen 539
news 527
mercedesbenz 518


### 5.b.Plot Bigram

In [36]:
#create clean2 for bigram
def preprocess(Text):
    Text = Text.str.replace("(selfdriving cars)", "selfdriving vehicles")
    Text = Text.str.replace("(selfdriving car)", "selfdriving vehicles")
    Text = Text.str.replace("(self driving)", "selfdriving vehicles")
    Text = Text.str.replace("(vehicles cars)", " ")
    Text = Text.str.replace("(vehicles car)", " ")
    return Text
data['clean2'] = preprocess(data['clean'])

In [38]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(data['clean2'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns = ['Bigram' , 'count'])
df3.groupby('Bigram').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar',xTitle='Number of Bigrams', yTitle='count', title='Top 20 Bigram words after data cleaning')

selfdriving vehicles 1862
general motors 1062
tesla motors 363
porsche 911 230
mercedes benz 199
ford motor 145
motor company 88
new bmw 83
bmw i8 82
bmw series 75
elon musk 72
brand new 66
bmw m3 59
mercedesbenz stadium 54
electric cars 51
selfdriving trucks 49
googles selfdriving 47
new porsche 47
bmw m4 47
new audi 47


### 5.c.Plot Trigrams

In [39]:
#create clean3 for trigram
def preprocess(Text):
    Text = Text.str.replace("(self driving cars)", "self driving car")
    Text = Text.str.replace("(general motors gm)", "general motors company") 
    return Text
data['clean3'] = preprocess(data['clean'])

In [40]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(data['clean3'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns = ['trigrams' , 'count'])
df5.groupby('trigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', xTitle='Number of Trigrams', yTitle='count',title='Top 20 Trigram words after data cleaning')

self driving car 295
ford motor company 80
general motors company 73
porsche 911 turbo 35
googles selfdriving car 32
porsche 911 gt3 32
mercedes benz stadium 31
live bat auctions 31
tesla motors tsla 29
selfdriving cars bitly 28
news reportphp id 25
testing selfdriving cars 25
google selfdriving car 25
wwwautosportcom news reportphp 23
general motors ford 22
wwwbloombergcom news articles 22
twittercom biilionaires status 21
selfdriving car project 20
motors company gm 19
future selfdriving cars 19


# END of NOTEBOOK