# Feature Extraction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
train = pd.read_csv('./Twitter_Sentiment/train_E6oV3lV.csv')
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


### Finding the number of words in each tweet

In [3]:
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


### Finding the number of characters in each tweet

In [4]:
train['char_count'] = train['tweet'].str.len() ## Also includes spaces as characters
train[['tweet', 'char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


### Finding the average word length in each tweet

In [5]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet', 'avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


### Number of Stopwords

In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['tweet', 'stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [7]:
import nltk

### Finding the number of Special Characters

In [8]:
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet', 'hastags']].head()

Unnamed: 0,tweet,hastags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


### Finding the number of numerics

In [9]:
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet', 'numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


### Finding the number of uppercase words

In [10]:
train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet', 'upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


# Pre-processing

### transform tweets into lower case

In [11]:
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

### Removing punctuation

In [12]:
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

### Removal of Stop Words

In [13]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Common Word Removal

In [14]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
freq

user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64

In [15]:
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Rare Word Removal

In [16]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq

âcurves              1
auntoine             1
amazingits           1
customisedâ          1
putrajayaprihatin    1
alimallcineplex      1
yasssss              1
745                  1
womensissues         1
galaweek             1
dtype: int64

In [17]:
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Spelling Corrections

In [18]:
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

### Tokenization

In [19]:
TextBlob(train['tweet'][1]).words

WordList(['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

### Stemming

In [20]:
# Removing suffices like "ing", "ly", "s", etc.
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

### Lemmatization

In [21]:
# converts the word into its root word, rather than stripping suffices. better than stemming.
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

## Advanced Text Processing

### N-grams

In [22]:
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

### Term Frequency

In [23]:
# Term frequency is the ratio of the count of a word present in a sentence, to the length of the sentence.
# TF = (number of times term T appears in particular row) / (number of terms in that row)
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words', 'tf']
tf1

Unnamed: 0,words,tf
0,thanks,1
1,disapointed,1
2,pdx,1
3,offer,1
4,dont,1
5,getthanked,1
6,cause,1
7,use,1
8,credit,1
9,lyft,1


### Inverse Document Frequency

In [24]:
# The intuition behind inverse document frequency (IDF) is that a word is not of much use to us if it’s appearing in all the documents.
# IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present.
# the higher the idf value, the more unique the word is

for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,thanks,1,4.597751
1,disapointed,1,10.372303
2,pdx,1,8.762865
3,offer,1,6.522155
4,dont,1,3.745585
5,getthanked,1,9.679156
6,cause,1,5.690172
7,use,1,3.552287
8,credit,1,7.327781
9,lyft,1,8.762865


### Term Frequency - Inverse Document Frequency (TF - IDF)

In [25]:
# Basically TF-IDF is the multiplication of TF and IDF calculated in the previous two steps
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,thanks,1,4.597751,4.597751
1,disapointed,1,10.372303,10.372303
2,pdx,1,8.762865,8.762865
3,offer,1,6.522155,6.522155
4,dont,1,3.745585,3.745585
5,getthanked,1,9.679156,9.679156
6,cause,1,5.690172,5.690172
7,use,1,3.552287,3.552287
8,credit,1,7.327781,7.327781
9,lyft,1,8.762865,8.762865


### Bag of Words

In [26]:
# refers to the representation of text which describes the presence of words within the text data.
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1), analyzer = 'word')
train_bow = bow.fit_transform(train['tweet'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 128383 stored elements in Compressed Sparse Row format>

### Sentiment Analysis

In [27]:
# Checking the sentiment of the first 5 tweets
train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (-0.3, 0.5354166666666667)
1                    (0.2, 0.2)
2                    (0.0, 0.0)
3                    (0.0, 0.0)
4                    (0.0, 0.0)
Name: tweet, dtype: object

In [28]:
# The closer the value to 1 means a positive sentiment and close to -1 means a negative sentiment
train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0])
train[['tweet', 'sentiment']].head()

Unnamed: 0,tweet,sentiment
0,father dysfunctional selfish drag kid dysfunct...,-0.3
1,thanks lyft credit cant use cause dont offer w...,0.2
2,bihday majesty,0.0
3,model take urð ðððð ððð,0.0
4,factsguide society motivation,0.0
