## Basic Feature extraction

In [1]:
import pandas as pd
import numpy as np

In [2]:
# import data
train = pd.read_csv("twitter_sentiment/train_E6oV3lV.csv")
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# number of words in each tweet
train["word_count"] = list(map(lambda x: len(str(x).split(" ")), train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet,word_count
0,1,0,@user when a father is dysfunctional and is s...,21
1,2,0,@user @user thanks for #lyft credit i can't us...,22
2,3,0,bihday your majesty,5
3,4,0,#model i love u take with u all the time in ...,17
4,5,0,factsguide: society now #motivation,8


In [4]:
# number of characters
def char_count(sentence):
    l = sentence.split()
    char = 0
    for i in l:
        char = char+len(i)
    return char
train["char_count_w"] = list(map(lambda x: len(x), train["tweet"]))        # count with space.
train["char_count"] = list(map(lambda x: char_count(x), train["tweet"]))   # count without space.
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count_w,char_count
0,1,0,@user when a father is dysfunctional and is s...,21,102,82
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,101
2,3,0,bihday your majesty,5,21,17
3,4,0,#model i love u take with u all the time in ...,17,86,69
4,5,0,factsguide: society now #motivation,8,39,32


In [5]:
# avg word length
def avg_word_len(sentence):
    l = sentence.split()
    char = 0
    for i in l:
        char = char+len(i)
    a_l = char/len(l)
    return a_l
train["avg_word_length"] = list(map(lambda x: avg_word_len(x), train["tweet"]))   # count without space.
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count_w,char_count,avg_word_length
0,1,0,@user when a father is dysfunctional and is s...,21,102,82,4.555556
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,101,5.315789
2,3,0,bihday your majesty,5,21,17,5.666667
3,4,0,#model i love u take with u all the time in ...,17,86,69,4.928571
4,5,0,factsguide: society now #motivation,8,39,32,8.0


In [6]:
# number of stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
def s_count(sentence):
    count = len([word for word in sentence.split() if word in stop])
    return count
            
train["stopwords"] = list(map(lambda x: s_count(x), train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count_w,char_count,avg_word_length,stopwords
0,1,0,@user when a father is dysfunctional and is s...,21,102,82,4.555556,10
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,101,5.315789,5
2,3,0,bihday your majesty,5,21,17,5.666667,1
3,4,0,#model i love u take with u all the time in ...,17,86,69,4.928571,5
4,5,0,factsguide: society now #motivation,8,39,32,8.0,1


In [7]:
# number of uppercase words

def is_upper(sentence):
    count = len([word for word in sentence.split() if word.isupper()])
    return count

train["n_upper"] = list(map(lambda x: is_upper(x),train['tweet']))
train.head()


Unnamed: 0,id,label,tweet,word_count,char_count_w,char_count,avg_word_length,stopwords,n_upper
0,1,0,@user when a father is dysfunctional and is s...,21,102,82,4.555556,10,0
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,101,5.315789,5,0
2,3,0,bihday your majesty,5,21,17,5.666667,1,0
3,4,0,#model i love u take with u all the time in ...,17,86,69,4.928571,5,0
4,5,0,factsguide: society now #motivation,8,39,32,8.0,1,0


In [8]:
# number of digits

def is_digit(sentence):
    count = len([word for word in sentence.split() if word.isdigit()])
    return count

train["n_digit"] = list(map(lambda x: is_upper(x),train['tweet']))
train.head()

Unnamed: 0,id,label,tweet,word_count,char_count_w,char_count,avg_word_length,stopwords,n_upper,n_digit
0,1,0,@user when a father is dysfunctional and is s...,21,102,82,4.555556,10,0,0
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,101,5.315789,5,0,0
2,3,0,bihday your majesty,5,21,17,5.666667,1,0,0
3,4,0,#model i love u take with u all the time in ...,17,86,69,4.928571,5,0,0
4,5,0,factsguide: society now #motivation,8,39,32,8.0,1,0,0


## Data Preprocessing

In [3]:
# transform text into lower case letter
def to_lower(sentence):
    sentence = (" ").join([word.lower() for word in sentence.split()])
    return sentence
train["tweet"] = list(map(lambda x: to_lower(x), train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ur...
4,5,0,factsguide: society now #motivation


In [4]:
# remove punchuation
train["tweet"] = train["tweet"].str.replace("[^\w\s]","")
train.head()

Unnamed: 0,id,label,tweet
0,1,0,user when a father is dysfunctional and is so ...
1,2,0,user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in urð...
4,5,0,factsguide society now motivation


In [5]:
# remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
train["tweet"] =list(map(lambda x: " ".join([word for word in x.split() if word not in stop]), train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,user father dysfunctional selfish drags kids d...
1,2,0,user user thanks lyft credit cant use cause do...
2,3,0,bihday majesty
3,4,0,model love u take u time urð ðððð ððð
4,5,0,factsguide society motivation


In [6]:
# Most common word removal
# top 10 frequent word
w_freq = pd.Series(' '.join(train["tweet"]).split()).value_counts()[:10]
train["tweet"] = list(map(lambda x: " ".join([word for word in x.split() if word not in w_freq]), train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drags kids dysfun...
1,2,0,thanks lyft credit cant use cause dont offer w...
2,3,0,bihday majesty
3,4,0,model take urð ðððð ððð
4,5,0,factsguide society motivation


In [7]:
# rare word removal
# last 10 frequent word
w_freq = pd.Series(' '.join(train["tweet"]).split()).value_counts()[-10:]
train["tweet"] = list(map(lambda x: " ".join([word for word in x.split() if word not in w_freq]), train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drags kids dysfun...
1,2,0,thanks lyft credit cant use cause dont offer w...
2,3,0,bihday majesty
3,4,0,model take urð ðððð ððð
4,5,0,factsguide society motivation


In [8]:
# remove non-ascii words
import re
re.sub(r'[^\x00-\x7F]+','', train["tweet"][3])
train["tweet"] = list(map(lambda x: re.sub(r'[^\x00-\x7F]+','',x),train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drags kids dysfun...
1,2,0,thanks lyft credit cant use cause dont offer w...
2,3,0,bihday majesty
3,4,0,model take ur
4,5,0,factsguide society motivation


In [9]:
# remove whitespace from text

train["tweet"]=list(map(lambda x: x.strip(),train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drags kids dysfun...
1,2,0,thanks lyft credit cant use cause dont offer w...
2,3,0,bihday majesty
3,4,0,model take ur
4,5,0,factsguide society motivation


In [14]:
# spelling correction
from textblob import TextBlob
train["tweet"] = list(map(lambda x: str(TextBlob(x).correct()),train["tweet"]))
train.head()

In [19]:
# difference between tokenization and spliting
from textblob import TextBlob
a = 'hii, ,hello '
print(TextBlob(a).words)
print(a.split())

['hii', 'hello']
['hii,', ',hello']


In [10]:
# stemming text data
# stemming is removal of suffices like "ing","s","ly" etc.
from nltk.stem import PorterStemmer
st = PorterStemmer()
train["tweet"] = list(map(lambda x: ' '.join([st.stem(word) for word in x.split()]),train["tweet"]))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,father dysfunct selfish drag kid dysfunct run
1,2,0,thank lyft credit cant use caus dont offer whe...
2,3,0,bihday majesti
3,4,0,model take ur
4,5,0,factsguid societi motiv


In [40]:
# pos(parts of speech tagging) tagging
from nltk import pos_tag, word_tokenize
pos_tag(word_tokenize(train["tweet"][0]))

[('father', 'RB'),
 ('dysfunct', 'JJ'),
 ('selfish', 'JJ'),
 ('drag', 'NN'),
 ('kid', 'NN'),
 ('dysfunct', 'NN'),
 ('run', 'VB')]

In [41]:
# lemmetization
# it not only remove suffices also convert word to root word
#from nltk.stem import WordNetLemmatizer
#from nltk import pos_tag, word_tokenize
#lemmatizer = WordNetLemmatizer()
#sent = 'This is a foo bar sentence'
#for word in TextBlob(sent).words:
#    print(lemmatizer.lemmatize(word,'v'))

In [4]:
# classification using naive bayes' algorithm

In [25]:
train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()

In [12]:
result = vec.fit_transform(train['tweet'])

In [13]:
#print(vec.get_feature_names())
print(result.shape)
type(result)
#print (result.get_params())

(31962, 35202)


scipy.sparse.csr.csr_matrix

In [14]:
# logistic regression  
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [15]:
target = train['label']

In [16]:
x_train,x_test, y_train,y_test = train_test_split(result,target,test_size = 0.25)

In [43]:
lr = LogisticRegression(class_weight='balanced')
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [44]:
 accuracy_score(y_test, lr.predict(x_test))

0.9455637592291328

In [45]:
x = lr.predict(x_test)

In [46]:
y_test.value_counts()

0    7443
1     548
Name: label, dtype: int64

In [47]:
x

array([0, 0, 0, ..., 0, 0, 0])

In [48]:
import numpy as np
np.count_nonzero(x)

743

In [49]:
from sklearn.metrics import f1_score
f1_score(y_test, lr.predict(x_test))

0.6630518977536793

In [27]:
feature_to_coef = {
    word: coef for word, coef in zip(
        vec.get_feature_names(), lr.coef_[0]
    )
}
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
print("=====================================================")
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('allahsoil', 19.053012299359807)
('racism', 13.45135685562142)
('white', 13.252946180006086)
('bigot', 12.137621068048455)
('misogyni', 10.366182384309985)
('bihday', -9.673165960442272)
('orlando', -9.5884196994643)
('life', -7.947428813315022)
('miscegen', -7.224057485500922)
('tragedi', -6.671286292107005)
