# Extracting text features (count of stopwords, mentions, hashtags, words, characters)

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("tweets_labeled.csv")
df.head(10)

Unnamed: 0,tweet_id,text,label
0,1161040537207463936,'RT @SenJeffMerkley: The Endangered Species Ac...,1
1,1176360756239118342,'RT @LindseyGrahamSC: Interesting concept -- i...,1
2,1099036648573145088,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0
3,1092915693203480577,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0
4,1149038450668187654,'RT @TheOnion: Sweden Announces Plan To Get 10...,0
5,1175456815674343424,'RT @kcResister: Warren: 'Congress is complici...,1
6,1180809117310623744,'RT @ShimonPro: A dozen current and former sta...,1
7,1179840318935576578,'RT @samstein: This is a bombshell that isn’t ...,1
8,1081722778125062144,'Planned Parenthood Erects Billboards Urging W...,0
9,1158761795739217921,'https://t.co/MvrznF1fWVWhoever obstructing th...,1


### Feature to count amount of words + head()

In [8]:
def show_word_count(df):
    df['word_count'] = df['text'].apply(lambda x: len(str(x).split(" ")))
    return df[['text','word_count']].head()

In [9]:
show_word_count(df) #shows word count per tweet

Unnamed: 0,text,word_count
0,'RT @SenJeffMerkley: The Endangered Species Ac...,23
1,'RT @LindseyGrahamSC: Interesting concept -- i...,22
2,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,6
3,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,16
4,'RT @TheOnion: Sweden Announces Plan To Get 10...,20


### Feature for counting the characters

In [10]:
def show_char_count(df):
    df['char_count'] = df['text'].str.len() ## this also includes spaces
    return df[['text','word_count','char_count']].head()

In [11]:
show_char_count(df)

Unnamed: 0,text,word_count,char_count
0,'RT @SenJeffMerkley: The Endangered Species Ac...,23,141
1,'RT @LindseyGrahamSC: Interesting concept -- i...,22,140
2,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,6,75
3,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,16,113
4,'RT @TheOnion: Sweden Announces Plan To Get 10...,20,142


### Retrieve the Average Word Length

Here we create a feature which can calculate what the average length of words in each tweet is. To do so, we take the sum of the length of aal the words in that tweet and devide that number by the total length of the tweet.

In [13]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

df['avg_word'] = df['text'].apply(lambda x: avg_word(x))
df[['text','word_count', 'char_count','avg_word']].head()

Unnamed: 0,text,word_count,char_count,avg_word
0,'RT @SenJeffMerkley: The Endangered Species Ac...,23,141,5.173913
1,'RT @LindseyGrahamSC: Interesting concept -- i...,22,140,5.95
2,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,6,75,14.0
3,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,16,113,6.125
4,'RT @TheOnion: Sweden Announces Plan To Get 10...,20,142,6.15


### Number of stopwords

The next feature revolves around stopwords. To get some extra information about the tweets texts we retrieve the amount of stopwords per tweet. First we need to import the English stopwords from NLTK, which is a Natural Language Processing library for Python.

In [15]:
from nltk.corpus import stopwords

def stopwords_count(df):
    stop = stopwords.words('english')
    df['stopwords'] = df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
    return df[['text','stopwords']].head()

stopwords_count(df)

Unnamed: 0,text,stopwords
0,'RT @SenJeffMerkley: The Endangered Species Ac...,7
1,'RT @LindseyGrahamSC: Interesting concept -- i...,1
2,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0
3,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,4
4,'RT @TheOnion: Sweden Announces Plan To Get 10...,0


In [16]:
df.head() #Just to check info

Unnamed: 0,tweet_id,text,label,word_count,char_count,avg_word,stopwords
0,1161040537207463936,'RT @SenJeffMerkley: The Endangered Species Ac...,1,23,141,5.173913,7
1,1176360756239118342,'RT @LindseyGrahamSC: Interesting concept -- i...,1,22,140,5.95,1
2,1099036648573145088,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0,6,75,14.0,0
3,1092915693203480577,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0,16,113,6.125,4
4,1149038450668187654,'RT @TheOnion: Sweden Announces Plan To Get 10...,0,20,142,6.15,0


### A count of special characters

To extract the number of special characters used in each tweet, we can create a new functions that do the job. For example the usage of hashtags or the mentions can be counted. 

To retrieve the hashtags and mentions we use the 'starts with' function because these special characters used for hashtags(#) and mentions(@) are always at the beginning of a word. 

In [19]:
def count_hashtags(df):
    df['hastags'] = df['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
    return df[['text','hastags']].head()

def count_mentions(df):
    df['mentions'] = df['text'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
    return df[['text','mentions']].head()

count_hashtags(df)

Unnamed: 0,text,hastags
0,'RT @SenJeffMerkley: The Endangered Species Ac...,0
1,'RT @LindseyGrahamSC: Interesting concept -- i...,0
2,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,2
3,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0
4,'RT @TheOnion: Sweden Announces Plan To Get 10...,0


In [20]:
count_mentions(df)

Unnamed: 0,text,mentions
0,'RT @SenJeffMerkley: The Endangered Species Ac...,1
1,'RT @LindseyGrahamSC: Interesting concept -- i...,1
2,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,1
3,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,1
4,'RT @TheOnion: Sweden Announces Plan To Get 10...,1


In [22]:
df.head() #get an overview of dataset

Unnamed: 0,tweet_id,text,label,word_count,char_count,avg_word,stopwords,hastags,mentions
0,1161040537207463936,'RT @SenJeffMerkley: The Endangered Species Ac...,1,23,141,5.173913,7,0,1
1,1176360756239118342,'RT @LindseyGrahamSC: Interesting concept -- i...,1,22,140,5.95,1,0,1
2,1099036648573145088,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0,6,75,14.0,0,2,1
3,1092915693203480577,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0,16,113,6.125,4,0,1
4,1149038450668187654,'RT @TheOnion: Sweden Announces Plan To Get 10...,0,20,142,6.15,0,0,1
