# Word features

This document contains features to:
- count the amount of words in a tweet (excluding mentions and websites) (int)
- if a tweet contains a link (1/0)
- How much characters the average word contains, excluding mentions and websites (int)
- How big the word with most characters is, excluding mentions and websites (int)


## Import packages

In [1]:
import pandas as pd
import numpy as np

## Functions

In [5]:
def clean_words(word):
    word = word.lower()
    word = word.replace(",", "")
    word = word.replace(".", "")
    word = word.replace(":", "")
    word = word.replace("?", "")
    word = word.replace("#", "")
    word = word.replace("(", "")
    word = word.replace(")", "")
    word = word.replace("!", "")
    word = word.replace("'", "")
    word = word.replace(";", "")
    word = word.replace("&", "")
    word = word.replace("'", "")
    return word

def word_count(tweet):
    #set defaults to 0
    word_count = 0
    #for each word in the tweet that is received as input
    for w in tweet.split():
        #if the word is a mention or a link we don't count them
        if '@' in w or 'http' in w:
            pass
        else:
            #add 1 to the count of the amount of words
            word_count +=1

    #return the word count        
    return word_count

def contains_link(tweet):
    #if a tweet contains a link (has 'http' in it) return 1 (true) otherwise return 0 (false)
    if 'http' in tweet:
        return 1
    else:
        return 0

def average_word_length(tweet):
    #set defaults to 0
    word_count = 0
    word_length = 0

    #for each word in the tweet that is received as input
    for w in tweet.split():
        #if the word is a mention or a link we don't count them
        if '@' in w or 'http' in w:
            pass
        else:
            #clean the word, remove hashtags, punctuation, etc.
            w = clean_words(w)
            #add the length of this word to the total
            word_length = word_length + len(w)
            #add 1 to the count of the amount of words
            word_count +=1
    #calculate average by dividing word length with the amount of words
    if word_count != 0 and word_length != 0:
        average_word_length = word_length/word_count
    else:
        average_word_length = 0
    #returning the average
    return average_word_length

def longest_word(tweet):
    #set default to 0
    longest_word = 0

    #for each word in the tweet that is received as input
    for w in tweet.split():
        #if the word is a mention or a link we don't count them
        if '@' in w or 'http' in w:
            pass
        else:
            #clean the word, remove hashtags, punctuation, etc.
            w = clean_words(w)
            #calculate the length of the word (amount of characters)
            word_length = len(w)

            #if this word is the longest word replace the length of the longest word
            if word_length > longest_word:
                longest_word = word_length
            else:
                pass
    #after for loop return result of the longest word
    return longest_word

## Import DataFrame

In [6]:
#DataFrame
df = pd.read_csv('tweets_labeled.csv', index_col=0)
df.head()

Unnamed: 0_level_0,text,label
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1161040537207463936,'RT @SenJeffMerkley: The Endangered Species Ac...,1
1176360756239118342,'RT @LindseyGrahamSC: Interesting concept -- i...,1
1099036648573145088,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0
1092915693203480577,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0
1149038450668187654,'RT @TheOnion: Sweden Announces Plan To Get 10...,0


## Apply functions

In [7]:
df['average_word_length'] = df['text'].apply(average_word_length)
df['longest_word'] = df['text'].apply(longest_word)
df['word_count'] = df['text'].apply(word_count)
df['contains_link'] = df['text'].apply(contains_link)
df.head()

Unnamed: 0_level_0,text,label,average_word_length,longest_word,word_count,contains_link
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1161040537207463936,'RT @SenJeffMerkley: The Endangered Species Ac...,1,4.545455,10,22,0
1176360756239118342,'RT @LindseyGrahamSC: Interesting concept -- i...,1,5.105263,11,19,0
1099036648573145088,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0,9.0,13,3,1
1092915693203480577,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0,4.0,7,14,1
1149038450668187654,'RT @TheOnion: Sweden Announces Plan To Get 10...,0,4.470588,9,17,1


## Check out correlations

In [8]:
corr = df.corr()
corr

Unnamed: 0,label,average_word_length,longest_word,word_count,contains_link
label,1.0,-0.025178,0.176597,0.446694,-0.681751
average_word_length,-0.025178,1.0,0.642012,-0.130802,0.047443
longest_word,0.176597,0.642012,1.0,0.227245,-0.18941
word_count,0.446694,-0.130802,0.227245,1.0,-0.64504
contains_link,-0.681751,0.047443,-0.18941,-0.64504,1.0
