## Cyberbullying Dataset Exploration

### Read in the dataset

In [32]:
import pandas as pd

# This dataset is based on the paper:
# "Using Machine Learning to Detect Cyberbullying"
# In Proceedings of the 2011 10th International Conference on Machine Learning and Applications Workshops
# Reynolds, K; Kontostathis, A.; Edwards, L., December 2011.
#
# The file is read with header=None (no header row is consumed), so the
# column names are assigned explicitly on the next line.
data = pd.read_csv('datasets/formspring-data.csv', header=None)
data.columns = ['label_bullying', 'text_message']

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,label_bullying,text_message
0,0,I like too many songs to have a favorite
1,0,</3 ? haha jk! <33
2,0,Really?!?! Thanks?! haha
3,0,;(
4,0,*RAWR*?


### Explore the dataset

In [33]:
# Report the dimensions of the dataset; data.shape is (row count, column count)
print("Input data has {} rows and {} columns".format(*data.shape))

Input data has 13159 rows and 2 columns


In [34]:
# Class balance: count the rows labelled 1 (bullying) and 0 (not bullying)
# by summing the boolean masks.
print("Out of {} rows, {} are bullying, {} are not bullying".format(
    len(data),
    (data['label_bullying'] == 1).sum(),
    (data['label_bullying'] == 0).sum()))

Out of 13159 rows, 2054 are bullying, 11105 are not bullying


In [35]:
# Missing-value check: number of null entries in each of the two columns
print("Number of null in label: {}".format(data['label_bullying'].isna().sum()))
print("Number of null in text: {}".format(data['text_message'].isna().sum()))

Number of null in label: 0
Number of null in text: 0


### Cleaning the dataset

In [37]:
# Remove punctuation
import string

def remove_punctuation(text):
    """Return `text` with every character listed in string.punctuation dropped.

    All other characters (letters, digits, whitespace, ...) are kept in
    their original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

# Store the punctuation-free version of every message in a new column
data['text_message_clean'] = data['text_message'].apply(remove_punctuation)

data.head()

Unnamed: 0,label_bullying,text_message,text_message_clean
0,0,I like too many songs to have a favorite,I like too many songs to have a favorite
1,0,</3 ? haha jk! <33,3 haha jk 33
2,0,Really?!?! Thanks?! haha,Really Thanks haha
3,0,;(,
4,0,*RAWR*?,RAWR


### Tokenization

In [38]:
import re

def tokenize(text):
    """Split `text` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The message to split.

    Returns
    -------
    list of str
        The tokens in order of appearance. Empty strings are never returned,
        so an empty message (or one made only of separators) yields [].
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    pieces = re.split(r'\W+', text)
    # re.split emits '' for empty input and for leading/trailing separators;
    # those are not real tokens, so drop them.
    return [piece for piece in pieces if piece]

# Lowercase each cleaned message first, then split it into tokens
data['text_message_tokens'] = data['text_message_clean'].str.lower().apply(tokenize)

data.head()

Unnamed: 0,label_bullying,text_message,text_message_clean,text_message_tokens
0,0,I like too many songs to have a favorite,I like too many songs to have a favorite,"[i, like, too, many, songs, to, have, a, favor..."
1,0,</3 ? haha jk! <33,3 haha jk 33,"[3, haha, jk, 33]"
2,0,Really?!?! Thanks?! haha,Really Thanks haha,"[really, thanks, haha]"
3,0,;(,,[]
4,0,*RAWR*?,RAWR,[rawr]


### Removing Stopwords

In [39]:
import nltk

# English stopword list from NLTK; remove_stopwords() filters tokens against it.
# Note you may need to run nltk.download('stopwords') to download the stopwords corpus
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not in `stopwords`.

    The relative order of the remaining tokens is preserved.
    """
    return list(filter(lambda token: token not in stopwords, tokenized_list))

# Filter the stopwords out of every token list and keep the result in a new column
data['text_no_stopwords'] = data['text_message_tokens'].apply(remove_stopwords)

data.head()

Unnamed: 0,label_bullying,text_message,text_message_clean,text_message_tokens,text_no_stopwords
0,0,I like too many songs to have a favorite,I like too many songs to have a favorite,"[i, like, too, many, songs, to, have, a, favor...","[like, many, songs, favorite]"
1,0,</3 ? haha jk! <33,3 haha jk 33,"[3, haha, jk, 33]","[3, haha, jk, 33]"
2,0,Really?!?! Thanks?! haha,Really Thanks haha,"[really, thanks, haha]","[really, thanks, haha]"
3,0,;(,,[],[]
4,0,*RAWR*?,RAWR,[rawr],[rawr]


### Stemming

In [40]:
# Porter stemmer instance from NLTK; stemming() calls ps.stem on each token.
ps = nltk.PorterStemmer()

def stemming(tokenized_list):
    """Return a list with `ps.stem` applied to each token, in the same order."""
    return list(map(ps.stem, tokenized_list))

# Stem the stopword-free tokens of every message into a new column
data['stemmed_text'] = data['text_no_stopwords'].apply(stemming)

data.head()

Unnamed: 0,label_bullying,text_message,text_message_clean,text_message_tokens,text_no_stopwords,stemmed_text
0,0,I like too many songs to have a favorite,I like too many songs to have a favorite,"[i, like, too, many, songs, to, have, a, favor...","[like, many, songs, favorite]","[like, mani, song, favorit]"
1,0,</3 ? haha jk! <33,3 haha jk 33,"[3, haha, jk, 33]","[3, haha, jk, 33]","[3, haha, jk, 33]"
2,0,Really?!?! Thanks?! haha,Really Thanks haha,"[really, thanks, haha]","[really, thanks, haha]","[realli, thank, haha]"
3,0,;(,,[],[],[]
4,0,*RAWR*?,RAWR,[rawr],[rawr],[rawr]


### Lemmatization

In [60]:
# WordNet lemmatizer instance from NLTK; lemmatize() calls wn.lemmatize on each token.
wn = nltk.WordNetLemmatizer()
# Note: you may need to run nltk.download('wordnet') first to download the WordNet resources

def lemmatize(tokenized_list):
    """Return a list with `wn.lemmatize` applied to each token, in the same order."""
    lemmas = []
    for token in tokenized_list:
        lemmas.append(wn.lemmatize(token))
    return lemmas

# Lemmatize the stopword-free tokens of every message into a new column
data['lemmatized_text'] = data['text_no_stopwords'].apply(lemmatize)

data.head()

Unnamed: 0,label_bullying,text_message,text_message_clean,text_message_tokens,text_no_stopwords,stemmed_text,lemmatized_text
0,0,I like too many songs to have a favorite,I like too many songs to have a favorite,"[i, like, too, many, songs, to, have, a, favor...","[like, many, songs, favorite]","[like, mani, song, favorit]","[like, many, song, favorite]"
1,0,</3 ? haha jk! <33,3 haha jk 33,"[3, haha, jk, 33]","[3, haha, jk, 33]","[3, haha, jk, 33]","[3, haha, jk, 33]"
2,0,Really?!?! Thanks?! haha,Really Thanks haha,"[really, thanks, haha]","[really, thanks, haha]","[realli, thank, haha]","[really, thanks, haha]"
3,0,;(,,[],[],[],[]
4,0,*RAWR*?,RAWR,[rawr],[rawr],[rawr],[rawr]


### Count Vectorization

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer created with scikit-learn's default settings.
count_vect = CountVectorizer()

# Fit the vocabulary on the punctuation-stripped messages and transform them
# into a count matrix in one step.
x_counts = count_vect.fit_transform(data['text_message_clean'])
# Optional inspection of the matrix shape and vocabulary (left disabled):
#print(x_counts.shape)
#print(count_vect.get_feature_names())

# NOTE(review): toarray() converts the count matrix to a dense array before it
# is wrapped in a DataFrame; with one column per vocabulary term this can use
# a lot of memory - confirm it fits for the full dataset.
x_counts_df = pd.DataFrame(x_counts.toarray())
x_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13008,13009,13010,13011,13012,13013,13014,13015,13016,13017
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Label each count column with the vocabulary term it represents.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back to the
# old method on installs that predate it.
try:
    x_counts_df.columns = count_vect.get_feature_names_out()
except AttributeError:
    x_counts_df.columns = count_vect.get_feature_names()
x_counts_df

Unnamed: 0,00,000,0o,10,100,1000,10000,1000000000,1011,1015,...,zip,zobliviously,zombie,zombies,zone,zony,zoo,zoos,zoowill,zoowills
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
