In [57]:
from zipfile import ZipFile
import numpy as np

import pandas as pd
import nltk 
from nltk.corpus import stopwords
import re
import spacy
import unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
#from contractions import CONTRACT_MAP

###### 1. Load the dataset  (5 points)

In [2]:
filename = "blog-authorship-corpus.zip"

In [9]:
df = pd.read_csv(filename, compression='zip', header=0, sep=',', quotechar='"')

In [10]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [12]:
df.age.value_counts()

17    80859
24    80071
23    72889
16    72708
25    67051
26    55312
27    46124
15    41767
14    27400
34    21347
33    17584
35    17462
36    14229
13    13133
37     9317
38     7545
39     5556
40     5016
45     4482
43     4230
41     3738
48     3572
42     2908
46     2733
47     2207
44     2044
Name: age, dtype: int64

In [35]:
text = df['text']

In [36]:
len(text)

681284

In [58]:
sorted(set(text))

['    ',
 '     ',
 '      ',
 '       ',
 "       \x02 second day of driver ed. It was insanely torcherous. We were couped up in the classroom for 4 stinking hours without a break because he forgot to give us one..or two..I was seriously about to kill someone. we didn't even talk about driving most of the time!!! On the bright side...this whole thing will be done in a month and I can have a summer!! It has been raining here. And cold...like winter...without snow. I want it to be summer!!! I am sick of rain. I want it to be warm. Maybe next week...         ",
 "       \x16Post.  This is my blog.  Yo.  Yo.  Kick it DJ, spin dat beat. I left Florida Friday night and got to Dubuque, Iowa Saturday night.  I met Sara Jean at The Village Inn on Dodge St. and we drank coffee and had french toast.  It was good.  Then we went to her house, showered up and went to a Salsa House Party.  Salsa as in dancing, not a mexican dip party.  The party was wild and out of control and huge.  So many sexy wo

In [38]:
text

0                    Info has been found (+/- 100 pages,...
1                    These are the team members:   Drewe...
2                    In het kader van kernfusie op aarde...
3                          testing!!!  testing!!!          
4                      Thanks to Yahoo!'s Toolbar I can ...
                                ...                        
681279           Dear Susan,  I could write some really ...
681280           Dear Susan,  'I have the second yeast i...
681281           Dear Susan,  Your 'boyfriend' is fuckin...
681282           Dear Susan:    Just to clarify, I am as...
681283           Hey everybody...and Susan,  You might a...
Name: text, Length: 681284, dtype: object

###### 2. Preprocess rows of the “text” column (7.5 points)
a. Remove unwanted characters
b. Convert text to lowercase
c. Remove unwanted spaces
d. Remove stopwords


In [45]:
def remove_accented_chars(txt):
    txt=unicodedata.normalize('NFKD',txt).encode('ascii','ignore').decode('utf-8','ignore')
    return txt

In [84]:
def remove_special_chars(txt, remove_digits=True):
    pattern = r'[^a-zA-Z\s]'
    txt = re.sub(pattern," ",txt)
    return txt

In [68]:
def lower_chars(txt):
    txt=txt.lower()
    return txt

In [77]:
def remove_extra_spaces(txt):
    txt=" ".join(txt.split())
    return txt

In [86]:
tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('English')
stopword_list.remove('no')
stopword_list.remove('not')

def remove_stop_words(txt):
    tokens = tokenizer.tokenize(txt)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text  

In [19]:
stopwords.words('English')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

Example of applying all pre processing. We can see that all korean letters has been removed from output along with special chars, extra spaces, stop words. 

In [52]:
text[70]

"             Ya, I'm off to Canada/Vancouver again soon...ah, to be back in the land of fresh air, real mountains and diapers. Luckily, I think my oldest boy is out potty-trained now. Not sure, though, haven't been home for about 2 months...and a lot can happen in that time.  Now that we've done a deal here my boss/CEO would have me here all of August (well, all of the year, actually) but I have to go back as it's my younger son's first birthday, or  urlLink 첫돌/Chot-dol  (they have a different name for almost everything here). For Koreans the 1st and 60th birthdays are the biggies. 1st because the child made it through the often-treacherous first year (remember, Korea used to be really, really poor...so much so that is was a 100-days celebration as well, but that is starting to become less important as more babies are living past it). 60th because the  urlLink Korean Zodiac  is similar to the Chinese one...with 12 animals (ya, I'm a pig). There are also five colors which elude me at t

In [92]:
remove_stop_words(remove_extra_spaces(lower_chars(remove_special_chars(remove_accented_chars(text[70])))))

'ya canada vancouver soon ah back land fresh air real mountains diapers luckily think oldest boy potty trained not sure though home months lot happen time done deal boss ceo would august well year actually go back younger son first birthday urllink chot dol different name almost everything koreans st th birthdays biggies st child made often treacherous first year remember korea used really really poor much days celebration well starting become less important babies living past th urllink korean zodiac similar chinese one animals ya pig also five colors elude moment although know two black gold chinese use five elements metal earth wood fire water year urllink year wood monkey guess better urllink year wood cock rooster hehe put anyways age gone years times colors elements truly magical time guess thus age men no mention women least one grandchild preferably grandson one reason happy oldest boy wife father th year foreingers call non koreans even canada reason korea called urllink hermi

In [93]:
clean_text = []
for i in range(len(text)):
    filtered = remove_stop_words(remove_extra_spaces(lower_chars(remove_special_chars(remove_accented_chars(text[i])))))
    clean_text.append(filtered)

In [95]:
clean_text

['info found pages mb pdf files wait untill team leader processed learns html',
 'team members drewes van der laag urllink mail ruiyu xie urllink mail bryan aaldering urllink mail',
 'het kader van kernfusie op aarde maak je eigen waterstofbom build h bomb ascott tartarus uwa edu au andrew scott newsgroups rec humor subject build h bomb humorous date feb gmt organization university western australia original file dated th november seemed transcript seven days article poorly formatted corrupted added text examine microscope malleable like gold missing anyone full text please distribute not responsible accuracy information converted html dionisio infinet com little spell checking minor edits stolen urllink http ohio voyager net dionisio fun h bomb html reformatted html validates xhtml strict build h bomb making owning h bomb kind challenge real americans seek wants passive victim nuclear war little effort active participant bomb shelters losers wants huddle together underground eating ca

###### 3. As we want to make this into a multi-label classification problem, you are required to merge
all the label columns together, so that we have all the labels together for a particular sentence
(7.5 points)
a. Label columns to merge: “gender”, “age”, “topic”, “sign”
b. After completing the previous step, there should be only two columns in your data
frame i.e. “text” and “labels” as shown in the below image

In [129]:
df['age']=df['age'].astype('object')

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  object
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(1), object(6)
memory usage: 36.4+ MB


In [96]:
final_df = pd.DataFrame(columns=['text','labels'])

In [142]:
labels = '[' + df['gender'] + ', '+  df['topic'] + ', ' + df['sign'] + ']'
#df1.State.str.cat(df1.State_code)

In [143]:
final_df['text']=clean_text
final_df['labels']=labels

In [144]:
final_df.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, Student, Leo]"
3,testing testing,"[male, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, InvestmentBanking, Aquarius]"
