In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [131]:
corpus = pd.read_csv('blogtext.csv')

In [132]:
corpus.shape

(681284, 7)

In [133]:
corpus.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [134]:
corpus = corpus.sample(2000)

In [136]:
corpus.shape

(2000, 7)

In [137]:
corpus.topic.value_counts()

indUnk                     741
Student                    459
Technology                 120
Arts                        92
Education                   87
Communications-Media        64
Internet                    51
Non-Profit                  46
Engineering                 40
Publishing                  27
Law                         24
Science                     22
Religion                    21
Marketing                   18
Fashion                     17
Consulting                  16
Government                  15
Advertising                 14
Telecommunications          12
Sports-Recreation           11
BusinessServices            11
Museums-Libraries            9
Accounting                   9
HumanResources               9
Banking                      8
Transportation               8
RealEstate                   7
Chemicals                    7
Military                     5
Environment                  4
Architecture                 4
Tourism                      4
Biotech 

In [138]:
#Lower casing
corpus['text'] = corpus['text'].apply( lambda t : ' '.join( word.lower() for word in t.split()  ) )
corpus['text'].head(5)

231219    egh, that&nbsp;template is badness! aarrrrgggg...
44608     hiya people, yes it's me, and i have nothing t...
394273    something absolutely suprising, crazy, hott, a...
227389    plodding through a very thick, forest, a boy w...
268847    are you still in cali? i will be in san fran s...
Name: text, dtype: object

In [139]:
## 2. Remove punctuations
corpus['text'] = corpus['text'].str.replace( '[^\w\s]' , '' )
corpus['text'].head(10)

231219    egh thatnbsptemplate is badness aarrrrggggghhh...
44608     hiya people yes its me and i have nothing to w...
394273    something absolutely suprising crazy hott and ...
227389    plodding through a very thick forest a boy was...
268847    are you still in cali i will be in san fran su...
561370    i had one last resort to scare myselfi rented ...
196159    its a return to normalcy in the words of a cer...
474153    i broke my specs not deliberately of course i ...
45814     omgosh kanakuk is only 2 weeks away i am so ex...
306196    jerks should die well lets see its been an int...
Name: text, dtype: object

In [140]:

stop = stopwords.words('english')

corpus['text'] = corpus['text'].apply( lambda t : " ".join( word for word in t.split() 
                                                       if word not in stop ) )

In [141]:
corpus['text'].head(10)

231219    egh thatnbsptemplate badness aarrrrggggghhhhh ...
44608     hiya people yes nothing write except fact pare...
394273    something absolutely suprising crazy hott scan...
227389    plodding thick forest boy relaxing mind hustle...
268847    still cali san fran sun thru tue want place cr...
561370    one last resort scare myselfi rented silent hi...
196159    return normalcy words certain deceased preside...
474153    broke specs deliberately course remember advis...
45814     omgosh kanakuk 2 weeks away excited heatehrs b...
306196    jerks die well lets see interesting couple day...
Name: text, dtype: object

In [142]:
# 4. Frequent word removal from the text

words = ' '.join(corpus['text'] ).split()
freqeuncy = pd.Series(words).value_counts()[:40]

corpus['text'] = corpus['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in freqeuncy) )
corpus['text'].head(10)

231219    egh thatnbsptemplate badness aarrrrggggghhhhh ...
44608     hiya yes nothing write except fact parents alm...
394273    absolutely suprising crazy hott scandalous hap...
227389    plodding thick forest boy relaxing mind hustle...
268847    cali san fran sun thru tue place crashwill abl...
561370    resort scare myselfi rented silent hill 2 alon...
196159    return normalcy words certain deceased preside...
474153    broke specs deliberately course remember advis...
45814     omgosh kanakuk 2 weeks away excited heatehrs b...
306196    jerks die lets interesting couple days ya ago ...
Name: text, dtype: object

In [143]:
# 5. Remove Rare words
words = ' '.join(corpus['text'] ).split()
rare_words = pd.Series(words).value_counts()[-1800:]
rare_words.sort_values

<bound method Series.sort_values of unwitting                                                                                                                                                                                                                                                                                                                                                                                                                                                           1
kristi                                                                                                                                                                                                                                                                                                                                                                                                                                                              1
houssseee                                               

In [144]:
corpus['text'] = corpus['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in rare_words) )
corpus['text'].head(10)

231219    egh thatnbsptemplate badness aarrrrggggghhhhh ...
44608     hiya yes nothing write except fact parents alm...
394273    absolutely suprising crazy hott scandalous hap...
227389    plodding thick forest boy relaxing mind hustle...
268847    cali san fran sun thru tue place crashwill abl...
561370    resort scare myselfi rented silent hill 2 alon...
196159    return normalcy words certain deceased preside...
474153    broke specs deliberately course remember advis...
45814     omgosh kanakuk 2 weeks away excited heatehrs b...
306196    jerks die lets interesting couple days ya ago ...
Name: text, dtype: object

In [145]:
# 6. Remove extra spaces
corpus['text'] = corpus['text'].str.strip()
corpus['text'].head(10)

231219    egh thatnbsptemplate badness aarrrrggggghhhhh ...
44608     hiya yes nothing write except fact parents alm...
394273    absolutely suprising crazy hott scandalous hap...
227389    plodding thick forest boy relaxing mind hustle...
268847    cali san fran sun thru tue place crashwill abl...
561370    resort scare myselfi rented silent hill 2 alon...
196159    return normalcy words certain deceased preside...
474153    broke specs deliberately course remember advis...
45814     omgosh kanakuk 2 weeks away excited heatehrs b...
306196    jerks die lets interesting couple days ya ago ...
Name: text, dtype: object

In [146]:
corpus.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
231219,3122872,male,15,Student,Libra,"17,July,2004",egh thatnbsptemplate badness aarrrrggggghhhhh ...
44608,1098541,male,15,indUnk,Capricorn,"15,February,2004",hiya yes nothing write except fact parents alm...
394273,3092903,female,15,Student,Virgo,"17,July,2004",absolutely suprising crazy hott scandalous hap...
227389,3838040,male,17,Engineering,Cancer,"08,July,2004",plodding thick forest boy relaxing mind hustle...
268847,1325355,female,26,indUnk,Cancer,"27,September,2003",cali san fran sun thru tue place crashwill abl...


In [147]:

corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 231219 to 156795
Data columns (total 7 columns):
id        2000 non-null int64
gender    2000 non-null object
age       2000 non-null int64
topic     2000 non-null object
sign      2000 non-null object
date      2000 non-null object
text      2000 non-null object
dtypes: int64(2), object(5)
memory usage: 125.0+ KB


In [148]:
sub_corpus = corpus.drop(['id','text','date'], axis = 1)


In [149]:
corpus.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
231219,3122872,male,15,Student,Libra,"17,July,2004",egh thatnbsptemplate badness aarrrrggggghhhhh ...
44608,1098541,male,15,indUnk,Capricorn,"15,February,2004",hiya yes nothing write except fact parents alm...
394273,3092903,female,15,Student,Virgo,"17,July,2004",absolutely suprising crazy hott scandalous hap...
227389,3838040,male,17,Engineering,Cancer,"08,July,2004",plodding thick forest boy relaxing mind hustle...
268847,1325355,female,26,indUnk,Cancer,"27,September,2003",cali san fran sun thru tue place crashwill abl...


In [150]:
corpus['labels']=sub_corpus.values.tolist()

In [151]:
corpus.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
231219,3122872,male,15,Student,Libra,"17,July,2004",egh thatnbsptemplate badness aarrrrggggghhhhh ...,"[male, 15, Student, Libra]"
44608,1098541,male,15,indUnk,Capricorn,"15,February,2004",hiya yes nothing write except fact parents alm...,"[male, 15, indUnk, Capricorn]"
394273,3092903,female,15,Student,Virgo,"17,July,2004",absolutely suprising crazy hott scandalous hap...,"[female, 15, Student, Virgo]"
227389,3838040,male,17,Engineering,Cancer,"08,July,2004",plodding thick forest boy relaxing mind hustle...,"[male, 17, Engineering, Cancer]"
268847,1325355,female,26,indUnk,Cancer,"27,September,2003",cali san fran sun thru tue place crashwill abl...,"[female, 26, indUnk, Cancer]"


In [152]:
sub_corpus.head()

Unnamed: 0,gender,age,topic,sign
231219,male,15,Student,Libra
44608,male,15,indUnk,Capricorn
394273,female,15,Student,Virgo
227389,male,17,Engineering,Cancer
268847,female,26,indUnk,Cancer


In [153]:
corpus.drop(['id','gender','topic','sign','date'], axis = 1, inplace = True)

In [154]:
corpus.head()

Unnamed: 0,age,text,labels
231219,15,egh thatnbsptemplate badness aarrrrggggghhhhh ...,"[male, 15, Student, Libra]"
44608,15,hiya yes nothing write except fact parents alm...,"[male, 15, indUnk, Capricorn]"
394273,15,absolutely suprising crazy hott scandalous hap...,"[female, 15, Student, Virgo]"
227389,17,plodding thick forest boy relaxing mind hustle...,"[male, 17, Engineering, Cancer]"
268847,26,cali san fran sun thru tue place crashwill abl...,"[female, 26, indUnk, Cancer]"


In [155]:
corpus.drop('age', axis = 1, inplace = True)

In [156]:
corpus.head()

Unnamed: 0,text,labels
231219,egh thatnbsptemplate badness aarrrrggggghhhhh ...,"[male, 15, Student, Libra]"
44608,hiya yes nothing write except fact parents alm...,"[male, 15, indUnk, Capricorn]"
394273,absolutely suprising crazy hott scandalous hap...,"[female, 15, Student, Virgo]"
227389,plodding thick forest boy relaxing mind hustle...,"[male, 17, Engineering, Cancer]"
268847,cali san fran sun thru tue place crashwill abl...,"[female, 26, indUnk, Cancer]"


In [157]:
np.unique(corpus.labels).shape

(969,)

In [158]:
features = corpus['text']
labels = corpus['labels']

In [159]:

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25
                                                   )

In [160]:
X_train.shape

(1500,)

In [161]:
X_test.shape

(500,)

In [162]:
vect = CountVectorizer(ngram_range=(1, 2))

In [163]:
vect.fit(features)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [164]:
X_train_dtm = vect.transform(X_train)
X_train_dtm


<1500x179708 sparse matrix of type '<class 'numpy.int64'>'
	with 218218 stored elements in Compressed Sparse Row format>

In [165]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<500x179708 sparse matrix of type '<class 'numpy.int64'>'
	with 77675 stored elements in Compressed Sparse Row format>

In [166]:
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [167]:
X_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [168]:
sub_corpus.head()

Unnamed: 0,gender,age,topic,sign
231219,male,15,Student,Libra
44608,male,15,indUnk,Capricorn
394273,female,15,Student,Virgo
227389,male,17,Engineering,Cancer
268847,female,26,indUnk,Cancer


In [169]:
dummy = sub_corpus.age.value_counts()




In [170]:
sample_age = dict(zip(dummy.index, dummy.values))

In [171]:
sample_age

{24: 239,
 17: 234,
 23: 229,
 16: 214,
 25: 191,
 27: 148,
 26: 147,
 15: 121,
 14: 81,
 33: 65,
 35: 50,
 34: 50,
 13: 38,
 36: 36,
 38: 28,
 37: 23,
 39: 18,
 48: 16,
 42: 12,
 40: 11,
 41: 10,
 43: 10,
 46: 10,
 47: 8,
 45: 6,
 44: 5}

In [172]:
dummy = sub_corpus.gender.value_counts()
sample_gender = dict(zip(dummy.index, dummy.values))
sample_gender

{'male': 1032, 'female': 968}

In [173]:
dummy = sub_corpus.topic.value_counts()
sample_topic = dict(zip(dummy.index, dummy.values))
sample_topic

{'indUnk': 741,
 'Student': 459,
 'Technology': 120,
 'Arts': 92,
 'Education': 87,
 'Communications-Media': 64,
 'Internet': 51,
 'Non-Profit': 46,
 'Engineering': 40,
 'Publishing': 27,
 'Law': 24,
 'Science': 22,
 'Religion': 21,
 'Marketing': 18,
 'Fashion': 17,
 'Consulting': 16,
 'Government': 15,
 'Advertising': 14,
 'Telecommunications': 12,
 'Sports-Recreation': 11,
 'BusinessServices': 11,
 'Museums-Libraries': 9,
 'Accounting': 9,
 'HumanResources': 9,
 'Banking': 8,
 'Transportation': 8,
 'RealEstate': 7,
 'Chemicals': 7,
 'Military': 5,
 'Environment': 4,
 'Architecture': 4,
 'Tourism': 4,
 'Biotech': 4,
 'Construction': 4,
 'LawEnforcement-Security': 3,
 'Automotive': 3,
 'Agriculture': 2,
 'Manufacturing': 1,
 'InvestmentBanking': 1}

In [174]:
dummy = sub_corpus.sign.value_counts()
sample_sign = dict(zip(dummy.index, dummy.values))
sample_sign

{'Cancer': 207,
 'Taurus': 193,
 'Aries': 190,
 'Virgo': 175,
 'Pisces': 174,
 'Scorpio': 173,
 'Libra': 169,
 'Aquarius': 155,
 'Leo': 151,
 'Capricorn': 147,
 'Gemini': 140,
 'Sagittarius': 126}

In [175]:
sample_final = dict(sample_gender)
sample_final.update(sample_age)
sample_final.update(sample_sign)
sample_final.update(sample_topic)
sample_final

{'male': 1032,
 'female': 968,
 24: 239,
 17: 234,
 23: 229,
 16: 214,
 25: 191,
 27: 148,
 26: 147,
 15: 121,
 14: 81,
 33: 65,
 35: 50,
 34: 50,
 13: 38,
 36: 36,
 38: 28,
 37: 23,
 39: 18,
 48: 16,
 42: 12,
 40: 11,
 41: 10,
 43: 10,
 46: 10,
 47: 8,
 45: 6,
 44: 5,
 'Cancer': 207,
 'Taurus': 193,
 'Aries': 190,
 'Virgo': 175,
 'Pisces': 174,
 'Scorpio': 173,
 'Libra': 169,
 'Aquarius': 155,
 'Leo': 151,
 'Capricorn': 147,
 'Gemini': 140,
 'Sagittarius': 126,
 'indUnk': 741,
 'Student': 459,
 'Technology': 120,
 'Arts': 92,
 'Education': 87,
 'Communications-Media': 64,
 'Internet': 51,
 'Non-Profit': 46,
 'Engineering': 40,
 'Publishing': 27,
 'Law': 24,
 'Science': 22,
 'Religion': 21,
 'Marketing': 18,
 'Fashion': 17,
 'Consulting': 16,
 'Government': 15,
 'Advertising': 14,
 'Telecommunications': 12,
 'Sports-Recreation': 11,
 'BusinessServices': 11,
 'Museums-Libraries': 9,
 'Accounting': 9,
 'HumanResources': 9,
 'Banking': 8,
 'Transportation': 8,
 'RealEstate': 7,
 'Chemical

In [176]:
mlb = MultiLabelBinarizer()

In [177]:
y_train.shape

(1500,)

In [178]:
y_train = list(y_train)
y_test  =list(y_test)
y_train[0]

['male', 17, 'Student', 'Aries']

In [179]:
for i in np.arange(0,1500):
    y_train[i] = tuple(y_train[i])
     



In [180]:
y_train = [[str(j) for j in i] for i in y_train]

In [182]:
y_test = [[str(j) for j in i] for i in y_test]

In [183]:
y_train_transformed = mlb.fit_transform(y_train)
y_test_transformed = mlb.transform(y_test)
y_train_transformed

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [184]:
y_train_transformed.shape

(1500, 78)

In [185]:
y_test_transformed.shape

(500, 78)

In [186]:
X_test_dtm.shape

(500, 179708)

In [187]:
X_train.shape

(1500,)

In [188]:
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', 'Accounting', 'Advertising', 'Agriculture',
       'Aquarius', 'Architecture', 'Aries', 'Arts', 'Automotive',
       'Banking', 'Biotech', 'BusinessServices', 'Cancer', 'Capricorn',
       'Chemicals', 'Communications-Media', 'Construction', 'Consulting',
       'Education', 'Engineering', 'Environment', 'Fashion', 'Gemini',
       'Government', 'HumanResources', 'Internet', 'Law',
       'LawEnforcement-Security', 'Leo', 'Libra', 'Manufacturing',
       'Marketing', 'Military', 'Museums-Libraries', 'Non-Profit',
       'Pisces', 'Publishing', 'RealEstate', 'Religion', 'Sagittarius',
       'Science', 'Scorpio', 'Sports-Recreation', 'Student', 'Taurus',
       'Technology', 'Telecommunications', 'Tourism', 'Transportation',
       'Virgo', 'female', 'indUnk', 'male'], dtype=object)

In [189]:
OVR = OneVsRestClassifier(LogisticRegression()).fit(X_train_dtm.toarray(),y_train_transformed)

In [190]:
OVR.score(X_train_dtm.toarray(), y_train_transformed)

0.9186666666666666

In [191]:
OVR.score(X_test_dtm.toarray(), y_test_transformed)

0.0

In [192]:
from sklearn.metrics import classification_report
result = OVR.predict(X_test_dtm.toarray())
result

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [193]:
target_names = mlb.classes_
print(classification_report(y_test_transformed, result, target_names=target_names))

                         precision    recall  f1-score   support

                     13       0.00      0.00      0.00         9
                     14       0.00      0.00      0.00        22
                     15       0.00      0.00      0.00        30
                     16       0.00      0.00      0.00        53
                     17       0.57      0.06      0.11        63
                     23       0.00      0.00      0.00        61
                     24       1.00      0.02      0.04        55
                     25       0.00      0.00      0.00        51
                     26       0.00      0.00      0.00        37
                     27       0.00      0.00      0.00        36
                     33       0.00      0.00      0.00         9
                     34       0.00      0.00      0.00        10
                     35       0.00      0.00      0.00        13
                     36       0.00      0.00      0.00        11
                     37 

# PREDEICTIONS

In [194]:
test = vect.transform([list(features)[0]])
prediction = OVR.predict(test)
print("Text", [list(features)[34]])
print("True", list(labels)[34])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(features)[100]])
prediction = OVR.predict(test)
print("Text", [list(features)[100]])
print("True", list(labels)[100])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(features)[123]])
prediction = OVR.predict(test)
print("Text", [list(features)[123]])
print("True", list(labels)[123])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(X_train)[2]])
prediction = OVR.predict(test)
print("Text", [list(X_train)[2]])
print("True", y_train[2])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")
test = vect.transform([list(X_test)[21]])
prediction = OVR.predict(test)
print("Text", [list(X_test)[21]])
print("True", y_test[21])
print("Prediction", mlb.inverse_transform(prediction))
print("------------------------\n")

Text ['initial explanation ty tyler thorsen blonde cutie moorhead haha minnesota dubbed soda pronounced strong minnesotan accent obvious reasons says reason cute sounds bobbys mom bobbys world dontcha bobby anyway pictures follow obviously hes charmer hit crazy middle excerpted email wait maybe point explain ty okay boy home totally weve dating year ian duh also ty girlie home recent girl girl nonetheless rate hit friends realized kind fill affection missing home turned dummyboygirlfriendsathome ish kinda tested boundaries ended liking lot supposed guess justawkwardly best friends guess soooo long distance relationship justyeah real story slept bed every night sounds weird wasntwe tight ian kinda miffed explained everythning oh physical kissed couldnt help tongues relax follow upwe talk almost every hes awesome friend hes debating home whether stay girl feelings kinda tossed loyalties lie home proud didnt anything drastic wants come ny early augusthm guess goes eh btw pictures picturet