In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('blogtext.csv',nrows=20000)

In [3]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')
stops = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
filtered_df = df.dropna()

In [5]:
filtered_df['text'] = filtered_df['text'].apply(lambda s: s.lower())
filtered_df['text'] = filtered_df['text'].apply(lambda s: re.sub('[^0-9a-z #+_]','',s))
filtered_df['text'] = filtered_df['text'].apply(lambda s: s.strip())

In [6]:
filtered_df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found + 100 pages and 45 mb of p...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team members drewes van der la...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusie op aarde maak je e...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoos toolbar i can now capture the...


In [7]:
def identify_tokens(row):
    review = row['text']
    tokens = nltk.word_tokenize(review)
    # taken only words (not punctuation)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

filtered_df['text'] = filtered_df.apply(identify_tokens, axis=1)

In [8]:
filtered_df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","[info, has, been, found, pages, and, mb, of, p..."
1,2059027,male,15,Student,Leo,"13,May,2004","[these, are, the, team, members, drewes, van, ..."
2,2059027,male,15,Student,Leo,"12,May,2004","[in, het, kader, van, kernfusie, op, aarde, ma..."
3,2059027,male,15,Student,Leo,"12,May,2004","[testing, testing]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004","[thanks, to, yahoos, toolbar, i, can, now, cap..."


In [9]:
def remove_stops(row):
    my_list = row['text']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)

filtered_df['text'] = filtered_df.apply(remove_stops, axis=1)

In [10]:
filtered_df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","[info, found, pages, mb, pdf, files, wait, unt..."
1,2059027,male,15,Student,Leo,"13,May,2004","[team, members, drewes, van, der, laag, urllin..."
2,2059027,male,15,Student,Leo,"12,May,2004","[het, kader, van, kernfusie, op, aarde, maak, ..."
3,2059027,male,15,Student,Leo,"12,May,2004","[testing, testing]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004","[thanks, yahoos, toolbar, capture, urls, popup..."


In [11]:
def rejoin_words(row):
    my_list = row['text']
    joined_words = ( " ".join(my_list))
    return joined_words

filtered_df['text'] = filtered_df.apply(rejoin_words, axis=1)

In [12]:
filtered_df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


In [13]:
filtered_df['label']='[' + filtered_df['gender'].astype(str) + ', ' + filtered_df['age'].astype(str) + ', ' + filtered_df['topic'].astype(str) + ', ' + filtered_df['sign'].astype(str) + ']'

In [14]:
filtered_df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text,label
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


In [15]:
df_new = filtered_df[['text', 'label']].copy()

In [16]:
df_new.head(5)

Unnamed: 0,text,label
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoos toolbar capture urls popupswhich...,"[male, 33, InvestmentBanking, Aquarius]"


In [17]:
df_new.shape

(20000, 2)

In [18]:
from sklearn.model_selection import train_test_split
X = df_new['text']
y = df_new['label']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

In [19]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(13400,)
(13400,)
(6600,)
(6600,)


In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range=(1,2),stop_words='english',min_df=2)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [43]:
X_train_dtm.shape

(13400, 130291)

In [44]:
print(X_train_dtm)
print(X_test_dtm)

  (0, 65032)	1
  (0, 102308)	1
  (0, 17168)	1
  (0, 79641)	1
  (0, 108781)	1
  (0, 35800)	1
  (0, 6514)	1
  (0, 40495)	1
  (0, 8464)	1
  (0, 61765)	1
  (0, 34512)	1
  (0, 102309)	1
  (0, 79667)	1
  (0, 40534)	1
  (0, 62179)	1
  (1, 10490)	2
  (1, 22410)	1
  (1, 1906)	2
  (1, 124314)	4
  (1, 16065)	1
  (1, 66846)	1
  (1, 51644)	1
  (1, 22223)	1
  (1, 10436)	1
  (1, 20593)	1
  :	:
  (13397, 33593)	1
  (13397, 33596)	1
  (13397, 53272)	1
  (13397, 9670)	1
  (13397, 122235)	1
  (13397, 106080)	1
  (13397, 8378)	1
  (13397, 88807)	2
  (13397, 128330)	1
  (13398, 128801)	1
  (13398, 111069)	1
  (13398, 128172)	1
  (13398, 124883)	1
  (13398, 98444)	1
  (13398, 46141)	1
  (13398, 124949)	1
  (13398, 103041)	1
  (13398, 46230)	1
  (13398, 103062)	1
  (13398, 128955)	1
  (13399, 112738)	1
  (13399, 42943)	1
  (13399, 129440)	1
  (13399, 43172)	1
  (13399, 113582)	1
  (0, 2964)	1
  (0, 15191)	1
  (0, 15254)	1
  (0, 41222)	1
  (0, 54752)	1
  (0, 54928)	1
  (0, 61765)	1
  (0, 70199)	1
  (0, 82808)

In [35]:
dict(df_new['label'].str.split(expand=True).stack().value_counts())

{'[male,': 11354,
 '[female,': 8646,
 'indUnk,': 7789,
 'Aries]': 5209,
 'Technology,': 2989,
 'Student,': 2637,
 '35,': 2494,
 '27,': 2320,
 'Sagittarius]': 2153,
 '23,': 1963,
 '17,': 1961,
 'Leo]': 1732,
 '36,': 1726,
 'Pisces]': 1678,
 'Fashion,': 1622,
 '24,': 1557,
 'Cancer]': 1536,
 'Scorpio]': 1485,
 'Taurus]': 1330,
 'Aquarius]': 1313,
 '16,': 1236,
 '25,': 1190,
 '15,': 1097,
 'Libra]': 983,
 'Capricorn]': 930,
 '26,': 919,
 '34,': 871,
 'Virgo]': 871,
 '14,': 811,
 'Gemini]': 780,
 'Internet,': 778,
 '33,': 769,
 'Education,': 759,
 'Communications-Media,': 414,
 'Arts,': 358,
 'Engineering,': 357,
 '48,': 240,
 'Marketing,': 207,
 'Non-Profit,': 204,
 '46,': 188,
 'Government,': 187,
 'BusinessServices,': 184,
 'Religion,': 182,
 'Consulting,': 166,
 '37,': 130,
 'Sports-Recreation,': 120,
 '13,': 113,
 'Automotive,': 111,
 '39,': 105,
 'Manufacturing,': 93,
 'LawEnforcement-Security,': 90,
 'Banking,': 89,
 'Science,': 87,
 '38,': 85,
 '41,': 82,
 '45,': 72,
 'InvestmentBa

In [45]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [46]:
y_train_new = mlb.fit_transform(y_train)
y_test_new = mlb.fit_transform(y_test)

In [47]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [48]:
clf.fit(X_train_dtm, y_train_new)

  str(classes[c]))
  str(classes[c]))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/s

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [49]:
y_pred_clf = clf.predict(X_test_dtm)

In [50]:
y_pred_clf.shape

(6600, 54)

In [51]:
y_test_new.shape

(6600, 55)

In [53]:
print(metrics.accuracy_score(y_test_new, y_pred_clf))

ValueError: inconsistent shapes