In [74]:
from zipfile import ZipFile
import numpy as np

import pandas as pd
import nltk 
from nltk.corpus import stopwords
import re
import spacy
import unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
#from contractions import CONTRACT_MAP

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from collections import defaultdict


###### 1. Load the dataset  (5 points)

In [2]:
filename = "blog-authorship-corpus.zip"

In [3]:
df = pd.read_csv(filename, compression='zip', header=0, sep=',', quotechar='"')

In [4]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [6]:
df.age.value_counts()

17    80859
24    80071
23    72889
16    72708
25    67051
26    55312
27    46124
15    41767
14    27400
34    21347
33    17584
35    17462
36    14229
13    13133
37     9317
38     7545
39     5556
40     5016
45     4482
43     4230
41     3738
48     3572
42     2908
46     2733
47     2207
44     2044
Name: age, dtype: int64

In [7]:
text = df['text']

In [8]:
len(text)

681284

In [9]:
sorted(set(text))

['    ',
 '     ',
 '      ',
 '       ',
 "       \x02 second day of driver ed. It was insanely torcherous. We were couped up in the classroom for 4 stinking hours without a break because he forgot to give us one..or two..I was seriously about to kill someone. we didn't even talk about driving most of the time!!! On the bright side...this whole thing will be done in a month and I can have a summer!! It has been raining here. And cold...like winter...without snow. I want it to be summer!!! I am sick of rain. I want it to be warm. Maybe next week...         ",
 "       \x16Post.  This is my blog.  Yo.  Yo.  Kick it DJ, spin dat beat. I left Florida Friday night and got to Dubuque, Iowa Saturday night.  I met Sara Jean at The Village Inn on Dodge St. and we drank coffee and had french toast.  It was good.  Then we went to her house, showered up and went to a Salsa House Party.  Salsa as in dancing, not a mexican dip party.  The party was wild and out of control and huge.  So many sexy wo

In [10]:
text

0                    Info has been found (+/- 100 pages,...
1                    These are the team members:   Drewe...
2                    In het kader van kernfusie op aarde...
3                          testing!!!  testing!!!          
4                      Thanks to Yahoo!'s Toolbar I can ...
                                ...                        
681279           Dear Susan,  I could write some really ...
681280           Dear Susan,  'I have the second yeast i...
681281           Dear Susan,  Your 'boyfriend' is fuckin...
681282           Dear Susan:    Just to clarify, I am as...
681283           Hey everybody...and Susan,  You might a...
Name: text, Length: 681284, dtype: object

###### 2. Preprocess rows of the “text” column (7.5 points)
a. Remove unwanted characters
b. Convert text to lowercase
c. Remove unwanted spaces
d. Remove stopwords


In [11]:
def remove_accented_chars(txt):
    txt=unicodedata.normalize('NFKD',txt).encode('ascii','ignore').decode('utf-8','ignore')
    return txt

In [12]:
def remove_special_chars(txt, remove_digits=True):
    pattern = r'[^a-zA-Z\s]'
    txt = re.sub(pattern," ",txt)
    return txt

In [13]:
def lower_chars(txt):
    txt=txt.lower()
    return txt

In [14]:
def remove_extra_spaces(txt):
    txt=" ".join(txt.split())
    return txt

In [15]:
tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('English')
stopword_list.remove('no')
stopword_list.remove('not')

def remove_stop_words(txt):
    tokens = tokenizer.tokenize(txt)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text  

In [16]:
stopwords.words('English')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

Example of applying all pre processing. We can see that all korean letters has been removed from output along with special chars, extra spaces, stop words. 

In [17]:
text[70]

"             Ya, I'm off to Canada/Vancouver again soon...ah, to be back in the land of fresh air, real mountains and diapers. Luckily, I think my oldest boy is out potty-trained now. Not sure, though, haven't been home for about 2 months...and a lot can happen in that time.  Now that we've done a deal here my boss/CEO would have me here all of August (well, all of the year, actually) but I have to go back as it's my younger son's first birthday, or  urlLink 첫돌/Chot-dol  (they have a different name for almost everything here). For Koreans the 1st and 60th birthdays are the biggies. 1st because the child made it through the often-treacherous first year (remember, Korea used to be really, really poor...so much so that is was a 100-days celebration as well, but that is starting to become less important as more babies are living past it). 60th because the  urlLink Korean Zodiac  is similar to the Chinese one...with 12 animals (ya, I'm a pig). There are also five colors which elude me at t

In [18]:
remove_stop_words(remove_extra_spaces(lower_chars(remove_special_chars(remove_accented_chars(text[70])))))

'ya canada vancouver soon ah back land fresh air real mountains diapers luckily think oldest boy potty trained not sure though home months lot happen time done deal boss ceo would august well year actually go back younger son first birthday urllink chot dol different name almost everything koreans st th birthdays biggies st child made often treacherous first year remember korea used really really poor much days celebration well starting become less important babies living past th urllink korean zodiac similar chinese one animals ya pig also five colors elude moment although know two black gold chinese use five elements metal earth wood fire water year urllink year wood monkey guess better urllink year wood cock rooster hehe put anyways age gone years times colors elements truly magical time guess thus age men no mention women least one grandchild preferably grandson one reason happy oldest boy wife father th year foreingers call non koreans even canada reason korea called urllink hermi

In [19]:
clean_text = []
for i in range(len(text)):
    filtered = remove_stop_words(remove_extra_spaces(lower_chars(remove_special_chars(remove_accented_chars(text[i])))))
    clean_text.append(filtered)

In [23]:
clean_text[70]

'ya canada vancouver soon ah back land fresh air real mountains diapers luckily think oldest boy potty trained not sure though home months lot happen time done deal boss ceo would august well year actually go back younger son first birthday urllink chot dol different name almost everything koreans st th birthdays biggies st child made often treacherous first year remember korea used really really poor much days celebration well starting become less important babies living past th urllink korean zodiac similar chinese one animals ya pig also five colors elude moment although know two black gold chinese use five elements metal earth wood fire water year urllink year wood monkey guess better urllink year wood cock rooster hehe put anyways age gone years times colors elements truly magical time guess thus age men no mention women least one grandchild preferably grandson one reason happy oldest boy wife father th year foreingers call non koreans even canada reason korea called urllink hermi

###### 3. As we want to make this into a multi-label classification problem, you are required to merge
all the label columns together, so that we have all the labels together for a particular sentence
(7.5 points)
a. Label columns to merge: “gender”, “age”, “topic”, “sign”
b. After completing the previous step, there should be only two columns in your data
frame i.e. “text” and “labels” as shown in the below image

In [24]:
df['gender']=df['gender'].astype('str')
df['age']=df['age'].astype('str')
df['topic']=df['topic'].astype('str')
df['sign']=df['sign'].astype('str')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  object
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(1), object(6)
memory usage: 36.4+ MB


In [26]:
final_df = pd.DataFrame(columns=['text','labels'])

In [27]:
label = df['gender'] + ', '+ df['age'] +  ', ' + df['topic'] + ', ' + df['sign']

In [28]:
labels = '[' + df['gender'] + ', '+ df['age'] +  ', ' + df['topic'] + ', ' + df['sign'] + ']'

In [29]:
labels.head()

0                   [male, 15, Student, Leo]
1                   [male, 15, Student, Leo]
2                   [male, 15, Student, Leo]
3                   [male, 15, Student, Leo]
4    [male, 33, InvestmentBanking, Aquarius]
dtype: object

In [30]:
final_df['text']=clean_text
final_df['labels']=labels

In [31]:
final_df.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


###### 4. Separate features and labels, and split the data into training and testing (5 points)

In [32]:
final_df['label']=label

In [33]:
final_df.head()

Unnamed: 0,text,labels,label
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]","male, 15, Student, Leo"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]","male, 15, Student, Leo"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]","male, 15, Student, Leo"
3,testing testing,"[male, 15, Student, Leo]","male, 15, Student, Leo"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]","male, 33, InvestmentBanking, Aquarius"


In [34]:
X = final_df['text']
y = final_df['label']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =10)

In [36]:
X_train.shape

(510963,)

In [37]:
y_train.shape

(510963,)

In [38]:
X_test.shape

(170321,)

In [39]:
y_test.shape

(170321,)

In [40]:
X_train[10]

'ah korean language looks difficult first figure read hanguel korea surprisingly easy learn alphabet characters seems easy vocabulary starts oh no backwards us sentence structure yikes luckily many options us slow witted foreigners take language course could list urllink joongang article says lot resources urllink well guy motivation jeon ji hyun latest something actually star movies cfs hear means commercial feature not positive saw latest movie sunday night hard describe name english version windstruck korean version yeochinso short ne yeojachingu rul sogayhamnida like introduce girlfriend surprisingly titles make sense like website korean english looks quite good actually urllink movie shown theatres subtitles special times info urllink list many theatres seoul click urllink urllink great reason learn korean already married went foreigners well local korean national course korean take picture put urllink movie hof bar update bud mine passed urllink link giordano ad apparently not ai

In [41]:
y_train[10]

'male, 33, InvestmentBanking, Aquarius'

###### 5. Vectorize the features (5 points)
a. Create a Bag of Words using count vectorizer
i. Use ngram_range=(1, 2)
ii. Vectorize training and testing features
b. Print the term-document matrix


In [42]:
vect = CountVectorizer(ngram_range=(1,2))

In [43]:
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [44]:
X_train_dtm.shape

(510963, 17491472)

In [45]:
X_test_dtm.shape

(170321, 17491472)

In [46]:
vect.get_feature_names()

['aa',
 'aa aa',
 'aa aaa',
 'aa aaaa',
 'aa aaaaa',
 'aa aaaaaa',
 'aa aaaaaaa',
 'aa aaaaaaaa',
 'aa aaaaaaaaa',
 'aa aaaaaaaaaaaa',
 'aa aac',
 'aa aai',
 'aa ab',
 'aa abe',
 'aa ac',
 'aa ace',
 'aa ad',
 'aa advert',
 'aa aeeca',
 'aa affiliate',
 'aa afraid',
 'aa aj',
 'aa alanon',
 'aa albano',
 'aa alcohol',
 'aa ald',
 'aa alkaline',
 'aa already',
 'aa alright',
 'aa also',
 'aa always',
 'aa amazing',
 'aa amount',
 'aa amp',
 'aa angaxaamatqebcyabaceaaaaymkyynjy',
 'aa anger',
 'aa aniga',
 'aa anonymous',
 'aa anyway',
 'aa aolh',
 'aa apparently',
 'aa asked',
 'aa asks',
 'aa assertive',
 'aa au',
 'aa autumn',
 'aa awesome',
 'aa awwmwm',
 'aa back',
 'aa backup',
 'aa ball',
 'aa baseball',
 'aa bateries',
 'aa batteried',
 'aa batteries',
 'aa battery',
 'aa bb',
 'aa bdaee',
 'aa beautiful',
 'aa began',
 'aa begin',
 'aa benefits',
 'aa bet',
 'aa bf',
 'aa bhi',
 'aa bhii',
 'aa big',
 'aa bigbook',
 'aa bile',
 'aa birmingham',
 'aa bitch',
 'aa blogging',
 'aa 

In [47]:
pd.DataFrame(X_test_dtm.toarray(), columns=vect.get_feature_names()).head()

MemoryError: Unable to allocate 21.7 TiB for an array with shape (170321, 17491472) and data type int64

In [None]:
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names()).head()

###### 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will
be the total count of the label. Check below image for reference (5 points)


In [48]:
dict_gender = dict(df['gender'].value_counts())

In [49]:
dict_gender

{'male': 345193, 'female': 336091}

In [50]:
dict_age = dict(df['age'].value_counts())

In [51]:
dict_topic = dict(df['topic'].value_counts())

In [52]:
dict_sign = dict(df['sign'].value_counts())

In [53]:
dictionary = {**dict_gender, **dict_age, **dict_topic, **dict_sign}

In [54]:
dictionary

{'male': 345193,
 'female': 336091,
 '17': 80859,
 '24': 80071,
 '23': 72889,
 '16': 72708,
 '25': 67051,
 '26': 55312,
 '27': 46124,
 '15': 41767,
 '14': 27400,
 '34': 21347,
 '33': 17584,
 '35': 17462,
 '36': 14229,
 '13': 13133,
 '37': 9317,
 '38': 7545,
 '39': 5556,
 '40': 5016,
 '45': 4482,
 '43': 4230,
 '41': 3738,
 '48': 3572,
 '42': 2908,
 '46': 2733,
 '47': 2207,
 '44': 2044,
 'indUnk': 251015,
 'Student': 153903,
 'Technology': 42055,
 'Arts': 32449,
 'Education': 29633,
 'Communications-Media': 20140,
 'Internet': 16006,
 'Non-Profit': 14700,
 'Engineering': 11653,
 'Law': 9040,
 'Publishing': 7753,
 'Science': 7269,
 'Government': 6907,
 'Consulting': 5862,
 'Religion': 5235,
 'Fashion': 4851,
 'Marketing': 4769,
 'Advertising': 4676,
 'BusinessServices': 4500,
 'Banking': 4049,
 'Chemicals': 3928,
 'Telecommunications': 3891,
 'Accounting': 3832,
 'Military': 3128,
 'Museums-Libraries': 3096,
 'Sports-Recreation': 3038,
 'HumanResources': 3010,
 'RealEstate': 2870,
 'Trans

###### 7. Transform the labels - (7.5 points)
As we have noticed before, in this task each example can have multiple tags. To deal with
such kind of prediction, we need to transform labels in a binary form and the prediction will be
a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
a. Convert your train and test labels using MultiLabelBinarizer

In [58]:
y_train.shape

(510963,)

In [59]:
y_test.shape

(170321,)

In [91]:
y_train[1]

'male, 15, Student, Leo'

In [94]:
df1 = y_train.str.split(',', expand=True)
df1.head()

Unnamed: 0,0,1,2,3
208819,male,14,indUnk,Scorpio
301302,female,25,indUnk,Leo
459378,male,14,Student,Scorpio
134831,female,26,Publishing,Cancer
282930,male,16,Student,Cancer


In [95]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510963 entries, 208819 to 345353
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       510963 non-null  object
 1   1       510963 non-null  object
 2   2       510963 non-null  object
 3   3       510963 non-null  object
dtypes: object(4)
memory usage: 39.5+ MB


In [111]:
df1[0].nunique()

2

In [108]:
df1[1].nunique()

26

In [109]:
df1[2].nunique()

40

In [110]:
df1[3].nunique()

12

total 80 unique values in all four columns.

In [78]:
d = defaultdict(MultiLabelBinarizer)

In [189]:
list_encoded = []  # store single matrices
for column in df1:
    d[column].fit(df1[column])
    list_encoded.append(d[column].transform(df1[column]))
y_train_encoded = np.hstack(list_encoded)

In [190]:
y_train_encoded.shape

(510963, 80)

In [191]:
y_train_encoded[1]

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [192]:
df1_test = y_test.str.split(',', expand=True)
df1_test.head()

Unnamed: 0,0,1,2,3
262626,female,27,Consulting,Aquarius
673780,female,36,indUnk,Pisces
322119,female,15,Education,Aquarius
52408,male,27,Religion,Pisces
313202,male,17,Student,Gemini


In [193]:
df1_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170321 entries, 262626 to 284239
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       170321 non-null  object
 1   1       170321 non-null  object
 2   2       170321 non-null  object
 3   3       170321 non-null  object
dtypes: object(4)
memory usage: 11.5+ MB


In [194]:
list_encoded_test = []  # store single matrices
for column in df1_test:
    list_encoded_test.append(d[column].transform(df1_test[column]))
y_test_encoded = np.hstack(list_encoded_test)

In [196]:
y_test_encoded.shape

(170321, 80)

###### 8. Choose a classifier - (5 points)
In this task, we suggest using the One-vs-Rest approach, which is implemented in
OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a
basic classifier, use LogisticRegression. It is one of the simplest methods, but often it
performs good enough in text classification tasks. It might take some time because the
number of classifiers to train is large.
a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on
every label
b. As One-vs-Rest approach might not have been discussed in the sessions, we are
providing you the code for that

In [197]:
clf = LogisticRegression(solver = 'lbfgs')
clf = OneVsRestClassifier (clf)


###### 9. Fit the classifier, make predictions and get the accuracy (5 points)
a. Print the following
i. Accuracy score
ii. F1 score
iii. Average precision score
iv. Average recall score
v. Tip: Make sure you are familiar with all of them. How would you expect the
things to work for the multi-label scenario? Read about micro/macro/weighted
averaging


In [198]:
clf.fit(X_train_dtm, y_train_encoded)

  str(classes[c]))
  str(classes[c]))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for a

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [199]:
y_pred_class = clf.predict(X_test_dtm)

In [200]:
y_pred_class_train = clf.predict(X_train_dtm)

In [201]:
# calculate accuracy
print ('Train Accuracy:' , metrics.accuracy_score(y_train_encoded, y_pred_class_train))
print ('Test Accuracy:' , metrics.accuracy_score(y_test_encoded, y_pred_class))

Train Accuracy: 0.07397208799854392
Test Accuracy: 0.028299505052224918


In [202]:
# calculate accuracy
print (metrics.classification_report(y_test_encoded, y_pred_class))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    170321
           1       1.00      1.00      1.00    170321
           2       0.76      0.52      0.62     83868
           3       1.00      1.00      1.00    170321
           4       1.00      1.00      1.00    170321
           5       1.00      1.00      1.00    170321
           6       0.78      0.26      0.39      1208
           7       0.84      0.61      0.71     60141
           8       0.74      0.53      0.62     81002
           9       0.58      0.20      0.30     45763
          10       0.54      0.18      0.27     40115
          11       0.51      0.13      0.21     32729
          12       0.57      0.15      0.23     35981
          13       0.56      0.18      0.27     34721
          14       0.73      0.13      0.22      2782
          15       0.58      0.06      0.11      1386
          16       1.00      1.00      1.00    170321
          17       0.39    

###### 10. Print true label and predicted label for any five examples (7.5 points)


In [204]:
y_test[8]

'male, 33, InvestmentBanking, Aquarius'

In [212]:
y_test_encoded[8]

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [205]:
y_pred_class[8]

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [206]:
example1 = y_pred_class[8]

In [211]:
d[1].inverse_transform(example1[:2])

IndexError: tuple index out of range

In [177]:
list1, list2, list3, list4, list5 = np.hsplit(merged[0],np.array([2,26,40,12]))

In [178]:
list1.shape

(2,)

In [187]:
df2 = pd.DataFrame(np.hstack((list1, list2, list3 , list4, list5)))

In [188]:
df2.head()

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1


In [136]:
sep_list

[array([[1, 1],
        [1, 1],
        [1, 1],
        ...,
        [1, 1],
        [1, 1],
        [1, 1]]),
 array([[0, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [0, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [0, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 1, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 1, 0],
        [0, 0, 0, ..., 1, 0, 0]]),
 array([], shape=(510963, 0), dtype=int32),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])]

In [146]:
df1_decode = pd.DataFrame(sep_list)

In [147]:
df1_decode.head()

Unnamed: 0,0
0,"[[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1..."
1,"[[0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,..."
2,"[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [..."
3,"[[], [], [], [], [], [], [], [], [], [], [], [..."
4,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [166]:
d[0].inverse_transform((np.hsplit(merged[0],np.array([2,26,40,12]))[0].values))

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [160]:
list_decoded = []
for column in df1:
        list_decoded.extend(d[column].inverse_transform(np.hsplit(merged[0],np.array([2,26,40,12])[column])))


AttributeError: 'list' object has no attribute 'shape'