In [0]:
import pandas as pd
import numpy as np

In [121]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load the dataset**

In [0]:
project_path = "/content/drive/My Drive/AIML/Projects/NLP/blog-authorship-corpus.zip"

In [0]:
from zipfile import ZipFile
with ZipFile(project_path, 'r') as z:
  z.extractall()

In [0]:
data = pd.read_csv("blogtext.csv")

In [125]:
data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [126]:
data.shape

(681284, 7)

In [0]:
total_samples = 10000

In [128]:
data = data.sample(total_samples) # we are selecting randomly 10000 data
print('New Shape :',data.shape)

New Shape : (10000, 7)


In [129]:
data.reset_index(inplace=True)
data.head()

Unnamed: 0,index,id,gender,age,topic,sign,date,text
0,98108,3426442,female,34,indUnk,Aries,"02,July,2004",I have another blog where I cronicl...
1,341908,3627138,male,37,Arts,Gemini,"06,July,2004","It's official, urlLink John ..."
2,519407,3920708,male,25,Engineering,Virgo,"03,August,2004",This morning I lost an hour and a...
3,526374,2621277,female,17,Consulting,Aquarius,"04,April,2004",I'm a bit discouraged right now....it s...
4,658603,4026864,male,27,Technology,Taurus,"28,July,2004",If I should die this very...


**Preprocess rows of the “text” column**

In [130]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup

In [0]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))  

In [133]:
# Get the number of reviews based on the dataframe column size
num_reviews = data["text"].size
for i in np.arange( 0, num_reviews ):
    data["text"][i] =  review_to_words(data["text"][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
data.drop(columns=['index','id'], inplace=True)

In [0]:
data.drop(columns=['date'], inplace=True)

In [136]:
data.head()

Unnamed: 0,gender,age,topic,sign,text
0,female,34,indUnk,Aries,another blog cronicle consider important event...
1,male,37,Arts,Gemini,official urllink john kerry john edwards democ...
2,male,25,Engineering,Virgo,morning lost hour half life never get back sit...
3,female,17,Consulting,Aquarius,bit discouraged right seems last violin lesson...
4,male,27,Technology,Taurus,die moment fear never known completeness like ...


**Label columns to merge: “gender”, “age”, “topic”, “sign”**

In [0]:
data['labels'] = data[['gender', 'age','topic','sign']].apply(lambda x: ', '.join(x.astype(str)), axis=1)

In [138]:
data.head()

Unnamed: 0,gender,age,topic,sign,text,labels
0,female,34,indUnk,Aries,another blog cronicle consider important event...,"female, 34, indUnk, Aries"
1,male,37,Arts,Gemini,official urllink john kerry john edwards democ...,"male, 37, Arts, Gemini"
2,male,25,Engineering,Virgo,morning lost hour half life never get back sit...,"male, 25, Engineering, Virgo"
3,female,17,Consulting,Aquarius,bit discouraged right seems last violin lesson...,"female, 17, Consulting, Aquarius"
4,male,27,Technology,Taurus,die moment fear never known completeness like ...,"male, 27, Technology, Taurus"


**Separate features and labels, and split the data into training and testing**

In [0]:
X = data['text']
Y = data['labels']

In [140]:
X.head()

0    another blog cronicle consider important event...
1    official urllink john kerry john edwards democ...
2    morning lost hour half life never get back sit...
3    bit discouraged right seems last violin lesson...
4    die moment fear never known completeness like ...
Name: text, dtype: object

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state= 31)

In [143]:
print('X train shape:', X_train.shape)

X train shape: (7000,)


In [144]:
print('X Test shape:', X_test.shape)

X Test shape: (3000,)


In [145]:
print('Y train shape:', Y_train.shape)

Y train shape: (7000,)


In [146]:
print('Y test shape:', Y_test.shape)

Y test shape: (3000,)


**Vectorize the features**

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,2), max_features=100000, stop_words='english') #we are limiting top max_features because there are so many features which are hard to put in memory
train_features = vectorizer.fit_transform(X_train)

In [149]:
print('Train vectorizer features:', vectorizer.get_feature_names())



In [0]:
document_matrix = pd.DataFrame(train_features.toarray(), columns=vectorizer.get_feature_names())

In [0]:
test_features = vectorizer.transform(X_test)

In [152]:
document_matrix.head()

Unnamed: 0,aa,aaa,aaa firm,aaaaaand,aaaaah,aaahhhh,aaand,aahh,aaja,aan,aandava,aanyway,aaragorn,aaron,aarons,aaway,ab,aback,abandon,abandoned,abandoned temple,abandoning,abatul,abatul khaleej,abayat,abba,abbott,abbott costello,abbreviation,abbreviation long,abby,abc,abcess,abdominal,abducted,abduction,abdul,abe,abel,abel offering,...,zhr,zhu,zi,ziegler,ziegler communications,ziek,zillion,zimbabwe,zinc,zine,zing,zip,zipper,znxvat,zobel,zodiac,zodiac sign,zoe,zoe halloween,zoloft,zombie,zombies,zone,zone alarm,zones,zonked,zoo,zoolander,zoom,zooming,zu,zubeidi,zucchini,zurich,zwan,zz,zzz,zzzzzz,zzzzzzzz,zzzzzzzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Create a dictionary to get the count of every label**

In [0]:
def create_dictionary(dataSet):
  dictionary = dict()
  indexes = dataSet.index
  for i in indexes:
    labels = dataSet[i].split(', ')
    for label in labels:
      if (dictionary.get(label)):
        dictionary[label] = dictionary.get(label) + 1
      else:
        dictionary[label] = 1
  return dictionary

In [188]:
y_train_labels = create_dictionary(Y_train)
y_train_labels

{'13': 130,
 '14': 288,
 '15': 413,
 '16': 746,
 '17': 867,
 '23': 751,
 '24': 819,
 '25': 682,
 '26': 538,
 '27': 511,
 '33': 177,
 '34': 220,
 '35': 171,
 '36': 141,
 '37': 101,
 '38': 81,
 '39': 53,
 '40': 43,
 '41': 40,
 '42': 29,
 '43': 43,
 '44': 22,
 '45': 46,
 '46': 29,
 '47': 20,
 '48': 39,
 'Accounting': 23,
 'Advertising': 40,
 'Agriculture': 12,
 'Aquarius': 513,
 'Architecture': 18,
 'Aries': 695,
 'Arts': 351,
 'Automotive': 16,
 'Banking': 42,
 'Biotech': 22,
 'BusinessServices': 46,
 'Cancer': 675,
 'Capricorn': 498,
 'Chemicals': 43,
 'Communications-Media': 210,
 'Construction': 11,
 'Consulting': 58,
 'Education': 309,
 'Engineering': 129,
 'Environment': 7,
 'Fashion': 43,
 'Gemini': 505,
 'Government': 70,
 'HumanResources': 27,
 'Internet': 164,
 'InvestmentBanking': 14,
 'Law': 90,
 'LawEnforcement-Security': 23,
 'Leo': 558,
 'Libra': 629,
 'Manufacturing': 20,
 'Maritime': 2,
 'Marketing': 39,
 'Military': 30,
 'Museums-Libraries': 26,
 'Non-Profit': 150,
 'Pis

In [189]:
y_test_labels = create_dictionary(Y_test)
y_test_labels

{'13': 57,
 '14': 128,
 '15': 194,
 '16': 314,
 '17': 372,
 '23': 353,
 '24': 365,
 '25': 281,
 '26': 235,
 '27': 189,
 '33': 72,
 '34': 82,
 '35': 70,
 '36': 61,
 '37': 32,
 '38': 27,
 '39': 25,
 '40': 20,
 '41': 17,
 '42': 12,
 '43': 22,
 '44': 7,
 '45': 24,
 '46': 15,
 '47': 12,
 '48': 14,
 'Accounting': 17,
 'Advertising': 18,
 'Agriculture': 3,
 'Aquarius': 224,
 'Architecture': 6,
 'Aries': 283,
 'Arts': 136,
 'Automotive': 2,
 'Banking': 14,
 'Biotech': 12,
 'BusinessServices': 23,
 'Cancer': 280,
 'Capricorn': 218,
 'Chemicals': 22,
 'Communications-Media': 97,
 'Construction': 7,
 'Consulting': 23,
 'Education': 133,
 'Engineering': 45,
 'Environment': 2,
 'Fashion': 20,
 'Gemini': 253,
 'Government': 35,
 'HumanResources': 11,
 'Internet': 72,
 'InvestmentBanking': 6,
 'Law': 38,
 'LawEnforcement-Security': 6,
 'Leo': 245,
 'Libra': 275,
 'Manufacturing': 12,
 'Maritime': 1,
 'Marketing': 19,
 'Military': 21,
 'Museums-Libraries': 24,
 'Non-Profit': 66,
 'Pisces': 213,
 'Publ

**Transform the labels**

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

In [0]:
multiLabel = MultiLabelBinarizer()

In [0]:
y_train_transformed = multiLabel.fit_transform(Y_train)

In [0]:
y_test_transformed = multiLabel.transform(Y_test)

In [250]:
y_train_transformed.shape

(7000, 53)

In [251]:
y_test_transformed.shape

(3000, 53)

**Choose a classifier**

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
logistic = LogisticRegression(solver='lbfgs')

In [0]:
model = OneVsRestClassifier(logistic)

In [255]:
model.fit(train_features, y_train_transformed)

  str(classes[c]))
  str(classes[c]))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/s

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

**Fit the classifier,**

In [0]:
pred_data = model.predict(test_features)

In [0]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

In [258]:
print("F1: " + str(f1_score(y_test_transformed, pred_data, average='micro')))
print("F1_macro: " + str(f1_score(y_test_transformed, pred_data, average='macro')))
print("Recall: " + str(recall_score(y_test_transformed, pred_data, average='micro')))
print("Precision: " + str(precision_score(y_test_transformed, pred_data, average='micro')))
print("Accuracy:" + str(accuracy_score(y_test_transformed, pred_data))) 

F1: 0.7084379223788376
F1_macro: 0.29707700920357066
Recall: 0.6272747995896812
Precision: 0.8137259733859044
Accuracy:0.0013333333333333333
