In [54]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\K\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
num_rows = 100
df = pd.read_csv('blogtext.csv', nrows=num_rows)
df.head()


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [56]:
def preprocess_text(text):
    global stop_words
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = ' '.join(text.split())
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    text = ' '.join([word for word in tokens if word.lower() not in stop_words])
    
    return text

df['text'] = df['text'].apply(preprocess_text)

df.head()


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


In [57]:

label_columns = ["gender", "age", "topic", "sign"]

df['labels'] = df.apply(lambda row: ','.join([str(row[col]) for col in label_columns]), axis=1)

df.drop(columns=label_columns, inplace=True)

df.head()


Unnamed: 0,id,date,text,labels
0,2059027,"14,May,2004",info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,2059027,"13,May,2004",team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,2059027,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,2059027,"12,May,2004",testing testing,"male,15,Student,Leo"
4,3581210,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,"male,33,InvestmentBanking,Aquarius"


In [58]:
X = df['text'].values
y = df['labels'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (80,)
X_test shape: (20,)
y_train shape: (80,)
y_test shape: (20,)


In [59]:
X_train

array(['ok youre probably looking title wondering heck connection tenuous goes first four countries likely drive although drive philippines phil younger days first three amount nuttiness ive either witnessed assumed last one dont think id spend much time away beach ok korean driving stuff ive almost years always get question car answer think pretty logical subway system cheap easy pretty fast traffic ii bus system prefect know one take quite nice iii firm driver urllink hyundai urllink equus visits company business iv taxis everywhere city pretty inexpensive v work like meters home iv driving like canada either id cause accident get used way cause one drive back home thus need car actually try stay roads whenever possible perhaps good reason korea one highest traffic fatality injury record developeddeveloping world really quite amazing take look articles urllink urllink youll see mean one reason urllink many koreans use seatbelts another article urllink even dont im back seatespecially

In [60]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [61]:
y_train

array(['male,33,InvestmentBanking,Aquarius', 'female,14,indUnk,Aries',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius', 'female,25,indUnk,Capricorn',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius', 'female,14,indUnk,Aries',
       'male,33,InvestmentBanking,Aquarius', 'female,14,indUnk,Aries',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius',
       'male,33,InvestmentBanking,Aquarius', 'female,25,indUnk,Capricorn',
       

In [62]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

X_train_vectorized = vectorizer.fit_transform(X_train)

X_test_vectorized = vectorizer.transform(X_test)

feature_names = vectorizer.get_feature_names_out()

print("Term-Document Matrix (Training Set):")
print(pd.DataFrame(X_train_vectorized.toarray(), columns=feature_names))

print("Term-Document Matrix (Testing Set):")
print(pd.DataFrame(X_test_vectorized.toarray(), columns=feature_names))


Term-Document Matrix (Training Set):
    aaldering  aaldering urllink  aarde  aarde maak  abandons  abandons hate  \
0           0                  0      0           0         0              0   
1           0                  0      0           0         0              0   
2           0                  0      0           0         0              0   
3           0                  0      0           0         0              0   
4           0                  0      0           0         0              0   
..        ...                ...    ...         ...       ...            ...   
75          0                  0      0           0         0              0   
76          0                  0      0           0         0              0   
77          0                  0      0           0         0              0   
78          0                  0      0           0         0              0   
79          0                  0      0           0         0              0   

  

In [63]:

labels_column = df['labels']

all_labels = [label for labels in labels_column.str.split(',') for label in labels]

label_counts = Counter(all_labels)

for label, count in label_counts.items():
    print(f"{label}: {count}")


male: 74
15: 4
Student: 7
Leo: 4
33: 70
InvestmentBanking: 70
Aquarius: 70
female: 26
14: 21
indUnk: 23
Aries: 21
25: 2
Capricorn: 2
17: 3
Gemini: 3


In [64]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train_binary = mlb.fit_transform([labels.split(',') for labels in y_train])

y_test_binary = mlb.transform([labels.split(',') for labels in y_test])

print("Transformed Training Labels:")
print(y_train_binary)

print("\nTransformed Testing Labels:")
print(y_test_binary)


Transformed Training Labels:
[[0 0 0 ... 0 0 1]
 [1 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [1 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 1]]

Transformed Testing Labels:
[[1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 1 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]]


In [65]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

classifier = LogisticRegression(solver='lbfgs')

ovr_classifier = OneVsRestClassifier(classifier)

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', ovr_classifier)
])

pipeline.fit(X_train, y_train_binary)

y_pred_binary = pipeline.predict(X_test)

print("Predicted Testing Labels (Binary Form):")
print(y_pred_binary)


Predicted Testing Labels (Binary Form):
[[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 1 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]


In [66]:


y_pred_proba = pipeline.predict_proba(X_test)


threshold = 0.5
y_pred_binary_thresholded = (y_pred_proba > threshold).astype(int)

accuracy = accuracy_score(y_test_binary, y_pred_binary_thresholded)
f1 = f1_score(y_test_binary, y_pred_binary_thresholded, average='weighted') 
average_precision = average_precision_score(y_test_binary, y_pred_proba, average='weighted')
average_recall = recall_score(y_test_binary, y_pred_binary_thresholded, average='weighted')


print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Average Precision Score: {average_precision:.4f}")
print(f"Average Recall Score: {average_recall:.4f}")


Accuracy: 0.6000
F1 Score: 0.7785
Average Precision Score: 0.9975
Average Recall Score: 0.6875


In [67]:
y_pred_binary_thresholded.shape

(20, 15)

In [68]:
y_test_binary.shape

(20, 15)

In [69]:

for i in range(5):
    print(f"Example {i + 1} - Text: {selected_X_test[i]}")
    
    true_labels_array = np.array([y_test_binary_array[i]])
    predicted_labels_array = np.array([predicted_labels[i]])


    print(f"True Labels: {mlb.inverse_transform(true_labels_array)[0]}")
    print(f"Predicted Labels: {mlb.inverse_transform(predicted_labels_array)[0]}")
    
    print("\n" + "="*50 + "\n")


Example 1 - Text: often wondered restauranteurs heated briquettes brinks rocks placed table traditional korean cookingthen saw place good pic taxi phonecamera actually fire sidewalk right front guy pickin redwhitehot brick patrons inside korea urllink fire hole
True Labels: ('14', 'Aries', 'female', 'indUnk')
Predicted Labels: ('male',)


Example 2 - Text: like hes funny much common cute sick mind likes friends know huh unlike guys actually match super sweet never mad intelligent open almost ok hes blunt witty gay see good ones yes ladys real taken dunno hes didnt think hed like girl well like mean im brighti almost get streight fs im fat im ugly im bitch sometimes im clinicly insane realy seems like lot deal dont think dont know boy sees like dream come true every girl dreams day perfect guy show one day even girl like well mericals happen guess theres hope even girl like
True Labels: ('33', 'Aquarius', 'InvestmentBanking', 'male')
Predicted Labels: ('14', 'Aries', 'female', 'indUnk')