The dataset for this BINARY CLASSIFIER to be used is [bc-dataset.csv].
There are 2 implementations here:
1. **Bag of words approach.**
2. Word vectors (can be pre-trained word embeddings).

The dataset split is 60-40.
Evaluation metrics to be used in this are:
1. Precision.
2. Recall.
3. F-Measure.


References:
1. https://www.kaggle.com/homayoonkhadivi/twitter-gender-classification-high-acc-69-56-nlp#I-gained-the-top-high-accuracy-with-LogisticRegression-algorithm-for-this-problem 
2. https://www.kaggle.com/evilport/classify-gender-with-description-and-text
3. https://www.kaggle.com/orhansertkaya/natural-language-processing-nlp

Loads the dataset.

In [1]:
from ftfy import fix_encoding
import pandas as pd
import random
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.utils import shuffle
import numpy as np
import re
import csv
import nltk as nlp
from nltk.corpus import stopwords

nlp.download("stopwords")
nlp.download('punkt')
nlp.download('wordnet')

def fix_encode(x):
    return fix_encoding(x)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

df = pd.read_csv(
    "datasets/bc-dataset.csv",
    encoding="latin1",
    sep=",",
    quoting=csv.QUOTE_ALL
)

data = pd.concat([df.gender, df['gender:confidence'], df.text], axis=1)

#drop null rows
print("Data Shape: " + str(data.shape))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data Shape: (20050, 3)


Shows some descriptive stats (just for fun).
Also applies ftfy encoding to the ['text'] Column to fix any broken encodings when the .csv file was loaded.

In [2]:
print("Data Shape: " + str(data.shape))
print("Data Columns: " + str(data.columns))

# print("Just some stats.")
# print("------")
# print(data['gender'].describe())
# print("------")
# print(data['gender'].value_counts(ascending=True))

Data Shape: (20050, 3)
Data Columns: Index(['gender', 'gender:confidence', 'text'], dtype='object')


UNKNOWN and NAN values in *gender* column are discarded.

In [3]:
data.gender = [0 if gender == 'female' else 1 for gender in data.gender]

total female tweets:  6700
total male tweets:    6194
total brand tweets:   5942


In [1]:
# for testing purposes
# data.iloc[231]
data.columns

NameError: name 'data' is not defined

In [7]:
print(str(data.shape))
data.dropna(subset=['text', 'gender'], inplace=True)
data = data.reset_index(drop=True)

data.text = data.text.apply(lambda x: fix_encode(x))

# Data Shape: (20050, 26)
# Data Shape: (16306, 26)
print(str(data.shape))

(18836, 3)
(18836, 3)


In [8]:
import string 

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

data['text']=data['text'].apply(lambda x : remove_URL(x))
data['text']=data['text'].apply(lambda x : remove_html(x))
data['text']=data['text'].apply(lambda x : remove_emoji(x))
data['text']=data['text'].apply(lambda x : remove_punct(x))

In [11]:
print("ORIGINAL: ", data.text.iloc[1237])

ORIGINAL:  High marks in the grade cards for these 2 cuties  Great job 


In [12]:
data.drop(data[data['gender:confidence'] < 0.80].index, inplace=True)
data.drop('gender:confidence', axis=1, inplace=True)
# data.dropna(subset=['gender'], inplace=True)
print(str(data.shape))

(13817, 2)


In [13]:
lemma = nlp.WordNetLemmatizer()
description_list = []   # empty list
tweet_list = []   # empty list

# should pronouns be counted as stopwords?

for each in data.text:
    each = re.sub("[^a-zA-Z]"," ", str(each))                                        # regex to clean unnecesarry chars
    each = each.lower()                                                              # lowercase all
    each = nlp.word_tokenize(each)                                                   # split all by tokenizing
    each = [word for word in each if not word in set(stopwords.words("english"))]    # delete stop words from your array
    each = [lemma.lemmatize(word) for word in each]                                  # lemmatize "memories" -> "memory"
    each = " ".join(each)                                                            # make them one string again
    tweet_list.append(each)                                                         # put them into big array

print("ORIGINAL: ", data.text.iloc[123])
print("PREPROCESSED: ", tweet_list[123])

ORIGINAL:  Girl of the month  
PREPROCESSED:  girl month


In [14]:
print(str(data.shape))

(13817, 2)


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# arbitrarily set it 
MAX_FEATURES = 5000

count_vectorizer = CountVectorizer(max_features=MAX_FEATURES, stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(tweet_list).toarray()
words = count_vectorizer.get_feature_names()

In [16]:
print(words[:30])

['ab', 'abba', 'abbott', 'abc', 'ability', 'able', 'abortion', 'abraham', 'absolute', 'absolutely', 'abt', 'abu', 'abuse', 'ac', 'academic', 'academy', 'acc', 'accent', 'accept', 'acceptable', 'accepted', 'accepting', 'access', 'accessing', 'accessory', 'accident', 'accidentally', 'accomplishment', 'according', 'account']


In [17]:
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')
sparse_matrix_tfidf = tfidf_vectorizer.fit_transform(tweet_list).toarray()
words_tfidf = tfidf_vectorizer.get_feature_names()

In [18]:
print(words_tfidf[:30])

['ab', 'abba', 'abbott', 'abc', 'ability', 'able', 'abortion', 'abraham', 'absolute', 'absolutely', 'abt', 'abu', 'abuse', 'ac', 'academic', 'academy', 'acc', 'accent', 'accept', 'acceptable', 'accepted', 'accepting', 'access', 'accessing', 'accessory', 'accident', 'accidentally', 'accomplishment', 'according', 'account']


In [19]:
from sklearn.model_selection import train_test_split
# print(words[:100])

X = sparse_matrix
y = data.gender.values

X_tfidf = sparse_matrix_tfidf

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.4)

In [20]:
# metrics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [21]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred_nb = nb_classifier.predict(X_test)

print(classification_report(y_test, y_pred_nb, digits=5))
print("Mean accuracy: ", nb_classifier.score(X_test, y_test))
print("WEIGHTED F-measure: ", f1_score(y_test, y_pred_nb, average="weighted"))
print("WEIGHTED Precision: ", precision_score(y_test, y_pred_nb, average="weighted"))
print("WEIGHTED Recall: ", recall_score(y_test, y_pred_nb, average="weighted"))

              precision    recall  f1-score   support

           0    0.54689   0.65720   0.59699      2112
           1    0.46953   0.42372   0.44545      1855
           2    0.65779   0.55449   0.60174      1560

    accuracy                        0.54985      5527
   macro avg    0.55807   0.54513   0.54806      5527
weighted avg    0.55223   0.54985   0.54747      5527

Mean accuracy:  0.549846209516917
WEIGHTED F-measure:  0.5474701894393375
WEIGHTED Precision:  0.5522293040760015
WEIGHTED Recall:  0.549846209516917


In [22]:
nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, y_train_tfidf)

y_pred_nb_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)

print(classification_report(y_test_tfidf, y_pred_nb_tfidf, digits=5))
print("Mean accuracy: ", nb_classifier_tfidf.score(X_test_tfidf, y_test_tfidf))
print("WEIGHTED F-measure: ", f1_score(y_test_tfidf, y_pred_nb_tfidf, average="weighted"))
print("WEIGHTED Precision: ", precision_score(y_test_tfidf, y_pred_nb_tfidf, average="weighted"))
print("WEIGHTED Recall: ", recall_score(y_test_tfidf, y_pred_nb_tfidf, average="weighted"))


              precision    recall  f1-score   support

           0    0.51598   0.74482   0.60963      2124
           1    0.48127   0.36339   0.41411      1874
           2    0.70650   0.48332   0.57398      1529

    accuracy                        0.54315      5527
   macro avg    0.56792   0.53051   0.53257      5527
weighted avg    0.55692   0.54315   0.53348      5527

Mean accuracy:  0.543151800253302
WEIGHTED F-measure:  0.5334750207805831
WEIGHTED Precision:  0.5569185913089071
WEIGHTED Recall:  0.543151800253302


Default settings.

In [None]:
from sklearn.metrics import accuracy_score

SVM = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(X_train,y_train)

predictions_SVM = SVM.predict(X_test)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

In [46]:
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

y_pred_gnb = gnb_classifier.predict(X_test)

print(classification_report(y_test, y_pred_nb, digits=5))
print("Mean accuracy: ", gnb_classifier.score(X_test, y_test))
print("WEIGHTED F-measure: ", f1_score(y_test, y_pred_nb, average="weighted"))
print("WEIGHTED Precision: ", precision_score(y_test, y_pred_gnb, average="weighted"))
print("WEIGHTED Recall: ", recall_score(y_test, y_pred_gnb, average="weighted"))

              precision    recall  f1-score   support

           0    0.53722   0.64486   0.58614      2171
           1    0.46228   0.40594   0.43228      1887
           2    0.62342   0.53642   0.57666      1469

    accuracy                        0.53447      5527
   macro avg    0.54097   0.52907   0.53169      5527
weighted avg    0.53455   0.53447   0.53109      5527

Mean accuracy:  0.45286774018454856
WEIGHTED F-measure:  0.5310898368047072
WEIGHTED Precision:  0.4778008853899147
WEIGHTED Recall:  0.45286774018454856


In [36]:
lr_classifier = LogisticRegression(max_iter=5000)
lr_classifier.fit(X_train, y_train)

y_pred_lr = lr_classifier.predict(X_test)

print(classification_report(y_test, y_pred_lr, digits=5))
print("Mean accuracy: ", lr_classifier.score(X_test, y_test))
print("WEIGHTED F-measure: ", f1_score(y_test, y_pred_lr, average="weighted"))
print("WEIGHTED Precision: ", precision_score(y_test, y_pred_lr, average="weighted"))
print("WEIGHTED Recall: ", recall_score(y_test, y_pred_lr, average="weighted"))

              precision    recall  f1-score   support

           0    0.52846   0.59880   0.56143      2171
           1    0.44501   0.41600   0.43002      1887
           2    0.59094   0.52417   0.55556      1469

    accuracy                        0.51656      5527
   macro avg    0.52147   0.51299   0.51567      5527
weighted avg    0.51657   0.51656   0.51500      5527

Mean accuracy:  0.5165550931789398
WEIGHTED F-measure:  0.5150045420458254
WEIGHTED Precision:  0.5165749081319531
WEIGHTED Recall:  0.5165550931789398


In [37]:
rfc_classifier = RandomForestClassifier(n_estimators = 100)
rfc_classifier.fit(X_train, y_train)

y_head_rfc = rfc_classifier.predict(X_test)

print(classification_report(y_test, y_head_rfc, digits=5))
print("Mean accuracy: ", rfc_classifier.score(X_test, y_test))
print("WEIGHTED F-measure: ", f1_score(y_test, y_head_rfc, average="weighted"))
print("WEIGHTED Precision: ", precision_score(y_test, y_head_rfc, average="weighted"))
print("WEIGHTED Recall: ", recall_score(y_test, y_head_rfc, average="weighted"))

              precision    recall  f1-score   support

           0    0.51171   0.59374   0.54968      2171
           1    0.42167   0.39799   0.40949      1887
           2    0.61043   0.50987   0.55564      1469

    accuracy                        0.50461      5527
   macro avg    0.51461   0.50053   0.50494      5527
weighted avg    0.50721   0.50461   0.50340      5527

Mean accuracy:  0.5046137144924914
WEIGHTED F-measure:  0.5033997976294795
WEIGHTED Precision:  0.5072094202377095
WEIGHTED Recall:  0.5046137144924914
