The dataset used for this BINARY CLASSIFIER is [bc-dataset.csv].
There are 2 implementations here:
1. Bag of words approach.
2. **Word vectors (can be pre-trained word embeddings).**

The dataset is split 60-40 (train-test).
The evaluation metrics used are:
1. Precision.
2. Recall.
3. F-Measure.

Loads the dataset.

In [None]:
# Settings for the pre-trained GloVe vectors: folder names and vector file name.
glove_filename = 'glove.42B.300d.txt'
root_folder, data_folder = '', 'data'

In [1]:
from ftfy import fix_encoding
import pandas as pd
import numpy as np
import re
import csv
from spellchecker import SpellChecker
import string
import nltk as nlp
from nltk.corpus import stopwords

# One-time NLTK resource downloads (nltk is imported as `nlp` above):
# nlp.download("stopwords")
# nlp.download('punkt')
# nlp.download('wordnet')

# Shared spell-checker instance; it is the default `spell` argument of correct_spellings.
spell = SpellChecker()
# English stop-word list from the NLTK corpus; consulted by tweet_cleaning.
stop_words = stopwords.words("english")

def correct_spellings(x, spell = spell):
    """Replace words unknown to the spell checker with their best correction.

    Args:
        x: Text to correct; it is split on whitespace.
        spell: Spell checker providing ``unknown`` and ``correction``.
            Defaults to the module-level ``spell`` instance.

    Returns:
        The words joined by single spaces, with every unknown word replaced
        by its suggested correction when the checker has one.
    """
    words = x.split()
    misspelled = spell.unknown(words)
    corrected = []
    for word in words:
        if word in misspelled:
            # correction() may return None when the checker has no candidate;
            # keep the original word so join() never receives None.
            word = spell.correction(word) or word
        corrected.append(word)
    return " ".join(corrected)

# Built once at import time: tweet_cleaning is applied to every row.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def tweet_cleaning(x, correct_spelling=True, remove_emojis=True, remove_stop_words=True):
    """Normalise one tweet and return the cleaned text.

    The text is lower-cased and stripped, then URLs, HTML tags and ASCII
    punctuation are removed, in that order. The optional steps run afterwards.

    Args:
        x: Raw tweet text.
        correct_spelling: Run ``correct_spellings`` on the text.
        remove_emojis: Drop every non-ASCII character (this removes emojis,
            and any other non-ASCII text as well).
        remove_stop_words: Remove words found in the module-level
            ``stop_words`` list.

    Returns:
        The cleaned string.
    """
    x = x.lower().strip()

    # remove URLs, then HTML tags, then punctuation
    x = _URL_RE.sub('', x)
    x = _HTML_TAG_RE.sub('', x)
    x = x.translate(_PUNCTUATION_TABLE)

    if correct_spelling:
        x = correct_spellings(x)

    if remove_emojis:
        x = x.encode('ascii', 'ignore').decode('ascii').strip()

    if remove_stop_words:
        x = ' '.join([word for word in x.split(' ') if word not in stop_words])

    # Without this return every cleaned value would be None.
    return x

def fix_encode(x):
    """Return ``x`` after passing it through ftfy's ``fix_encoding``."""
    repaired = fix_encoding(x)
    return repaired

In [3]:
# Lift pandas' display limits on row count and column width.
for option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)

df = pd.read_csv("datasets/bc-dataset.csv", encoding="latin1", sep=",", quoting=csv.QUOTE_ALL)

# Keep only the three columns used below and drop rows missing any of them.
data = df[["gender", "description", "text"]].dropna(axis=0)

Fixes any encoding errors in the description column and applies the cleaning function to the text column.

Note: shouldn't I be applying both to both columns?

In [5]:
# fixes any broken encodings
data.description = data.description.apply(lambda x: fix_encode(x))

# apply the cleaning function
data.text = data.text.apply(tweet_cleaning)

Some fun stats.

In [None]:
print(f"Shape: {data.shape}")

# Summary and per-label counts (rarest label first) of the gender column.
separator = "------"
gender_column = data["gender"]
print("Just some stats.")
print(separator)
print(gender_column.describe())
print(separator)
print(gender_column.value_counts(ascending=True))

In [None]:
# Boolean row masks, one per gender label.
get_female = data["gender"].eq("female")
get_male = data["gender"].eq("male")
get_brand = data["gender"].eq("brand")

# Row subsets selected by each mask.
female_rows = data.loc[get_female]
male_rows = data.loc[get_male]
brand_rows = data.loc[get_brand]

# Counts are taken over the non-null description values of each subset.
print("total female tweets: ", female_rows["description"].count())
print("total male tweets:   ", male_rows["description"].count())
print("total brand tweets:  ", brand_rows["description"].count())

In [None]:
# Encode the class labels as integers. assign() builds new frames instead of
# writing into the filtered selections, which avoids pandas'
# SettingWithCopyWarning for assignment on a slice.
female_rows = female_rows.assign(gender=0)     # female
male_rows = male_rows.assign(gender=1)         # male
brand_rows = brand_rows.assign(gender=2)       # brand

In [None]:
# Stack the three labelled subsets into one frame with a fresh index;
# only rows belonging to these three subsets are carried over.
frames = [female_rows, male_rows, brand_rows]
data = pd.concat(objs=frames, axis=0, ignore_index=True)

In [None]:
import os

from gensim.scripts.glove2word2vec import glove2word2vec

# Path of the raw GloVe text file, built from the folder settings defined earlier.
# NOTE(review): assumes the file lives under root_folder/data_folder — confirm.
glove_path = os.path.join(root_folder, data_folder, glove_filename)

# The converted file is written next to the notebook (current working directory).
word2vec_output_file = glove_filename+'.word2vec'
glove2word2vec(glove_path, word2vec_output_file)

In [None]:
from gensim.models import KeyedVectors

# Load the converted GloVe vectors (word2vec text format, not binary).
word2vec_output_file = f"{glove_filename}.word2vec"
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Show one word embedding.
king_vector = model.get_vector('king')
print('King: ', king_vector)

# Single closest word to woman + king - man.
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print('Most similar word to King + Woman: ', result)

In [None]:
class Word2VecVectorizer:
    """Turn sentences into fixed-size vectors by averaging their word vectors.

    ``model`` must provide ``get_vector(word)`` that returns a 1-D array and
    raises ``KeyError`` for words it does not know.
    """

    def __init__(self, model):
        print("Loading in word vectors...")
        self.word_vectors = model
        print("Finished loading in word vectors")

    def fit(self, data):
        """Do nothing; present so the class offers a fit/transform interface."""
        pass

    def transform(self, data):
        """Return one averaged word vector per sentence.

        Args:
            data: Sized iterable of whitespace-separable strings.

        Returns:
            Array of shape ``(len(data), D)``. A sentence in which no word is
            known to the model keeps an all-zero row.
        """
        # determine the dimensionality of vectors by probing a known word
        self.D = self.word_vectors.get_vector('king').shape[0]

        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = []
            for word in sentence.split():
                try:
                    # throws KeyError if word not found
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    # out-of-vocabulary words do not contribute to the mean
                    continue
            if vecs:
                X[n] = np.mean(vecs, axis=0)
            else:
                emptycount += 1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)
   

In [None]:
vectorizer = Word2VecVectorizer(model)

# Targets are the integer-encoded gender labels.
y = data.gender.values
# Features are the averaged word vectors of the cleaned tweet text.
X = vectorizer.fit_transform(data.text)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for testing.
split = train_test_split(X, y, test_size=0.4)
X_train, X_test, y_train, y_test = split

print(X_train.shape, X_test.shape)

Random Forest Classifier (RFC).

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees (the keyword is `n_estimators`).
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)

# Mean accuracy on the training and held-out splits produced by train_test_split.
print("train score:", clf.score(X_train, y_train))
print("test score:", clf.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report

y_pred_rfc = clf.predict(X_test)

# Per-class precision, recall and F-measure for the random forest.
print(classification_report(y_test, y_pred_rfc, digits=5))

SVM.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyper-parameter grid searched for the RBF-kernel SVM.
PARAMETERS = {'C': [1.0, 10], 'gamma': [1, 'auto', 'scale']}

# 5-fold cross-validated grid search, run on a single job.
# NOTE(review): this rebinds `model`, which earlier held the word vectors.
model = GridSearchCV(SVC(kernel='rbf'), PARAMETERS, cv=5, n_jobs=1)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

y_pred_svm = model.predict(X_test)

# Per-class precision, recall and F-measure for the tuned SVM.
print(classification_report(y_test, y_pred_svm, digits=5))

LightGBM (gradient-boosted trees).

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

def f1_metric(ytrue, preds):
    """Macro-averaged F1 as a custom LightGBM evaluation metric.

    Args:
        ytrue: True labels.
        preds: Scores handed over by LightGBM. A 1-D array with one score per
            sample is thresholded at 0.5; multi-class scores are reduced with
            argmax over the class axis.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)``.
    """
    preds = np.asarray(preds)
    n_samples = len(ytrue)
    if preds.ndim == 1 and preds.size != n_samples:
        # NOTE(review): older LightGBM releases are documented to pass
        # multi-class scores as one flat class-major array
        # (index = class_id * n_samples + row_id) — confirm for the installed version.
        preds = preds.reshape(-1, n_samples).T
    if preds.ndim == 2:
        labels = preds.argmax(axis=1)
    else:
        labels = (preds >= 0.5).astype('int')
    # f1_score expects (y_true, y_pred) in that order.
    return 'f1_score', f1_score(ytrue, labels, average='macro'), True

PARAMETERS_XGB = {
    'learning_rate': 0.06,
    'n_estimators': 1500,
    'colsample_bytree': 0.5,
    'metric': 'f1_score'
}

full_clf = LGBMClassifier(**PARAMETERS_XGB)

# Cast the features to float32 once instead of on every use.
X_train_f32 = X_train.astype(np.float32)
X_test_f32 = X_test.astype(np.float32)

# NOTE(review): the `verbose` argument of fit() was removed in LightGBM 4.0;
# newer versions need callbacks=[lightgbm.log_evaluation(400)] — confirm the installed version.
full_clf.fit(X_train_f32, y_train, eval_set=[(X_train_f32, y_train), (X_test_f32, y_test)],
                verbose = 400, eval_metric=f1_metric)

print("train score:", full_clf.score(X_train_f32, y_train))
print("test score:", full_clf.score(X_test_f32, y_test))

In [None]:
from sklearn.metrics import classification_report

y_pred_xgb = full_clf.predict(X_test.astype(np.float32))

# Per-class precision, recall and F-measure for the LightGBM model.
print(classification_report(y_test, y_pred_xgb, digits=5))