In [None]:
# LOAD CLASS AND DATASET 
# dataset: https://www.kaggle.com/rmisra/news-category-dataset
# start:    each doc in corpus contains an article link, category and other (irrelevant) key/value pairs
# end goal: each doc in corpus contains a sentence with POS tagging and gender polarity, and label vector with actual gender
import sys
sys.path.append('../')

%reload_ext autoreload
%autoreload 2
from classes.data_prepper import DataPrepper

p = DataPrepper()

import pandas as pd

df = pd.read_json('../datasets/8_dataset_simple.json')
df['label'] = df.Gender.apply(lambda x: 0 if x == "M" else 1)
df = df.drop('Gender', axis=1)
df = df.rename(columns={'Text': 'text'})


In [21]:
df = df[:20000]

In [23]:
split = round(len(df)*0.6)
imdb_train = df[:split]
imdb_test = df[split:]

In [24]:

import pandas as pd
import re
from os import system, listdir
from os.path import isfile, join
from random import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices

system("mkdir data_preprocessors")
system("mkdir vectorized_data")


# Unigram Counts

unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(imdb_train['text'].values)

dump(unigram_vectorizer, 'data_preprocessors/unigram_vectorizer.joblib')

# unigram_vectorizer = load('data_preprocessors/unigram_vectorizer.joblib')

X_train_unigram = unigram_vectorizer.transform(imdb_train['text'].values)

save_npz('vectorized_data/X_train_unigram.npz', X_train_unigram)

# X_train_unigram = load_npz('vectorized_data/X_train_unigram.npz')


# Unigram Tf-Idf

unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)

dump(unigram_tf_idf_transformer, 'data_preprocessors/unigram_tf_idf_transformer.joblib')

# unigram_tf_idf_transformer = load('data_preprocessors/unigram_tf_idf_transformer.joblib')

X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

save_npz('vectorized_data/X_train_unigram_tf_idf.npz', X_train_unigram_tf_idf)

# X_train_unigram_tf_idf = load_npz('vectorized_data/X_train_unigram_tf_idf.npz')


# Bigram Counts

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(imdb_train['text'].values)

dump(bigram_vectorizer, 'data_preprocessors/bigram_vectorizer.joblib')

# bigram_vectorizer = load('data_preprocessors/bigram_vectorizer.joblib')

X_train_bigram = bigram_vectorizer.transform(imdb_train['text'].values)

save_npz('vectorized_data/X_train_bigram.npz', X_train_bigram)

# X_train_bigram = load_npz('vectorized_data/X_train_bigram.npz')


# Bigram Tf-Idf

bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)

dump(bigram_tf_idf_transformer, 'data_preprocessors/bigram_tf_idf_transformer.joblib')

# bigram_tf_idf_transformer = load('data_preprocessors/bigram_tf_idf_transformer.joblib')

X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

save_npz('vectorized_data/X_train_bigram_tf_idf.npz', X_train_bigram_tf_idf)

# X_train_bigram_tf_idf = load_npz('vectorized_data/X_train_bigram_tf_idf.npz')

In [33]:
imdb_train['text'].values

array(['At least two organizations have decided to drop Morgan Freeman after eight women accused him of inappropriate behavior and sexual harassment ',
       'Women who previously worked with the Oscar winning actor told CNN that he repeatedly made comments about their bodies or their clothing and frequently engaged in inappropriate touching ',
       'In response to the allegations Visa announced it had suspended him from its marketing campaign ',
       ...,
       'People hate Cavaliers owners Dan Gilbert enough that think people won be mad if LeBron leaves ',
       'Or am wrong ',
       'Travis Waldron We ve got LeBron James blowing up the entire league and you want to talk about Julius Randle '],
      dtype=object)

In [25]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

y_train = imdb_train['label'].values

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_bigram, y_train, 'Bigram Counts')
train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Unigram Counts
Train score: 0.92 ; Validation score: 0.79

Unigram Tf-Idf
Train score: 0.89 ; Validation score: 0.79

Bigram Counts
Train score: 0.94 ; Validation score: 0.78

Bigram Tf-Idf
Train score: 0.93 ; Validation score: 0.79



In [26]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

X_train = X_train_bigram_tf_idf


# Phase 1: loss, learning rate and initial learning rate

clf = SGDClassifier()

distributions = dict(
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    learning_rate=['optimal', 'invscaling', 'adaptive'],
    eta0=uniform(loc=1e-7, scale=1e-2)
)

random_search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=distributions,
    cv=5,
    n_iter=50
)
random_search_cv.fit(X_train, y_train)
print(f'Best params: {random_search_cv.best_params_}')
print(f'Best score: {random_search_cv.best_score_}')

Best params: {'eta0': 0.007390054953427734, 'learning_rate': 'adaptive', 'loss': 'modified_huber'}
Best score: 0.73575


In [27]:
# Phase 2: penalty and alpha

clf = SGDClassifier()

distributions = dict(
    penalty=['l1', 'l2', 'elasticnet'],
    alpha=uniform(loc=1e-6, scale=1e-4)
)

random_search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=distributions,
    cv=5,
    n_iter=50
)
random_search_cv.fit(X_train, y_train)
print(f'Best params: {random_search_cv.best_params_}')
print(f'Best score: {random_search_cv.best_score_}')

Best params: {'alpha': 4.04808581253838e-05, 'penalty': 'l1'}
Best score: 0.7341666666666666


In [28]:
system("mkdir classifiers")

sgd_classifier = random_search_cv.best_estimator_

dump(random_search_cv.best_estimator_, 'classifiers/sgd_classifier.joblib')

# sgd_classifier = load('classifiers/sgd_classifier.joblib')

['classifiers/sgd_classifier.joblib']

In [41]:

X_test = bigram_vectorizer.transform(imdb_test['text'].values)
X_test = bigram_tf_idf_transformer.transform(X_test)
y_test = imdb_test['label'].values

score = sgd_classifier.score(X_test, y_test)
y_pred = sgd_classifier.predict(X_test)
print(score)
print(y_pred)

0.635375
[0 0 0 ... 0 0 0]


In [42]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[3251,  579],
       [2338, 1832]], dtype=int64)