In [0]:
import os
import json
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
import torch
import torchtext
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import spacy
import en_core_web_sm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
import time

# Base directory for every CSV this notebook reads or writes. File names are
# appended with plain string concatenation (path+"subset_100k.csv"), so the
# trailing slash is required. The location is under /content/drive, i.e. it is
# only reachable after Google Drive has been mounted there.
path = "/content/drive/My Drive/Colab Notebooks/data/"


In [3]:
# Colab-only step: attach Google Drive at /content/drive so that the CSV files
# under `path` become visible. Prompts for an authorization code when run.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# Fix the PyTorch RNG seed and choose the compute device (GPU when available).
manual_seed = 77
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    # Seed every visible GPU; torch.cuda.manual_seed would only cover the
    # current device, leaving the others unseeded when n_gpu > 1.
    torch.cuda.manual_seed_all(manual_seed)

cuda


# Logistic Regression Baseline

In [0]:
# Load the 100k-row subset and split it 60/20/20 into train/validate/test.
# NOTE: the 1k loader cell assigns the same names (df, train, validate, test,
# *_texts, *_labels); whichever loader ran last decides what later cells see.

# index_col=0 reads the file's first column as the index, and
# reset_index(drop=True) then discards it in favour of a fresh 0..n-1 index.
df = pd.read_csv(path+"subset_100k.csv", index_col=0, encoding="utf-8").reset_index(drop=True)

# Shuffle the rows once with a fixed seed, then drop the "index" column so
# only text and label remain.
df_shuffled = df.sample(frac=1, random_state=123)
df_shuffled = df_shuffled.drop(columns="index")

# Positional 60/20/20 split of the shuffled frame. Plain .iloc slicing yields
# the same three frames as np.split did, without routing the DataFrame through
# DataFrame.swapaxes (deprecated in recent pandas) and without shuffling twice.
train_end = int(.6*len(df))
valid_end = int(.8*len(df))
train = df_shuffled.iloc[:train_end]
validate = df_shuffled.iloc[train_end:valid_end]
test = df_shuffled.iloc[valid_end:]

# Transpose the rows into per-column tuples; this unpacking requires exactly
# two columns (text, label) to be left in each frame.
train_texts, train_labels = zip(*train.values) #resulting type is tuples
valid_texts, valid_labels = zip(*validate.values)
test_texts, test_labels = zip(*test.values)

In [0]:
# Persist the current train/validate/test frames as the 100k split files.
# The row index is not written (index=False).
for file_name, split_frame in (
    ("subset_100k_train.csv", train),
    ("subset_100k_valid.csv", validate),
    ("subset_100k_test.csv", test),
):
    split_frame.to_csv(path+file_name, index=False)

In [0]:
# Load the 1k-row subset and split it 60/20/20 into train/validate/test.
# NOTE: this rebinds the same names as the 100k loader cell (df, train,
# validate, test, *_texts, *_labels); whichever loader ran last decides what
# later cells see.

# index_col=0 reads the file's first column as the index, and
# reset_index(drop=True) then discards it in favour of a fresh 0..n-1 index.
df = pd.read_csv(path+"subset_1k.csv", index_col=0, encoding="utf-8").reset_index(drop=True)

# Shuffle the rows once with a fixed seed, then drop the "index" column so
# only text and label remain.
df_shuffled = df.sample(frac=1, random_state=123)
df_shuffled = df_shuffled.drop(columns="index")

# Positional 60/20/20 split of the shuffled frame. Plain .iloc slicing yields
# the same three frames as np.split did, without routing the DataFrame through
# DataFrame.swapaxes (deprecated in recent pandas) and without shuffling twice.
train_end = int(.6*len(df))
valid_end = int(.8*len(df))
train = df_shuffled.iloc[:train_end]
validate = df_shuffled.iloc[train_end:valid_end]
test = df_shuffled.iloc[valid_end:]

# Transpose the rows into per-column tuples; this unpacking requires exactly
# two columns (text, label) to be left in each frame.
train_texts, train_labels = zip(*train.values) #resulting type is tuples
valid_texts, valid_labels = zip(*validate.values)
test_texts, test_labels = zip(*test.values)

In [0]:
# Persist the current train/validate/test frames as the 1k split files.
# The row index is not written (index=False).
for file_name, split_frame in (
    ("subset_1k_train.csv", train),
    ("subset_1k_valid.csv", validate),
    ("subset_1k_test.csv", test),
):
    split_frame.to_csv(path+file_name, index=False)

In [7]:
# Sanity check: row counts of the three split frames, followed by the lengths
# of the text tuples derived from them (each pair should agree).
for split_frame in (train, validate, test):
    print(len(split_frame))

for split_texts in (train_texts, valid_texts, test_texts):
    print(len(split_texts))

60000
20000
20000
60000
20000
20000


In [0]:
#1k subset
# TF-IDF + logistic-regression baseline.
# NOTE(review): this cell does not load data itself; it uses whatever
# train/valid/test texts and labels are currently bound, so the 1k loader cell
# must be the most recently executed loader for the label above to hold.

n_jobs = None
verbose = False

# Unigram+bigram TF-IDF features; the vocabulary is learned on train only and
# then applied unchanged to the validation and test texts.
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
train_features = vect.fit_transform(train_texts)
valid_features, test_features = (
    vect.transform(texts) for texts in (valid_texts, test_texts)
)

# Choose the regularisation strength C by 5-fold CV over train+valid combined.
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]} #changed from original code
model = LogisticRegression(solver='liblinear')
search = GridSearchCV(estimator=model, param_grid=params, cv=5,
                      n_jobs=n_jobs, verbose=verbose)
search_labels = train_labels + valid_labels  # tuple concatenation, train first
search.fit(sparse.vstack([train_features, valid_features]), search_labels)
print(search.best_params_)

# Refit with the selected C on the training split only, then report accuracy
# (in percent) on the validation and test splits.
model = model.set_params(**search.best_params_)
model.fit(train_features, train_labels)
valid_accuracy = 100. * model.score(valid_features, valid_labels)
test_accuracy = 100. * model.score(test_features, test_labels)
data = dict(valid_accuracy=valid_accuracy, test_accuracy=test_accuracy)
print(data)

{'C': 10}
{'valid_accuracy': 65.5, 'test_accuracy': 63.0}


In [0]:
#100k subset
# TF-IDF + logistic-regression baseline.
# NOTE(review): this cell does not load data itself; it uses whatever
# train/valid/test texts and labels are currently bound, so the 100k loader
# cell must be the most recently executed loader for the label above to hold.

n_jobs = None
verbose = False

# Unigram+bigram TF-IDF features; the vocabulary is learned on train only and
# then applied unchanged to the validation and test texts.
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
train_features = vect.fit_transform(train_texts)
valid_features, test_features = (
    vect.transform(texts) for texts in (valid_texts, test_texts)
)

# Choose the regularisation strength C by 5-fold CV over train+valid combined.
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]} #changed from original code
model = LogisticRegression(solver='liblinear')
search = GridSearchCV(estimator=model, param_grid=params, cv=5,
                      n_jobs=n_jobs, verbose=verbose)
search_labels = train_labels + valid_labels  # tuple concatenation, train first
search.fit(sparse.vstack([train_features, valid_features]), search_labels)
print(search.best_params_)

# Refit with the selected C on the training split only, then report accuracy
# (in percent) on the validation and test splits.
model = model.set_params(**search.best_params_)
model.fit(train_features, train_labels)
valid_accuracy = 100. * model.score(valid_features, valid_labels)
test_accuracy = 100. * model.score(test_features, test_labels)
data = dict(valid_accuracy=valid_accuracy, test_accuracy=test_accuracy)
print(data)

{'C': 100}
{'valid_accuracy': 88.23, 'test_accuracy': 88.205}
