In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
import gensim.downloader as gensim
from nltk.tokenize import word_tokenize

In [9]:
# Load the tab-separated splits. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the text are kept as literal data.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
x_train = pd.read_table('train/in.tsv', sep='\t', header=None, on_bad_lines='skip', quoting=3)
y_train = pd.read_table('train/expected.tsv', sep='\t', header=None, quoting=3)
y_train = y_train[0]
x_dev = pd.read_table('dev-0/in.tsv', sep='\t', header=None, quoting=3)
x_test = pd.read_table('test-A/in.tsv', sep='\t', header=None, quoting=3)

# Malformed rows are dropped from the training inputs only, so a skipped row
# would leave inputs and labels with different lengths. Fail here, before the
# slow tokenisation / embedding steps, instead of later inside `fit`.
if len(x_train) != len(y_train):
    raise ValueError(
        'train/in.tsv has %d usable rows but train/expected.tsv has %d labels'
        % (len(x_train), len(y_train))
    )

# Only the first column is used as the document text; lower-case it before
# tokenising.
x_train = x_train[0].str.lower()
x_dev = x_dev[0].str.lower()
x_test = x_test[0].str.lower()

# Each document becomes a list of tokens.
x_train = [word_tokenize(x) for x in x_train]
x_dev = [word_tokenize(x) for x in x_dev]
x_test = [word_tokenize(x) for x in x_test]

# Pretrained embeddings loaded through gensim's downloader API
# (presumably fetched on first use -- confirm against the gensim docs).
word2vec = gensim.load('glove-wiki-gigaword-50')

def document_vector(doc):
    """Return the average embedding of the tokens in ``doc``.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document. Tokens that are not in the module-level
        ``word2vec`` vocabulary are ignored.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known tokens, or a zero
        vector of length ``word2vec.vector_size`` when no token is known.
    """
    known_vectors = [word2vec[word] for word in doc if word in word2vec]
    if not known_vectors:
        # Size the fallback from the loaded model instead of hard-coding 50,
        # so switching to embeddings of another dimension keeps every
        # document vector the same length.
        return np.zeros(word2vec.vector_size)
    return np.mean(known_vectors, axis=0)

# Replace every tokenised document with its averaged embedding vector.
x_train = list(map(document_vector, x_train))
x_dev = list(map(document_vector, x_dev))
x_test = list(map(document_vector, x_test))

In [20]:
# Linear Regression
#
# Fit a least-squares model on the document vectors and write one prediction
# per line for the dev and test splits.

model = LinearRegression()
model.fit(x_train, y_train)

y_dev = model.predict(x_dev)
y_test = model.predict(x_test)

# The regression output is unbounded, so clamp it to [0, 1]. A single
# vectorised clip replaces the previous per-element `apply` passes.
Y_dev = pd.DataFrame({'label': np.clip(y_dev, 0, 1)})
Y_test = pd.DataFrame({'label': np.clip(y_test, 0, 1)})

Y_dev.to_csv(r'dev-0/linear_out.tsv', sep='\t', index=False,  header=False)
Y_test.to_csv(r'test-A/linear_out.tsv', sep='\t', index=False,  header=False)

In [28]:
# Logistic Regression
#
# Train an lbfgs-solved logistic model on the document vectors, then dump the
# predicted class for each dev / test document, one per line.

model = LogisticRegression(solver='lbfgs', max_iter=100000).fit(x_train, y_train)

y_dev, y_test = model.predict(x_dev), model.predict(x_test)

Y_dev = pd.DataFrame({'label': y_dev})
Y_test = pd.DataFrame({'label': y_test})

for frame, target in (
    (Y_dev, r'dev-0/logistic_out.tsv'),
    (Y_test, r'test-A/logistic_out.tsv'),
):
    frame.to_csv(target, sep='\t', index=False, header=False)

In [29]:
# SGDClassifier
#
# Linear classifier trained with stochastic gradient descent; predictions for
# the dev and test splits are written one label per line.

# SGD shuffles the training data, so without a fixed seed every run produces
# different output files. Pin the seed to make the cell reproducible.
model = SGDClassifier(max_iter=100000, random_state=42)
model.fit(x_train, y_train)

y_dev = model.predict(x_dev)
y_test = model.predict(x_test)

Y_dev = pd.DataFrame({'label':y_dev})
Y_test = pd.DataFrame({'label':y_test})

Y_dev.to_csv(r'dev-0/SGD_out.tsv', sep='\t', index=False,  header=False)
Y_test.to_csv(r'test-A/SGD_out.tsv', sep='\t', index=False,  header=False)