In [40]:
import os
import numpy as np 
import pandas as pd 

# for preprocessing 
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
# Fetch the NLTK resources this notebook relies on: the stop-word lists
# (read via stopwords.words("english")) and the Punkt sentence tokenizer
# model (loaded from 'tokenizers/punkt/english.pickle').
nltk.download('stopwords')
nltk.download('punkt')


#for model training
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
# Print the path of every file found anywhere below the input directory.
for root, _, names in os.walk('./kaggle/input/'):
    for name in names:
        print(os.path.join(root, name))

In [35]:
DATASET_DIR = './kaggle/input/'
# NOTE(review): DATASET_DIR is not used by the reads below, which take bare
# file names relative to the working directory -- confirm which is intended.

# --- training set ---
TrainX = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
# The target is taken from the raw frame, before any column is removed.
Trainy = TrainX['domain1_score']
# Discard every column that contains a NaN, then the two per-rater columns.
# NOTE(review): 'domain1_score' is not dropped explicitly, so it stays in
# TrainX unless dropna removed it -- confirm that is intended.
TrainX = TrainX.dropna(axis=1).drop(columns=['rater1_domain1', 'rater2_domain1'])
TrainX.head()

# --- test set ---
TestX = pd.read_csv('test_set.tsv', sep='\t', encoding='ISO-8859-1')
Testy = TestX['domain1_predictionid']
TestX = TestX.dropna(axis=1)
TestX.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_predictionid
0,2383,1,I believe that computers have a positive effec...,2383
1,2384,1,"Dear @CAPS1, I know some problems have came up...",2384
2,2385,1,"Dear to whom it @MONTH1 concern, Computers are...",2385
3,2386,1,"Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte...",2386
4,2387,1,"Dear Local newspaper, I think that people have...",2387


In [42]:
# Pre-processing Data
# Helper functions used to clean the essays

# Extra words to filter out of essays; passed as the `extras` argument of the
# cleaning helpers below. The upper-case entries are the names of placeholder
# tags that occur in the essay text as "@CAPS1", "@ORGANIZATION1", "@MONTH1",
# ... (the "@" and the digits are stripped by the cleaning regex).
extra_words = ['us','whose', 'ORGANIZATION', 'PEOPLE', 'LOCATION', 'DATE', 'CAPS',
                   'NUM', 'MONTH', 'YEAR', 'PERCENT', 'TIME', 'MONEY', 'QUANTITY', 'LANGUAGE']

def essayToWords(essay, remove, extras):
    """Remove the tagged labels and word tokenize the text.

    Parameters
    ----------
    essay : str
        Raw text (a whole essay or a single sentence).
    remove : bool
        When true, NLTK's English stop words are dropped.
    extras : iterable of str or None
        Additional words to drop, typically the placeholder tag names
        ("ORGANIZATION", "CAPS", ...). Matching is case-insensitive because
        the tokens are lowercased before filtering. ``None`` or an empty
        iterable disables this filter.

    Returns
    -------
    list of str
        Lowercase, letters-only tokens in their original order.
    """
    # Every character that is not an ASCII letter becomes a space, so a tag
    # such as "@CAPS1" is reduced to the bare word "CAPS".
    essay = re.sub("[^a-zA-Z]", " ", essay)
    words = essay.lower().split()

    # Stop words removed (optional).
    if remove:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # Extra words removed. This uses the `extras` argument rather than a
    # module-level list, and lowercases the entries so that upper-case tag
    # names actually match the lowercased tokens. The filter runs whether or
    # not stop words are removed, as the docstring promises.
    if extras:
        blocked = {str(word).lower() for word in extras}
        words = [w for w in words if w not in blocked]

    return words


def essayToSentences(essay, remove, extras):
    """Split an essay into sentences and word-tokenize each one.

    The stripped essay is cut into sentences with NLTK's English Punkt
    tokenizer; every non-empty sentence is then handed to essayToWords()
    together with ``remove`` and ``extras``.

    Returns a list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essayToWords(chunk, remove, extras)
        for chunk in punkt.tokenize(essay.strip())
        if len(chunk) > 0
    ]

# visualising
# Show the first training essay raw, then as one flat word list, then as a
# list of per-sentence word lists.
sample_essay = TrainX['essay'][0]
print(sample_essay)
print('Words', essayToWords(sample_essay, True, extra_words))
print('Sentences', essayToSentences(sample_essay, True, extra_words))


Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the economy

In [None]:

## Get feature vectors
# def getFeatureVect(words, model, num_features):
#     """Make Feature Vector from the words list of an Essay."""
#     featureVec = np.zeros((num_features,),dtype="float32")
#     num_words = 0.
#     index2word_set = set(model.wv.index2word)
#     for word in words:
#         if word in index2word_set:
#             num_words += 1
#             featureVec = np.add(featureVec,model[word])        
#     featureVec = np.divide(featureVec,num_words)
#     return featureVec

# def getAvgFeatureVecs(essays, model, num_features):
#     """Main function to generate the word vectors for word2vec model."""
#     counter = 0
#     essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
#     for essay in essays:
#         essayFeatureVecs[counter] = getFeatureVect(essay, model, num_features)
#         counter = counter + 1
#     return essayFeatureVecs

In [None]:
# Model
def get_model():
    """Build, compile and summarise the stacked-LSTM model.

    The stack is an LSTM(300) that returns sequences, an LSTM(64), a Dropout
    and a single-unit ReLU Dense output. It is compiled with mean squared
    error loss, the 'adam' optimizer and an MAE metric. The model summary is
    printed as a side effect before the compiled model is returned.
    """
    stack = [
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ]
    network = Sequential(stack)
    network.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    network.summary()

    return network

In [None]:
# Training settings
# NOTE(review): batch_size is not referenced in the visible cells --
# presumably meant for model.fit(); confirm.
batch_size = 16
# Number of iterations of the training loop (`for epoch in range(epochs)`).
epochs = 15
# NOTE(review): not passed to the optimizer; get_model() compiles with the
# string 'adam' -- confirm whether this value should be wired in.
learningRate = 1e-3
# Same value as the 300 hard-coded in get_model()'s LSTM input_shape.
num_features = 300 

In [None]:
# Training - In Works
# Per-epoch bookkeeping. The loss/accuracy lists are not filled in yet.
epoch_num = []
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []

train_essays = TrainX['essay']
test_essays = TestX['essay']

# Obtaining all sentences from the training essays. This does not depend on
# the epoch, so it is done once up front instead of on every pass.
sentences = []
for essay in train_essays:
    # essayToSentences takes (essay, remove, extras) positionally; it has no
    # `remove_stopwords` keyword, and `extras` is a required argument.
    sentences += essayToSentences(essay, True, extra_words)

for epoch in range(epochs):
    epoch_num.append(epoch)
    epoch_loss = 0
    epoch_accuracy = 0

In [None]:
# training
# cv = KFold(n_splits = 5, shuffle = True)
# for traincv, testcv in cv.split(X):

#   print(traincv)
#   print(testcv)
#just indexes of splits


[    0     1     2 ... 12973 12974 12975]
[    7     8    13 ... 12956 12958 12960]
