In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv
/kaggle/input/w2v-pretrained/GoogleNews-vectors-negative300.bin


In [2]:
# languange processing imports
import nltk
#nltk.download('punkt')
import re

from sklearn.model_selection import train_test_split

# model imports
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
#import gensim.downloader as api

In [3]:
# find and remove non-ascii words
# Non so se serva, ma si può modificare aggiungendo preprocessing ad-hoc

our_special_word = 'qwerty'

def remove_ascii_words(df):
    """ removes non-ascii characters from the 'texts' column in df.
    It returns the words containig non-ascii characers.
    """
    non_ascii_words = []
    for i in range(len(df)):
        for word in df.loc[i, 'excerpt'].split(' '):
            if any([ord(character) >= 128 for character in word]):
                non_ascii_words.append(word)
                df.loc[i, 'excerpt'] = df.loc[i, 'excerpt'].replace(word, our_special_word)
    return non_ascii_words

In [4]:
# Non so se serva, ma si può modificare aggiungendo preprocessing ad-hoc
def get_good_tokens(sentence):
    replaced_punctation = list(map(lambda token: re.sub('[^0-9A-Za-z!?]+', '', token), sentence))
    removed_punctation = list(filter(lambda token: token, replaced_punctation))
    return removed_punctation

In [5]:
# Here we get transform the documents into sentences for the word2vecmodel
# we made a function such that later on when we make the submission, we don't need to write duplicate code
def w2v_preprocessing(df):
    """ All the preprocessing steps for word2vec are done in this function.
    All mutations are done on the dataframe itself. So this function returns
    nothing.
    """
    df['excerpt'] = df.excerpt.str.lower()
    df['document_sentences'] = df.excerpt.str.split('.')  # split texts into individual sentences
    df['tokenized_sentences'] = list(map(lambda sentences:
                                         list(map(nltk.word_tokenize, sentences)),
                                         df.document_sentences))  # tokenize sentences
    df['tokenized_sentences'] = list(map(lambda sentences:
                                         list(map(get_good_tokens, sentences)),
                                         df.tokenized_sentences))  # remove unwanted characters
    df['tokenized_sentences'] = list(map(lambda sentences:
                                         list(filter(lambda lst: lst, sentences)),
                                         df.tokenized_sentences))  # remove empty lists

In [6]:
train_data =  pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [7]:
train_data.excerpt = train_data['excerpt'].apply(str)
non_ascii_words = remove_ascii_words(train_data)

print("Replaced {} words with characters with an ordinal >= 128 in the test data.".format(
    len(non_ascii_words)))

Replaced 1327 words with characters with an ordinal >= 128 in the test data.


In [8]:
# Pretrained W2V on Google News
#W2Vmodel = api.load('word2vec-google-news-300')
W2Vmodel = KeyedVectors.load_word2vec_format("/kaggle/input/w2v-pretrained/GoogleNews-vectors-negative300.bin", binary = True)

In [9]:
w2v_preprocessing(train_data)

In [10]:
train_data.drop(train_data[train_data.tokenized_sentences.str.len() == 0].index, inplace= True) 

In [11]:
#create dictionary with all sentences
sentences = []
for sentence_group in train_data.tokenized_sentences:
    sentences.extend(sentence_group)

print("Number of sentences: {}.".format(len(sentences)))
print("Number of texts: {}.".format(len(train_data)))

Number of sentences: 25633.
Number of texts: 2834.


In [12]:
def get_w2v_features(w2v_model, sentence_group):
    """ Transform a sentence_group (containing multiple lists
    of words) into a feature vector. It averages out all the
    word vectors of the sentence_group.
    """
    words = np.concatenate(sentence_group)  # words in text
    index2word_set = set(w2v_model.index_to_key) # set(w2v_model.wv.vocab.keys())  # words known to model
    
    featureVec = np.zeros(w2v_model.vector_size, dtype="float32")
    
    # Initialize a counter for number of words in a review
    nwords = 0
    # Loop over each word in the comment and, if it is in the model's vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            featureVec = np.add(featureVec, w2v_model[word])
            nwords += 1.

    # Divide the result by the number of words to get the average
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [13]:
train_data['w2v_features'] = list(map(lambda sen_group:
                                     get_w2v_features(W2Vmodel, sen_group),
                                     train_data.tokenized_sentences))

In [14]:
train_data["w2v_resh_features"] = train_data["w2v_features"].apply(lambda x : x.reshape(1,-1) )

In [15]:
arr_w2v = train_data.w2v_resh_features[0]
for i in range(1, len(train_data)):
    arr_w2v = np.vstack((arr_w2v, train_data.w2v_resh_features[i]))

## Training Set (Text only)

In [16]:
X_train = arr_w2v
y_train = train_data[["target", "standard_error"]].values

### Regression on W2V
Proviamo a vedere se le feature del W2V sono rilevanti per-se 

In [17]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error as mse

### Augmenting W2V

In [18]:
def augment_train_w2v(X_train, y_train, y_std, times = 5):
    
    augmented_w2v_X = X_train.copy()
    augmented_w2v_y = y_train.copy()
    
    for j in range(0, times - 1):
        for i in range(0, X_train.shape[0]):
        
            new_w2v = X_train[i,:] + np.random.uniform(1,8)*1e-4*np.random.randn(300)       # np.random.uniform(1,8)*1e-3*np.random.randn(300)
            augmented_w2v_X = np.vstack((augmented_w2v_X, new_w2v))
            
            new_y = y_train[i] + np.random.choice([-1, 1])*y_std[i]*0.05
            augmented_w2v_y = np.append(augmented_w2v_y, new_y)
        
    return augmented_w2v_X, augmented_w2v_y

In [19]:
aug_X_train, aug_Y_train = augment_train_w2v(X_train, y_train[:,0], y_train[:,1], times = 5)

In [20]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [21]:
aug_X_train, aug_Y_train = unison_shuffled_copies(aug_X_train, aug_Y_train)

In [22]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 1000, verbosity = 1)

xg_reg.fit(aug_X_train, aug_Y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=1)

# Test Data

In [23]:
test_data =  pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
test_data.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [24]:
test_data.excerpt = test_data['excerpt'].apply(str)
non_ascii_words = remove_ascii_words(test_data)

print("Replaced {} words with characters with an ordinal >= 128 in the test data.".format(
    len(non_ascii_words)))

Replaced 0 words with characters with an ordinal >= 128 in the test data.


In [25]:
w2v_preprocessing(test_data)

In [26]:
test_data.drop(test_data[test_data.tokenized_sentences.str.len() == 0].index, inplace= True) 

In [27]:
#create dictionary with all sentences
sentences_test = []
for sentence_group in test_data.tokenized_sentences:
    sentences_test.extend(sentence_group)

print("Number of sentences: {}.".format(len(sentences)))
print("Number of texts: {}.".format(len(test_data)))

Number of sentences: 25633.
Number of texts: 7.


In [28]:
test_data['w2v_features'] = list(map(lambda sen_group:
                                     get_w2v_features(W2Vmodel, sen_group),
                                     test_data.tokenized_sentences))

In [29]:
test_data["w2v_resh_features"] = test_data["w2v_features"].apply(lambda x : x.reshape(1,-1) )

In [30]:
arr_w2v_test = test_data.w2v_resh_features[0]
for i in range(1, len(test_data)):
    arr_w2v_test = np.vstack((arr_w2v_test, test_data.w2v_resh_features[i]))

In [31]:
y_test = xg_reg.predict(arr_w2v_test)

In [32]:
predictions = pd.DataFrame()
predictions['id'] = test_data['id']
predictions['target'] = y_test
predictions.to_csv("submission.csv", index=False)
predictions

Unnamed: 0,id,target
0,c0f722661,-0.93992
1,f0953f0a5,-0.403884
2,0df072751,-0.67597
3,04caf4e0c,-2.002202
4,0e63f8bea,-1.977105
5,12537fe78,-0.98031
6,965e592c0,0.138798


In [33]:
!ls

__notebook__.ipynb  submission.csv
