Exploring several features

In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from source.features import *

In [2]:
# Project resource locations, all derived from RESOURCES_DIR.
# NOTE(review): `Path` is not imported explicitly in this notebook; presumably it
# arrives through `from source.features import *` -- confirm.
RESOURCES_DIR = Path("../resources")
DATASETS_PATH = RESOURCES_DIR.joinpath("datasets")
DUMPS_DIR = RESOURCES_DIR.joinpath("DUMPS")

# Name of the pre-trained word-embedding set.
WORD_EMBEDDINGS_NAME = "glove.42B.300d"

In [3]:
# Toy input pair used to demo each feature; both texts are the same sentence.
sentences_pair = dict(
    simple_text="esta es la version complicada",
    original_text="esta es la version complicada",
)

In [4]:
# Character length ratio feature. As the output below shows, get_ratio returns
# the pair dict with a 'CLR' entry and an 'original_text_preprocessed' string added.
# The dict is fed back into itself, so re-running this cell passes in the
# already-augmented dict (NOTE(review): confirm that this is idempotent).
char = CharLengthRatio()
sentences_pair = char.get_ratio(sentences_pair)
sentences_pair

{'simple_text': 'esta es la version complicada',
 'original_text': 'esta es la version complicada',
 'original_text_preprocessed': 'CLR_1.0 ',
 'CLR': 1.0}

In [5]:
# Short identifier of the feature; it matches the key added to the pair dict ('CLR').
char.name

'CLR'

In [6]:
# Word length ratio feature: adds a 'WLR' entry and appends a 'WLR_...' token to
# 'original_text_preprocessed' (see output below).
word_length = WordLengthRatio()
sentences_pair = word_length.get_ratio(sentences_pair)
sentences_pair

{'simple_text': 'esta es la version complicada',
 'original_text': 'esta es la version complicada',
 'original_text_preprocessed': 'CLR_1.0 WLR_1.0 ',
 'CLR': 1.0,
 'WLR': 1.0}

In [7]:
# Levenshtein ratio feature: adds an 'LR' entry and appends an 'LR_...' token to
# 'original_text_preprocessed' (see output below).
levenshtein = LevenshteinRatio()
sentences_pair = levenshtein.get_ratio(sentences_pair)
sentences_pair

{'simple_text': 'esta es la version complicada',
 'original_text': 'esta es la version complicada',
 'original_text_preprocessed': 'CLR_1.0 WLR_1.0 LR_1.0 ',
 'CLR': 1.0,
 'WLR': 1.0,
 'LR': 1.0}

In [8]:
# Dependency tree depth ratio feature: adds a 'DTDR' entry and appends a
# 'DTDR_...' token to 'original_text_preprocessed' (see output below).
dependency = DependencyTreeDepthRatio()
sentences_pair = dependency.get_ratio(sentences_pair)
sentences_pair

{'simple_text': 'esta es la version complicada',
 'original_text': 'esta es la version complicada',
 'original_text_preprocessed': 'CLR_1.0 WLR_1.0 LR_1.0 DTDR_1.0 ',
 'CLR': 1.0,
 'WLR': 1.0,
 'LR': 1.0,
 'DTDR': 1.0}

In [9]:
# Word rank ratio feature: adds a 'WRR' entry and appends a 'WRR_...' token to
# 'original_text_preprocessed' (see output below).
word_rank = WordRankRatio()
sentences_pair = word_rank.get_ratio(sentences_pair)
sentences_pair

{'simple_text': 'esta es la version complicada',
 'original_text': 'esta es la version complicada',
 'original_text_preprocessed': 'CLR_1.0 WLR_1.0 LR_1.0 DTDR_1.0 WRR_1.0 ',
 'CLR': 1.0,
 'WLR': 1.0,
 'LR': 1.0,
 'DTDR': 1.0,
 'WRR': 1.0}

Exploring feature values from the SimpleText shared task

In [10]:
# LM fill-mask ratio feature: adds an 'LMFMR' entry and appends an 'LMFMR_...'
# token to 'original_text_preprocessed' (see output below).
# This is the last of the six features demonstrated on the toy pair.
lmfill = LMFillMaskRatio()
sentences_pair = lmfill.get_ratio(sentences_pair)
sentences_pair



{'simple_text': 'esta es la version complicada',
 'original_text': 'esta es la version complicada',
 'original_text_preprocessed': 'CLR_1.0 WLR_1.0 LR_1.0 DTDR_1.0 WRR_1.0 LMFMR_1.0 ',
 'CLR': 1.0,
 'WLR': 1.0,
 'LR': 1.0,
 'DTDR': 1.0,
 'WRR': 1.0,
 'LMFMR': 1.0}

In [11]:
# Directory holding the simpleText test split, under the shared datasets folder.
SIMPLETEXT_DATASET_PATH = DATASETS_PATH.joinpath("simpleText_test")

In [12]:
# Load the complex and simplified sentences as single-column frames. The files
# have no header row, so each column is named explicitly.
# NOTE(review): the default CSV quoting is active here; if sentences can contain
# double quotes, rows may be merged -- confirm both frames have the same length.
complex_text = pd.read_csv(
    SIMPLETEXT_DATASET_PATH / "simpleText_test.test.complex.txt",
    sep="\t",
    header=None,
    names=["original_text"],
)
simple_text = pd.read_csv(
    SIMPLETEXT_DATASET_PATH / "simpleText_test.test.simple.txt",
    sep="\t",
    header=None,
    names=["simple_text"],
)

In [13]:
# Put the complex and simplified sentences side by side, aligned on the row index.
# (The former `sentences_pairs = sentences_pairs` self-assignment was a no-op and is dropped.)
sentences_pairs = pd.concat([complex_text, simple_text], axis=1)
sentences_pairs

Unnamed: 0,original_text,simple_text
0,"In the modern era of automation and robotics, ...",Current academic and industrial research is in...
1,With the ever increasing number of unmanned ae...,Drones are increasingly used in the civilian a...
2,Due to guidelines set by the governments regar...,Governments set guidelines on the operation ce...
3,In an attempt to achieve the above mentioned t...,Researchers propose data-driven solutions allo...
4,Derived from the classic image classification ...,"The algorithm, based on the Inception model, d..."
...,...,...
643,Bodybuilders generally train with moderate loa...,Bodybuilders train with moderate loads and sho...
644,"Powerlifters, on the other hand, routinely tra...","Powerlifters, on the other hand, train with hi..."
645,Although both groups are known to display impr...,Although both groups are known to display impr...
646,It has been shown that many factors mediate th...,It has been shown that many factors mediate th...


In [None]:
%%time
results = []
for i,row in sentences_pairs.iterrows():
    sentences_pair = dict(original_text=row['original_text'], simple_text=row['simple_text'])
    sentences_pair = char.get_ratio(sentences_pair)
    sentences_pair = word_length.get_ratio(sentences_pair)
    sentences_pair = levenshtein.get_ratio(sentences_pair)
    sentences_pair = dependency.get_ratio(sentences_pair)
    sentences_pair = word_rank.get_ratio(sentences_pair)
    sentences_pair = lmfill.get_ratio(sentences_pair)
    results.append(sentences_pair)

In [None]:
# Collect the per-pair feature dicts into a single DataFrame (one row per pair).
results_df = pd.DataFrame(results)

# Keep the count as the last expression so the notebook actually displays it;
# it used to come before the assignment, so its value was silently discarded.
len(results)

In [None]:
# Show the computed features (pandas truncates the display for long frames).
results_df

Summary statistics

In [None]:
# Count, mean, std, min, quartiles and max of the numeric feature columns.
results_df.describe()

In [None]:
# NOTE(review): same call as the previous cell -- this one looks redundant.
results_df.describe()

Character Length Ratio

In [None]:
# Distribution of the character length ratio (CLR) over all sentence pairs.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["CLR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Character Length Ratio (CLR)")
fig.legend(labels=['Char Length'])
plt.show()

In [None]:
# Distribution of the word length ratio (WLR) over all sentence pairs.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
# The legend text is supplied once via fig.legend; the redundant `label=` is dropped.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["WLR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Word Length Ratio (WLR)")
fig.legend(labels=['Word Length'])
plt.show()

In [None]:
# Distribution of the Levenshtein ratio (LR) over all sentence pairs.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["LR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Levenshtein Ratio (LR)")
fig.legend(labels=['Levenshtein'])
plt.show()

In [None]:
# NOTE(review): this cell plots the same "LR" column as the preceding cell and
# looks like an accidental duplicate -- consider removing it.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["LR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Levenshtein Ratio (LR)")
fig.legend(labels=['Levenshtein'])
plt.show()

In [None]:
# Distribution of the dependency tree depth ratio (DTDR) over all sentence pairs.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
# The legend text previously read 'Deep Tree'; it now names the feature
# (DependencyTreeDepthRatio) correctly.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["DTDR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Dependency Tree Depth Ratio (DTDR)")
fig.legend(labels=['Dependency Tree Depth'])
plt.show()

In [None]:
# Distribution of the word rank ratio (WRR) over all sentence pairs.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["WRR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("Word Rank Ratio (WRR)")
fig.legend(labels=['Word Rank'])
plt.show()

In [None]:
# Distribution of the LM fill-mask ratio (LMFMR) over all sentence pairs.
# The stray plt.show() that opened this cell (nothing had been drawn yet) is removed.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and a KDE overlay (cut=3) is its documented replacement.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["LMFMR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
ax.set_title("LM Fill-Mask Ratio (LMFMR)")
fig.legend(labels=['Fill Mask'])
plt.show()

In [None]:
# Persist the features to the current working directory. to_csv writes the row
# index as a leading unnamed column by default.
# NOTE(review): the file name says "train", but the features were computed from
# the simpleText_test files loaded above -- confirm the intended name.
results_df.to_csv("simpleText_train_features.csv")

In [None]:
# Reload the saved features to check the round trip. The file was written with
# the row index as its first column, so read it back as the index rather than
# letting it surface as a spurious "Unnamed: 0" data column.
res = pd.read_csv("simpleText_train_features.csv", index_col=0)

In [None]:
# Display the frame reloaded from the CSV to inspect the round trip.
res