Exploring several features

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Star import kept last so any names it exports still take precedence, as before.
from source.features import *

In [None]:
# Project resource locations, expressed as relative paths.
RESOURCES_DIR = Path("../resources")
DATASETS_PATH = RESOURCES_DIR.joinpath("datasets")
DUMPS_DIR = RESOURCES_DIR.joinpath("DUMPS")

# Identifier of the word-embedding set.
WORD_EMBEDDINGS_NAME = "glove.42B.300d"

In [None]:
# Toy sentence pair used to try each feature on a single example.
# NOTE(review): both entries hold the same sentence — presumably intentional; confirm.
sentences_pair = dict(
    simple_text="esta es la version complicada",
    original_text="esta es la version complicada",
)

In [None]:
# Character-length feature: build the extractor and apply it to the toy pair.
# get_ratio's return value replaces sentences_pair, so re-running this cell
# feeds the already-processed value back in.
# NOTE(review): presumably get_ratio returns the pair dict with the feature value
# added (a "CLR" column is read from the results later) — confirm in source.features.
char = CharLengthRatio()
sentences_pair = char.get_ratio(sentences_pair)
sentences_pair

In [None]:
# Display the extractor's `name` attribute.
char.name

In [None]:
# Word-length feature: build the extractor, apply it to the running pair, and
# display the result. sentences_pair is rebound to get_ratio's return value.
word_length = WordLengthRatio()
sentences_pair = word_length.get_ratio(sentences_pair)
sentences_pair

In [None]:
# Levenshtein feature: build the extractor, apply it to the running pair, and
# display the result. sentences_pair is rebound to get_ratio's return value.
levenshtein = LevenshteinRatio()
sentences_pair = levenshtein.get_ratio(sentences_pair)
sentences_pair

In [None]:
# Dependency-tree-depth feature: build the extractor, apply it to the running
# pair, and display the result. sentences_pair is rebound to get_ratio's return value.
dependency = DependencyTreeDepthRatio()
sentences_pair = dependency.get_ratio(sentences_pair)
sentences_pair

In [None]:
# Word-rank feature: build the extractor, apply it to the running pair, and
# display the result. sentences_pair is rebound to get_ratio's return value.
word_rank = WordRankRatio()
sentences_pair = word_rank.get_ratio(sentences_pair)
sentences_pair

Exploring feature values from the SimpleText shared task

In [None]:
# Fill-mask feature: build the extractor, apply it to the running toy pair, and
# display the result. This is the last single-example demo; the extractor object
# is reused in the dataset loop further down.
lmfill = LMFillMaskRatio()
sentences_pair = lmfill.get_ratio(sentences_pair)
sentences_pair

In [None]:
# Directory holding the SimpleText files, under the datasets directory.
SIMPLETEXT_DATASET_PATH = DATASETS_PATH.joinpath("simpleText_test")

In [None]:
# Load the two sentence files. Each is read as a header-less, tab-separated
# file into a single named column.
complex_text = pd.read_csv(
    SIMPLETEXT_DATASET_PATH / "simpleText_test.test.complex.txt",
    sep="\t",
    header=None,
    names=["original_text"],
)
simple_text = pd.read_csv(
    SIMPLETEXT_DATASET_PATH / "simpleText_test.test.simple.txt",
    sep="\t",
    header=None,
    names=["simple_text"],
)

In [None]:
# Place the two columns side by side and keep only the first 10 rows,
# then display the resulting frame.
sentences_pairs = pd.concat([complex_text, simple_text], axis=1).iloc[:10]
sentences_pairs

In [None]:
%%time
results = []
for i,row in sentences_pairs.iterrows():
    sentences_pair = dict(original_text=row['original_text'], simple_text=row['simple_text'])
    sentences_pair = char.get_ratio(sentences_pair)
    sentences_pair = word_length.get_ratio(sentences_pair)
    sentences_pair = levenshtein.get_ratio(sentences_pair)
    sentences_pair = dependency.get_ratio(sentences_pair)
    sentences_pair = word_rank.get_ratio(sentences_pair)
    sentences_pair = lmfill.get_ratio(sentences_pair)
    results.append(sentences_pair)

In [None]:
# Gather the per-pair results into a single table.
results_df = pd.DataFrame(results)
# A cell only displays its last expression, so the count goes last;
# previously it came first and its value was discarded.
len(results)

In [None]:
# Display the full results table.
results_df

Summary statistics

In [None]:
# Summary statistics of the results table.
results_df.describe()

In [None]:
# NOTE(review): identical to the preceding cell's call; this cell is redundant
# and can be deleted.
results_df.describe()

Character Length Ratio

In [None]:
# Distribution of the "CLR" column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["CLR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.legend(labels=['Char Length'])
plt.show()

In [None]:
# Distribution of the "WLR" column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
# The per-artist label is dropped: fig.legend already supplies the label text,
# which matches how the other distribution cells are written.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["WLR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.legend(labels=['Word Length'])
plt.show()

In [None]:
# Distribution of the "LR" column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["LR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.legend(labels=['Levenshtein'])
plt.show()

In [None]:
# NOTE(review): this cell plots the same "LR" column as the cell before it;
# it looks like an accidental duplicate and can probably be deleted.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["LR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.legend(labels=['Levenshtein'])
plt.show()

In [None]:
# Distribution of the "DTDR" column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["DTDR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.legend(labels=['Deep Tree'])
plt.show()

In [None]:
# Distribution of the "WRR" column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df["WRR"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.legend(labels=['Word Rank'])
plt.show()

In [None]:
# Distribution of the "LMFMR" column, using explicit unit-wide bin edges from 0 to 6.
# The stray plt.show() that opened this cell is removed: no figure has been
# created at that point, so it had nothing to render.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement, and kde_kws=dict(cut=3) keeps distplot's KDE extent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    results_df["LMFMR"],
    bins=[0, 1, 2, 3, 4, 5, 6],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    ax=ax,
)
fig.legend(labels=['Fill Mask'])
plt.show()

In [None]:
# Persist the computed features. index=False stops the frame's default integer
# index from being written as an extra unnamed first column, which would
# otherwise come back as a spurious "Unnamed: 0" column on a plain read_csv.
# NOTE(review): the file name says "train" but the sentences were loaded from the
# "simpleText_test" dataset — confirm the intended name before relying on it.
results_df.to_csv("simpleText_train_features.csv", index=False)

In [None]:
# Read the saved feature file back from disk as a round-trip check.
res = pd.read_csv("simpleText_train_features.csv")

In [None]:
# Display the frame that was re-loaded from the CSV file.
res