In [None]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

import warnings

# SettingWithCopyWarning is publicly exposed as pandas.errors.SettingWithCopyWarning
# (pandas >= 1.5); the private pandas.core.common path only works on older
# releases. Try the public location first, then the legacy one, and skip the
# filter entirely if this pandas build does not define the warning at all.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
def find_mere_sents(synt, targetExp):
    """Tell whether "mere" is attached to the target expression in a parse.

    Args:
        synt: iterable of parsed tokens; each token is indexable, with the
            token text at position 0 and, at position 3, the string that is
            compared against the target (presumably the syntactic head of the
            token -- TODO confirm against the parser output format).
        targetExp: the target expression to look for (compared
            case-insensitively).

    Returns:
        True if at least one token is "mere" (any casing) and its position-3
        value equals ``targetExp`` ignoring case; False otherwise.
    """
    # The target is lower-cased lazily, only once a "mere" token is actually
    # encountered, so parses without "mere" never touch ``targetExp``.
    return any(
        token[0].lower() == "mere" and token[3].lower() == targetExp.lower()
        for token in synt
    )

In [None]:
def find_relv_tokens(synt, tfidf, stop_words=None, top_n=5):
    """Return the highest-scoring content words of a parsed sentence.

    Args:
        synt: iterable of parsed tokens; each token is indexable, with the
            token text at position 0 and its part-of-speech tag at position 1.
        tfidf: DataFrame indexed by term with a ``'TF-IDF'`` score column.
            Terms are expected to be lower-case, which is what
            ``TfidfVectorizer`` produces with its default settings.
        stop_words: optional iterable of words to exclude. Defaults to the
            NLTK English stop-word list.
        top_n: maximum number of terms to return (default 5).

    Returns:
        List of at most ``top_n`` index labels of ``tfidf``, ordered by
        decreasing ``'TF-IDF'`` score, restricted to the sentence's
        adjectives, adverbs, nouns, proper nouns and verbs that are not
        stop words.
    """
    if stop_words is None:
        # Fetch the list once per call rather than once per token.
        stop_words = stopwords.words('english')
    # Set membership is O(1); lower-casing makes the filter case-insensitive.
    excluded = {word.lower() for word in stop_words}
    content_tags = {"ADJ", "ADV", "NOUN", "PROPN", "VERB"}

    # Tokens are lower-cased so that capitalised words (sentence-initial
    # words, proper nouns) are both filtered against the lower-case stop-word
    # list and matched against the lower-case TF-IDF vocabulary.
    tokens = {
        t[0].lower()
        for t in synt
        if t[1] in content_tags and t[0].lower() not in excluded
    }
    return tfidf[tfidf.index.isin(list(tokens))].nlargest(top_n, 'TF-IDF').index.values.tolist()

In [None]:
# For every corpus: flag the sentences in which "mere" modifies the target
# expression, attach the most relevant (TF-IDF) tokens to those sentences, and
# export both the full table and the "mere"-only subset as TSV, JSON and pickle.
for corpus in ["JSA", "RSC", "HMD", "BLB"]:
    prefix = "experiments/" + corpus.lower()
    df = pd.read_pickle(prefix + "_clusters.pkl")
    df["fullctxt"] = df["prevSentence"] + " " + df["currentSentence"] + " " + df["nextSentence"]

    # Fit TF-IDF on the sentences; row i of the matrix belongs to row i of df.
    tfIdfVectorizer = TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(df["currentSentence"].values.astype('U'))
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on installations that predate get_feature_names_out().
    if hasattr(tfIdfVectorizer, "get_feature_names_out"):
        vocabulary = tfIdfVectorizer.get_feature_names_out()
    else:
        vocabulary = tfIdfVectorizer.get_feature_names()

    df["mere_sent"] = df.apply(lambda x: find_mere_sents(x.synt, x.targetExpression), axis=1)

    # Explicit copy: a new column is added below, and writing to a bare slice
    # of df is what triggers SettingWithCopyWarning.
    meredf = df[df["mere_sent"].astype(bool)].copy()

    # Score each "mere" sentence with ITS OWN TF-IDF row. The previous code
    # built the score table from tfIdf[0], i.e. it ranked the tokens of every
    # sentence by their weight in the first sentence of the corpus.
    # NOTE(review): per-sentence scoring is assumed to be the intent -- confirm.
    mere_positions = [pos for pos, is_mere in enumerate(df["mere_sent"]) if is_mere]
    meredf["tfidf_top_tokens"] = [
        find_relv_tokens(
            synt,
            pd.DataFrame({"TF-IDF": tfIdf[pos].toarray().ravel()}, index=vocabulary),
        )
        for pos, synt in zip(mere_positions, meredf["synt"])
    ]

    # Full table, including the boolean "mere_sent" flag.
    df.to_csv(prefix + "_withmere.tsv", sep="\t")
    df.to_json(prefix + "_withmere.json")
    df.to_pickle(prefix + "_withmere.pkl")

    # Only the sentences in which "mere" modifies the target expression.
    meredf.to_csv(prefix + "_onlymere.tsv", sep="\t")
    meredf.to_json(prefix + "_onlymere.json")
    meredf.to_pickle(prefix + "_onlymere.pkl")

In [None]:
# Report, per corpus, the share of rows flagged as "mere" sentences in the
# previously exported "_withmere" pickle.
print("Number of mere machines divided by all machines:")
for corpus in ["JSA", "RSC", "HMD", "BLB"]:
    df = pd.read_pickle("experiments/" + corpus.lower() + "_withmere.pkl")
    mere_count = len(df[df["mere_sent"] == True])
    total_count = len(df)
    # Plain Python ints on both sides, rounded to six decimals for display.
    share = round(mere_count / total_count, 6)
    print("*", corpus + ":\t", share)