In [156]:
import os
import io
import pandas as pd
import numpy as np
import spacy as sp
from spacy import displacy
from spacy.tokens import SpanGroup
from spacy.tokens import Doc

In [157]:
nlp = sp.load("en_core_web_sm")
pd.set_option('display.max_colwidth', None)

fp = os.path.join('data', 'stevens_cleaned.csv')
laptop_df = pd.read_csv(fp)
laptop_df = laptop_df.set_index("Unnamed: 0")
laptop_df.index.name = "index"

In [158]:
laptop_df

Unnamed: 0_level_0,Student ID,Sentence,Phrase,Phrase Lemma
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,140+45 and i got 185,140+45,140 + 45
0,,140+45 and i got 185,and i got 185,and I get 185
1,547169,He will be abel to ern enough money if he works for 10 hours.,He will be abel to ern enough money,he will be abel to ern enough money
1,547169,He will be abel to ern enough money if he works for 10 hours.,if he works for 10 hours.,if he work for 10 hour .
2,579366,250,250,250
...,...,...,...,...
36126,572506,I first multiplied 45 times 7 because I neede to know how much he is earning each week.,how much he is earning each week.,how much he be earn each week .
36127,572506,Then I added what was in bank account which was $140 to 350 since $350 was what he earns each day.,Then I added what was in bank account which was $140 to 350,then I add what be in bank account which be $ 140 to 350
36127,572506,Then I added what was in bank account which was $140 to 350 since $350 was what he earns each day.,since $350 was what he earns each day.,since $ 350 be what he earn each day .
36128,572506,"After I added 350 to what I got 2 times until I got 1,085.",After I added 350 to what I got 2 times,after I add 350 to what I get 2 time


In [159]:
#combines two rows based on 2 columns of all the rows to combine
def combine(df, left_phrase, right_phrase, order):
    def combine_phrases(ser):
        text = ""
        for i in ser:
            text += i + " "
        return text.strip()
    def combine_nlp(ser):
        return Doc.from_docs(ser.tolist())
    df.index.name = "index"
    comb = df[(df[left_phrase]) | (df[right_phrase])]
    comb = comb.groupby("index").agg({"Student ID": "max", "Sentence": "max", 
                                      "Phrase": combine_phrases, "Phrase Lemma": combine_phrases,
                                      "Phrase NLP": combine_nlp, "Phrase Length": "sum",
                                      "is CCONJ": "first", "is SCONJ": "first",
                                      "Prev Incomplete": "first", order: "first"})
    other = df[~(df[left_phrase]) & (df[right_phrase] != True)]
    return comb.append(other).sort_values(order).drop(columns=[left_phrase, right_phrase, order])

In [160]:
#group each index
#see which permutation of combinations is most optimal
#stats we need: length of phrase, type of conj, is complete phrase

In [164]:
#get nlp and phrase length

laptop_df["Phrase NLP"] = laptop_df["Phrase"].apply(nlp)
laptop_df["Phrase Length"] = laptop_df["Phrase NLP"].apply(len)

In [181]:
def incomplete_phrase(doc):
    has_noun = False
    has_verb = False
    tokens = [x for x in doc]
    for i in tokens:
        if (i.pos_ == "NOUN") | (i.pos_ == "PRON") | (i.pos_ == "PROPN"):
            has_noun = True
        if (i.pos_ == "VERB"):
            has_verb = True
    return (has_noun & has_verb) == 0

In [182]:
laptop_df["Incomplete Phrase"] = laptop_df["Phrase NLP"].apply(incomplete_phrase)
laptop_df

Unnamed: 0_level_0,Student ID,Sentence,Phrase,Phrase Lemma,Phrase NLP,Phrase Length,Incomplete Phrase
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,,140+45 and i got 185,140+45,140 + 45,"(140, +, 45)",3,True
0,,140+45 and i got 185,and i got 185,and I get 185,"(and, i, got, 185)",4,False
1,547169,He will be abel to ern enough money if he works for 10 hours.,He will be abel to ern enough money,he will be abel to ern enough money,"(He, will, be, abel, to, ern, enough, money)",8,False
1,547169,He will be abel to ern enough money if he works for 10 hours.,if he works for 10 hours.,if he work for 10 hour .,"(if, he, works, for, 10, hours, .)",7,False
2,579366,250,250,250,(250),1,True
...,...,...,...,...,...,...,...
36126,572506,I first multiplied 45 times 7 because I neede to know how much he is earning each week.,how much he is earning each week.,how much he be earn each week .,"(how, much, he, is, earning, each, week, .)",8,False
36127,572506,Then I added what was in bank account which was $140 to 350 since $350 was what he earns each day.,Then I added what was in bank account which was $140 to 350,then I add what be in bank account which be $ 140 to 350,"(Then, I, added, what, was, in, bank, account, which, was, $, 140, to, 350)",14,False
36127,572506,Then I added what was in bank account which was $140 to 350 since $350 was what he earns each day.,since $350 was what he earns each day.,since $ 350 be what he earn each day .,"(since, $, 350, was, what, he, earns, each, day, .)",10,False
36128,572506,"After I added 350 to what I got 2 times until I got 1,085.",After I added 350 to what I got 2 times,after I add 350 to what I get 2 time,"(After, I, added, 350, to, what, I, got, 2, times)",10,False


In [183]:
#get type of conjugate Note: SCONJ combines a dependent and independent phrase, CCONJ combines phrases of same level
laptop_df["is CCONJ"] = laptop_df['Phrase NLP'].apply(lambda x:True if x[0].pos_ == "CCONJ" else False)
laptop_df["is SCONJ"] = laptop_df['Phrase NLP'].apply(lambda x:True if x[0].pos_ == "SCONJ" else False)

In [184]:
#get info on the row before
laptop_df["Prev Incomplete"] = laptop_df["Incomplete Phrase"].shift(1).fillna(False)

In [185]:
def find_bottom(row):
    return (row["is CCONJ"] + row["is SCONJ"]) * row["Incomplete Phrase"] * row["Prev Incomplete"] == 1

In [186]:
#make a function that creates two new columns, signifying the rows to combine
def combine_rows(df):
    this_df = df.copy()
    this_df["Bottom"] = df.apply(find_bottom, axis=1)
    this_df["Top"] = this_df["Bottom"].shift(-1).fillna(False)
    this_df["Order"] = range(this_df.shape[0])
    return combine(this_df, "Bottom", "Top", "Order")
#combine_rows(laptop_df.loc[36123])

In [187]:
#fill incomplete phrase nan values

In [188]:
copy = laptop_df.copy()

In [189]:
copy = copy.groupby("index").apply(combine_rows)
copy = copy.droplevel(0)

In [191]:
incomplete = copy[copy["Incomplete Phrase"].isna()]
complete = copy[copy["Incomplete Phrase"].notna()]
incomplete["Incomplete Phrase"] = incomplete["Phrase NLP"].apply(incomplete_phrase)
incomplete.append(complete).sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incomplete["Incomplete Phrase"] = incomplete["Phrase NLP"].apply(incomplete_phrase)


Unnamed: 0_level_0,Student ID,Sentence,Phrase,Phrase Lemma,Phrase NLP,Phrase Length,is CCONJ,is SCONJ,Prev Incomplete,Incomplete Phrase
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,,140+45 and i got 185,140+45,140 + 45,"(140, +, 45)",3,False,False,False,True
0,,140+45 and i got 185,and i got 185,and I get 185,"(and, i, got, 185)",4,True,False,True,False
1,547169,He will be abel to ern enough money if he works for 10 hours.,if he works for 10 hours.,if he work for 10 hour .,"(if, he, works, for, 10, hours, .)",7,False,True,False,False
1,547169,He will be abel to ern enough money if he works for 10 hours.,He will be abel to ern enough money,he will be abel to ern enough money,"(He, will, be, abel, to, ern, enough, money)",8,False,False,False,False
2,579366,250,250,250,(250),1,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
36126,572506,I first multiplied 45 times 7 because I neede to know how much he is earning each week.,how much he is earning each week.,how much he be earn each week .,"(how, much, he, is, earning, each, week, .)",8,False,True,False,False
36127,572506,Then I added what was in bank account which was $140 to 350 since $350 was what he earns each day.,Then I added what was in bank account which was $140 to 350,then I add what be in bank account which be $ 140 to 350,"(Then, I, added, what, was, in, bank, account, which, was, $, 140, to, 350)",14,False,False,False,False
36127,572506,Then I added what was in bank account which was $140 to 350 since $350 was what he earns each day.,since $350 was what he earns each day.,since $ 350 be what he earn each day .,"(since, $, 350, was, what, he, earns, each, day, .)",10,False,True,False,False
36128,572506,"After I added 350 to what I got 2 times until I got 1,085.",After I added 350 to what I got 2 times,after I add 350 to what I get 2 time,"(After, I, added, 350, to, what, I, got, 2, times)",10,False,True,False,False
