In [1]:
import pandas as pd

# Read in the cleaned and labelled data

In [2]:
cleaned_labelled_data = pd.read_csv('../../data/IMDB_datasets/clean_imdb_dataset.csv')

In [3]:
just_review_and_label = cleaned_labelled_data.iloc[:, 0:2]

In [33]:
just_review_and_label['review'][0]

just_review_and_label.shape[0]

49581

In [36]:
just_review_and_label['words'] = [0 for _ in range(just_review_and_label.shape[0])]


for review_index in range(just_review_and_label.shape[0]):
    just_review_and_label['words'][review_index] = just_review_and_label['review'][review_index].split()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  just_review_and_label['words'][review_index] = just_review_and_label['review'][review_index].split()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

In [37]:
just_review_and_label

Unnamed: 0,review,sentiment,words
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production The filming te...,positive,"[A, wonderful, little, production, The, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically theres a family where a little boy J...,negative,"[Basically, theres, a, family, where, a, littl..."
4,Petter Matteis Love in the Time of Money is a ...,positive,"[Petter, Matteis, Love, in, the, Time, of, Mon..."
...,...,...,...
49576,I thought this movie did a down right good job...,positive,"[I, thought, this, movie, did, a, down, right,..."
49577,Bad plot bad dialogue bad acting idiotic direc...,negative,"[Bad, plot, bad, dialogue, bad, acting, idioti..."
49578,I am a Catholic taught in parochial elementary...,negative,"[I, am, a, Catholic, taught, in, parochial, el..."
49579,Im going to have to disagree with the previous...,negative,"[Im, going, to, have, to, disagree, with, the,..."


# Read in each of the three cleaned and NE tagged data

In [26]:
def read_conll_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

In [None]:
test_10 = read_conll_file('../../data/imdb_test_train_datasets/test_tagged/test_9010_tagged_output.iob2')

test_50 = read_conll_file('../../data/imdb_test_train_datasets/test_tagged/test_5050_tagged_output.iob2')

test_90 = read_conll_file('../../data/imdb_test_train_datasets/test_tagged/test_1090_tagged_output.iob2')

In [28]:
test_10_df = pd.DataFrame(test_10, columns = ['words', 'ne_tags'])
test_50_df = pd.DataFrame(test_50, columns = ['words', 'ne_tags'])
test_90_df = pd.DataFrame(test_90, columns = ['words', 'ne_tags'])

In [29]:
def prepare_space_for_sentiment(tagged_dataframe):

    size = tagged_dataframe.shape[0]

    tagged_dataframe['sentiment'] = [0 for _ in range(size)]
    tagged_dataframe['review'] = [0 for _ in range(size)]

    for point in range(size):
        tagged_dataframe['review'][point] = ' '.join(tagged_dataframe['words'][point])

    prepared_dataframe = tagged_dataframe

    return prepared_dataframe
    

In [30]:
test_10_prepped = prepare_space_for_sentiment(test_10_df)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  tagged_dataframe['review'][point] = ' '.join(tagged_dataframe['words'][point])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [31]:
test_50_prepped = prepare_space_for_sentiment(test_50_df)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  tagged_dataframe['review'][point] = ' '.join(tagged_dataframe['words'][point])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [32]:
test_90_prepped = prepare_space_for_sentiment(test_90_df)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  tagged_dataframe['review'][point] = ' '.join(tagged_dataframe['words'][point])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

What's the shortest review of our test set?

In [45]:
y = just_review_and_label
y_size = y.shape[0]

shortest_review = 100

for y_index in range(y_size):
    y_words = len(y['words'][y_index])

    if y_words < shortest_review:
        shortest_review = y_words
        print(shortest_review)

length_to_check = shortest_review

print(length_to_check)


46
44
30
29
26
25
21
10
10


In [51]:
y = just_review_and_label
y_size = y.shape[0]

a=0

for second_index in range(a+1,y_size):
    first_y_words = y['words'][a]
    second_y_words = y['words'][second_index]

    if first_y_words == second_y_words:
        print(first_y_words)
        a+=1


print(a)


0


Where are duplicates happening

In [53]:
x = just_review_and_label
y = test_10_df

x_size = x.shape[0]
y_size = y.shape[0]

length_to_check = 50

success_checks_performed = 0

for y_index in range(y_size):
    y_words = y['words'][y_index]
    double = 0

    

    for x_index in range(x_size):
        x_words = x['words'][x_index]
        

        if x_words[0:length_to_check] == y_words[0:length_to_check]:
            double+=1
            if double == 2:
                print(y_words)
            success_checks_performed +=1

print('Successful check count:', success_checks_performed)


['Kurt', 'Russell', 'is', 'at', 'his', 'best', 'as', 'the', 'man', 'who', 'lives', 'off', 'his', 'past', 'glories', 'Reno', 'Hightower', 'Robin', 'Williams', 'is', 'his', 'polar', 'opposite', 'in', 'a', 'rare', 'low', 'key', 'performance', 'as', 'Jack', 'Dundee', 'He', 'dropped', 'the', 'Big', 'Pass', 'in', 'more', 'ways', 'than', 'onebr', 'Youll', 'see', 'some', 'of', 'the', 'most', 'quotable', 'scenes', 'ever', 'put', 'into', 'one', 'film', 'as', 'Jack', 'hisses', 'at', 'a', 'rat', 'Reno', 'poses', 'and', 'the', 'call', 'of', 'the', 'caribou', 'goes', 'outbr', 'Dont', 'miss', 'this', 'classic', 'that', 'isnt', 'scared', 'to', 'show', 'football', 'in', 'the', 'mud', 'the', 'way', 'it', 'should', 'be', 'played', 'note', 'to', 'the', 'NFL']
['Ah', 'yet', 'another', 'Seagal', 'movieIn', 'no', 'less', 'than', 'a', 'few', 'mere', 'months', 'arrive', 'to', 'populate', 'the', 'video', 'store', 'shelvesAs', 'bad', 'as', 'SubmergedNoBut', 'that', 'is', 'not', 'saying', 'muchLike', 'perfume', '

how large can length stay whilst doing all 4959 checks?

100 length did 4882 successful checks

weirdly, 50 did 4967, implying maybe 8 overlaps

# Function that checks for equivalent reviews, and adds the labels to the tagged data so that we have 3 Cleaned, Tagged and Labelled test data sets

In [83]:
def check_for_twin_reviews(df_cleaned_and_tagged, df_cleaned_and_labelled):

    x = df_cleaned_and_tagged
    y = df_cleaned_and_labelled

    z = df_cleaned_and_tagged

    labelled_size = y.shape[0]
    tagged_size = x.shape[0]

    for tagged_index in range(tagged_size):
        tagged_words = x['words'][tagged_index]
        cropped_size = len(tagged_words)

        for labelled_index in range(labelled_size):
            labelled_words = y['words'][labelled_index]
    
            if tagged_words == labelled_words[0:cropped_size]:
                z['sentiment'][tagged_index] = y['sentiment'][labelled_index]

                break


    return z

In [84]:
test_10_tagged_labelled = check_for_twin_reviews(df_cleaned_and_labelled=just_review_and_label, df_cleaned_and_tagged=test_10_prepped)

In [73]:
def did_check_work(final_df):

    uh_oh = 0
    negs = 0
    possies = 0
    result = 'It flunked'

    for sentiment in final_df['sentiment']:
        if sentiment == 'negative':
            negs+=1
        elif sentiment == 'positive':
            possies +=1
        else:
            uh_oh+=1

    if uh_oh == 0:
        result = 'It worked'


    return (possies, negs, uh_oh, result)

In [81]:
did_check_work(final_df=test_10_tagged_labelled)

(2489, 2470, 0, 'It worked')

In [76]:
test_50_tagged_labelled = check_for_twin_reviews(df_cleaned_and_labelled=just_review_and_label, df_cleaned_and_tagged=test_50_prepped)

In [82]:
did_check_work(final_df=test_50_tagged_labelled)

(2489, 2470, 0, 'It worked')

In [78]:
test_90_tagged_labelled = check_for_twin_reviews(df_cleaned_and_labelled=just_review_and_label, df_cleaned_and_tagged=test_90_prepped)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  z['sentiment'][tagged_index] = y['sentiment'][labelled_index]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z

In [79]:
did_check_work(test_90_tagged_labelled)

(2489, 2470, 0, 'It worked')

In [80]:
test_50_tagged_labelled

Unnamed: 0,words,ne_tags,sentiment,review
0,"[Poorly, done, political, actioner, Badly, pho...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",negative,Poorly done political actioner Badly photograp...
1,"[Not, good, Mostly, because, you, dont, give, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",negative,Not good Mostly because you dont give a damn a...
2,"[FUTZ, is, the, only, show, preserved, from, t...","[O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-...",positive,FUTZ is the only show preserved from the exper...
3,"[Are, you, kidding, me, A, show, highlighting,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",negative,Are you kidding me A show highlighting someone...
4,"[If, you, like, to, watch, movies, because, th...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",negative,If you like to watch movies because they are p...
...,...,...,...,...
4954,"[For, a, TV, movie, this, was, definately, wor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",positive,For a TV movie this was definately worth seein...
4955,"[MILD, SPOILERSbr, In, this, wouldbe, satire, ...","[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...",negative,MILD SPOILERSbr In this wouldbe satire Chaplin...
4956,"[When, evaluating, documentaries, that, focus,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",negative,When evaluating documentaries that focus a rel...
4957,"[This, show, comes, up, with, interesting, loc...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",negative,This show comes up with interesting locations ...
