# Set up Libraries

In [64]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

### Step 1 - Create list of files in directory and import them

In [65]:
def read_files_into_dataframe(directory):
    files = os.listdir(directory)
    df = pd.DataFrame()
    for file in files:
        file_path = os.path.join(directory, file)
        new_data = pd.read_csv(file_path)
        df = pd.concat([df, new_data])
        print(file)
    return df

directory = 'Clauses'
data = read_files_into_dataframe(directory)
print(data.head(20))

indemnification-and-contribution.csv
indemnification.csv
confidentiality.csv
indemnification-by-the-company.csv
contribution.csv
participations.csv
arbitration.csv
confidential-information.csv
capitalization.csv
payment-of-expenses.csv
                                          clause_text                       clause_type
0   Indemnification and Contribution. (a) The Comp...  indemnification-and-contribution
1   Indemnification and Contribution. (a) The Comp...  indemnification-and-contribution
2   Indemnification and Contribution. (a) The Comp...  indemnification-and-contribution
3   Indemnification and Contribution. (a) The Comp...  indemnification-and-contribution
4   Indemnification and Contribution. A. The Issue...  indemnification-and-contribution
5   Indemnification and Contribution. (a) The Comp...  indemnification-and-contribution
6   Indemnification and Contribution. (a) In the e...  indemnification-and-contribution
7   Indemnification and Contribution. (a) The Comp...  indem

### Step 2 - Remove the name of the clause from the Clause Text by removing all text up to and including the first period 

In [66]:
# create a new column without the text up to the first period
data['modified_clause_text'] = data['clause_text'].str.replace("^[^.]*\.", "", regex=True)

In [67]:
# export to allow eyeball check
data.to_csv('exported_dataframe.csv', index=False)

In [68]:
# drop the clause text column
data.drop('clause_text', axis=1, inplace=True)

In [69]:
# Get basic data description by category
unique_values = data['clause_type'].unique()
print(unique_values)

['indemnification-and-contribution' 'indemnification' 'confidentiality'
 'indemnification-by-the-company' 'contribution' 'participations'
 'arbitration' 'confidential-information' 'capitalization'
 'payment-of-expenses']


### Step 3 - Explore the Dataset

In [70]:
# Count number of records
print(len(data))

2150


In [71]:
#  Get average number of words in modified_clause_text
# Assuming that 'data' is the DataFrame and 'modified_clause_text' is the column name
average_word_count = data['modified_clause_text'].str.split().str.len().mean()
rounded_average_word_count = round(average_word_count)
print(f"Rounded average word count: {rounded_average_word_count}")

Rounded average word count: 243


In [72]:
# Get some descriptive stats by category
labels = data['clause_type'].unique()

# First, create a new column 'word_count' which represents the word count of each record in 'modified_clause_text' column
data['word_count'] = data['modified_clause_text'].str.split().str.len()

results = []  
# Iterate over each label
for label in labels:
    # Filter the records for the current label
    label_data = data[data['clause_type'] == label]

    # Calculate stats for current label
    record_count = len(label_data)
    min_word_count = label_data['word_count'].min()
    max_word_count = label_data['word_count'].max()
    avg_word_count = round(label_data['word_count'].mean())

    # Append the stats for current label to the results list
    results.append({'Label': label,
                    'Record Count': record_count,
                    'Min Word Count': min_word_count,
                    'Max Word Count': max_word_count,
                    'Avg Word Count': avg_word_count})

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Set display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.options.display.float_format = '{:.2f}'.format

# Display the DataFrame
print(results_df.to_string(index=False))



                           Label  Record Count  Min Word Count  Max Word Count  Avg Word Count
indemnification-and-contribution           180               5             415             303
                 indemnification           210               4             418             247
                 confidentiality           220               6             416             242
  indemnification-by-the-company           230               4             430             218
                    contribution           180              50             424             327
                  participations           210              29             428             253
                     arbitration           240               2             408             212
        confidential-information           240               3             410             195
                  capitalization           200              13             422             275
             payment-of-expenses           240    

### Step 4 - Split Data into Training and Test

In [73]:
# Assuming 'data' is your DataFrame
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

### Step 5 - Save the Train and Test Data for use by the Notebooks for each Model

In [74]:
# Save training set to CSV
train_data.to_csv("train_data.csv", index=False)

# Save test set to CSV
test_data.to_csv("test_data.csv", index=False)


Candidates
- SCI-Kitlearn TFIDF vectoriser - BOW words approach
- Word 2 Vec - CBOW TFIDF
- Doc2Vec
- Universal Sentence Encoder (Tensorflow Hub)
- 