In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, make_scorer
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec



In [4]:
data = pd.read_csv('contracts-clauses-datasets.csv')


In [5]:
data.dropna(inplace=True)


In [6]:
# Preprocess the text data

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text) # Remove punctuation and special characters
    text = text.lower() # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words]) # Remove stop words
    return text
data['Text'] = data['Text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Split the dataset into training and testing sets

train_data, test_data, train_labels, test_labels = train_test_split(data['Text'], data['Label'], test_size=0.2, random_state=42)


In [8]:
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_data)
test_features = vectorizer.transform(test_data)

In [9]:
clf = MultinomialNB()
clf.fit(train_features, train_labels)

In [10]:
test_predictions = clf.predict(test_features)


In [42]:
cm = pd.crosstab(test_labels, test_predictions, rownames=['True Label'], colnames=['Predicted Label'], margins=True)
print("Multinomial Naive Bayes Confusion matrix:\n", cm)

print("Classification Report:")
print(classification_report(test_labels, test_predictions))

Multinomial Naive Bayes Confusion matrix:
 Predicted Label         Assignment  Confidentiality  Counterparts  \
True Label                                                          
Assignment                     101                0             0   
Confidentiality                  0               74             0   
Counterparts                     0                0           121   
Definitions                      0                1             0   
Entire                           0                1             0   
Governing                        0                0             0   
Headings                         0                0             0   
Indemnification                  0                0             0   
Insurance                        0                0             0   
Miscellaneous                    4                0             7   
NOW                              0                0             0   
Notices                          0                0         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
#SVM
clf = SVC(kernel='linear', C=1.0, random_state=42)
clf.fit(train_features, train_labels)
test_predictions = clf.predict(test_features)

# Calculate the confusion matrix
cm = confusion_matrix(test_labels, test_predictions)

print("SVM Confusion matrix:\n", cm)
print(classification_report(test_labels, test_predictions))


SVM Confusion matrix:
 [[101   0   0 ...   0   0   0]
 [  0  74   0 ...   0   0   0]
 [  0   0 121 ...   0   0   0]
 ...
 [  0   0   0 ...   2   0   0]
 [  0   0   0 ...   0 195   0]
 [  0   0   0 ...   0   0  30]]
                        precision    recall  f1-score   support

            Assignment       0.95      0.94      0.94       108
       Confidentiality       0.96      0.99      0.97        75
          Counterparts       0.95      0.98      0.96       123
           Definitions       0.89      0.90      0.89       176
                Entire       0.99      0.97      0.98       143
             Governing       0.90      1.00      0.94        60
              Headings       0.98      0.99      0.99       168
       Indemnification       0.92      0.92      0.92        60
             Insurance       0.99      0.97      0.98        87
         Miscellaneous       0.75      0.59      0.66       117
                   NOW       0.96      1.00      0.98       103
               N

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Train a Decision Tree classifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(train_features, train_labels)
# Predict labels for the test data using Decision Tree
dtc_predictions = dtc.predict(test_features)
dtc_cm = confusion_matrix(test_labels, dtc_predictions)

print("Decision Tree Confusion matrix:\n", dtc_cm)


Decision Tree Confusion matrix:
 [[ 95   0   0 ...   0   1   0]
 [  0  66   0 ...   0   0   0]
 [  0   0 116 ...   0   0   0]
 ...
 [  0   0   0 ...   1   0   0]
 [  0   1   0 ...   0 183   0]
 [  0   0   0 ...   1   0  24]]


In [13]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train_features, train_labels)

# Predict labels for the test data using Random Forest
rfc_predictions = rfc.predict(test_features)
rfc_cm = confusion_matrix(test_labels, rfc_predictions)
print("Random Forest Confusion matrix:\n", rfc_cm)




Random Forest Confusion matrix:
 [[101   0   0 ...   0   0   0]
 [  0  74   0 ...   0   0   0]
 [  0   0 121 ...   0   0   0]
 ...
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   0 200   0]
 [  0   0   0 ...   0   1  27]]


In [14]:
print("Decision Tree Classification Report:\n", classification_report(test_labels, dtc_predictions))
print("Random Forest Classification Report:\n", classification_report(test_labels, rfc_predictions))

Decision Tree Classification Report:
                         precision    recall  f1-score   support

            Assignment       0.90      0.88      0.89       108
       Confidentiality       0.85      0.88      0.86        75
          Counterparts       0.94      0.94      0.94       123
           Definitions       0.77      0.87      0.82       176
                Entire       0.96      0.83      0.89       143
             Governing       0.90      0.93      0.92        60
              Headings       0.98      0.96      0.97       168
       Indemnification       0.81      0.77      0.79        60
             Insurance       0.80      0.84      0.82        87
         Miscellaneous       0.45      0.41      0.43       117
                   NOW       0.94      0.99      0.97       103
               Notices       0.92      0.88      0.90        75
       Representations       0.94      0.96      0.95       112
          Severability       0.92      0.94      0.93       114
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Classification Report:
                         precision    recall  f1-score   support

            Assignment       0.90      0.94      0.92       108
       Confidentiality       0.94      0.99      0.96        75
          Counterparts       0.94      0.98      0.96       123
           Definitions       0.84      0.90      0.87       176
                Entire       0.95      0.99      0.97       143
             Governing       0.80      1.00      0.89        60
              Headings       0.95      0.99      0.97       168
       Indemnification       0.89      0.92      0.90        60
             Insurance       0.98      0.98      0.98        87
         Miscellaneous       0.86      0.32      0.46       117
                   NOW       0.98      1.00      0.99       103
               Notices       0.94      0.99      0.96        75
       Representations       0.98      0.99      0.99       112
          Severability       0.93      0.99      0.96       114
 