In [22]:
import os
import re
import pickle
import collections

import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin

def read_pickle(path_in, name_in):
    """Load and return the object pickled at ``path_in + name_in + ".pk"``.

    Args:
        path_in: Directory prefix. It is concatenated as-is, so it must already
            end with a path separator.
        name_in: File name without the ``.pk`` extension.

    Returns:
        The unpickled Python object.

    Raises:
        FileNotFoundError: If the composed path does not exist.
    """
    file_path = path_in + name_in + ".pk"
    # The context manager closes the handle even if unpickling fails.
    with open(file_path, 'rb') as fh:
        # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
        return pickle.load(fh)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haleychen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Load the homework dataset from a local pickle (directory prefix + "hw4" + ".pk").
# NOTE(review): absolute, machine-specific path; edit it before running on another machine.
data = read_pickle("/Users/haleychen/Downloads/", "hw4")

### Preprocessing 

In [6]:
# Pull out the raw text column and keep an independent copy of it.
df = data['body']
df_copy = df.copy()

# Preview the first few documents.
df.head()

0     We use essential cookies to make Venngage wor...
1    A legal contract is a written document that is...
2     November 27 2023 14 min Author Olga Asheychik...
3    Accelerate contracts with AI native workflows ...
4    Create smarter agreements commit to them more ...
Name: body, dtype: object

In [7]:


# 1. Remove Stop Words
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, fetching it from NLTK only on first use."""
    if 'english' not in _STOPWORDS_CACHE:
        nltk.download('stopwords', quiet=True)
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def rem_sw(text):
    """Remove English stop words from a whitespace-tokenized string.

    Matching is case-insensitive, and surviving words keep their original
    casing. The stop-word list is loaded once and reused, so calling this per
    row (e.g. through ``Series.apply``) does not repeat the download check or
    rebuild the set.

    Args:
        text: Input string; tokens are split on whitespace.

    Returns:
        The remaining tokens joined by single spaces.
    """
    sw = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in sw)

# 2. Stemming or Lemmatization
_WORD_NORMALIZERS = {}


def _word_normalizer(method):
    """Return the cached per-word callable for ``method``, building it on first use."""
    # Validate before doing any work so a bad argument fails fast.
    if method not in ('ps', 'lemma'):
        raise ValueError("method must be 'ps' (PorterStemmer) or 'lemma'")
    if method not in _WORD_NORMALIZERS:
        if method == 'ps':
            _WORD_NORMALIZERS[method] = PorterStemmer().stem
        else:
            # Only the lemmatizer needs the wordnet corpus.
            nltk.download('wordnet', quiet=True)
            _WORD_NORMALIZERS[method] = WordNetLemmatizer().lemmatize
    return _WORD_NORMALIZERS[method]


def stem_fun(text, method='ps'):
    """Stem or lemmatize every whitespace-separated token of ``text``.

    The stemmer/lemmatizer is created once per method and reused, and the
    wordnet download is attempted only the first time ``method='lemma'`` is
    requested, so per-row use through ``Series.apply`` stays cheap.

    Args:
        text: Input string; tokens are split on whitespace.
        method: ``'ps'`` for ``PorterStemmer`` or ``'lemma'`` for
            ``WordNetLemmatizer``.

    Returns:
        The transformed tokens joined by single spaces.

    Raises:
        ValueError: If ``method`` is neither ``'ps'`` nor ``'lemma'``.
    """
    normalize = _word_normalizer(method)
    return ' '.join(normalize(w) for w in text.split())

# 3. Vectorization (Count / TF-IDF)
def vec_fun(text_series, m_in=1, n_in=1, vec_type='tf', o_path=None):
    """Fit a bag-of-words vectorizer and return the document-term matrix as a DataFrame.

    Args:
        text_series: Iterable of documents (strings) to fit on.
        m_in: Lower bound of the n-gram range.
        n_in: Upper bound of the n-gram range.
        vec_type: ``"tf"`` selects ``CountVectorizer``; any other value selects
            ``TfidfVectorizer``.
        o_path: Optional file path. When given, the fitted vectorizer is
            pickled there. A bare filename (no directory part) is written to
            the current working directory.

    Returns:
        A dense DataFrame with one row per document and one column per feature
        name reported by the vectorizer.
    """
    vectorizer_cls = CountVectorizer if vec_type == "tf" else TfidfVectorizer
    vectorizer = vectorizer_cls(ngram_range=(m_in, n_in))

    vec_data = vectorizer.fit_transform(text_series)
    # toarray() densifies the matrix; memory grows with documents x vocabulary size.
    df_out = pd.DataFrame(vec_data.toarray(), columns=vectorizer.get_feature_names_out())

    if o_path:
        # dirname is '' for a bare filename, and os.makedirs('') raises, so skip it then.
        out_dir = os.path.dirname(o_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(o_path, 'wb') as f:
            pickle.dump(vectorizer, f)

    return df_out

In [18]:


# Build the cleaned text column: drop stop words first, then lemmatize each document.
df = (
    data['body']
    .apply(rem_sw)                                  # Remove stopwords
    .apply(lambda x: stem_fun(x, method='lemma'))   # Apply lemmatization
)

# Store the result next to the raw text so both versions stay available.
data['cleaned_body'] = df



In [20]:
# Preview the first rows of the frame, including the cleaned_body column.
data.head()

Unnamed: 0,body,label,cleaned_body
0,We use essential cookies to make Venngage wor...,legal_contract_examples,use essential cooky make Venngage work clickin...
1,A legal contract is a written document that is...,legal_contract_examples,legal contract written document drawn party ag...
2,November 27 2023 14 min Author Olga Asheychik...,legal_contract_examples,November 27 2023 14 min Author Olga Asheychik ...
3,Accelerate contracts with AI native workflows ...,legal_contract_examples,Accelerate contract AI native workflow Advance...
4,Create smarter agreements commit to them more ...,legal_contract_examples,Create smarter agreement commit efficiently ma...


In [24]:
# List the distinct class labels present in the target column.
data['label'].unique()

array(['legal_contract_examples', 'marketing_material_examples',
       'engineering_specification_examples'], dtype=object)

In [30]:
# Encode the categorical target as integer class ids for the classifier.
# LabelEncoder is imported with the other sklearn imports at the top of the notebook.
le = LabelEncoder()

# NOTE: this overwrites the string labels in place. Re-running the cell refits the
# encoder on the integer ids, so `le` then no longer maps back to the original names.
data['label'] = le.fit_transform(data['label'])  # Converts labels to 0, 1, 2
data.head()

Unnamed: 0,body,label,cleaned_body
0,We use essential cookies to make Venngage wor...,1,use essential cooky make Venngage work clickin...
1,A legal contract is a written document that is...,1,legal contract written document drawn party ag...
2,November 27 2023 14 min Author Olga Asheychik...,1,November 27 2023 14 min Author Olga Asheychik ...
3,Accelerate contracts with AI native workflows ...,1,Accelerate contract AI native workflow Advance...
4,Create smarter agreements commit to them more ...,1,Create smarter agreement commit efficiently ma...


# ML functions 

In [48]:



# Features are the cleaned documents; the target is the label column.
X, y = data["cleaned_body"], data["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Custom placeholder transformers for embedding types
class PretrainedEmbeddingTransformer(TransformerMixin):
    """Placeholder for a pre-trained embedding featurizer.

    It emits uniform random vectors in [0, 1) instead of real embeddings, so
    any score computed from it is only a noise baseline. The vectors come from
    a seeded generator, which makes repeated runs reproducible.

    Args:
        dim: Number of columns in the generated feature matrix.
        random_state: Seed for the random generator.
    """

    def __init__(self, dim=300, random_state=42):
        self.dim = dim
        self.random_state = random_state
        # Created eagerly so transform() also works without a prior fit().
        self._rng = np.random.default_rng(random_state)

    def fit(self, X, y=None):
        """Restart the random stream so a fit -> transform sequence is repeatable."""
        self._rng = np.random.default_rng(self.random_state)
        return self

    def transform(self, X):
        """Return a ``(len(X), dim)`` array of random features."""
        return self._rng.random((len(X), self.dim))  # Replace with real embedding logic

class DomainEmbeddingTransformer(TransformerMixin):
    """Placeholder for a domain-trained embedding featurizer.

    It emits uniform random vectors in [0, 1) instead of real embeddings, so
    any score computed from it is only a noise baseline. The vectors come from
    a seeded generator, which makes repeated runs reproducible.

    Args:
        dim: Number of columns in the generated feature matrix.
        random_state: Seed for the random generator.
    """

    def __init__(self, dim=300, random_state=43):
        self.dim = dim
        self.random_state = random_state
        # Created eagerly so transform() also works without a prior fit().
        self._rng = np.random.default_rng(random_state)

    def fit(self, X, y=None):
        """Restart the random stream so a fit -> transform sequence is repeatable."""
        self._rng = np.random.default_rng(self.random_state)
        return self

    def transform(self, X):
        """Return a ``(len(X), dim)`` array of random features."""
        return self._rng.random((len(X), self.dim))  # Replace with real domain-trained embedding logic

# Helper function
def evaluate_model(X_train, X_test, y_train, y_test, vectorizer, apply_chi2=False, k=1000):
    """Featurize the text, train a random forest and score it on the test split.

    Args:
        X_train: Training documents.
        X_test: Test documents.
        y_train: Training labels.
        y_test: Test labels.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
        apply_chi2: When true and ``vectorizer`` is a ``CountVectorizer`` or
            ``TfidfVectorizer``, keep only the top features by chi-squared
            score. It is ignored for any other kind of transformer.
        k: Maximum number of features to keep; capped at the number of
            features the vectorizer produced.

    Returns:
        ``(precision, recall, fscore)`` using weighted averaging.
    """
    # The featurizer is fitted on the training split only, then applied to the test split.
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    is_bag_of_words = isinstance(vectorizer, (CountVectorizer, TfidfVectorizer))
    if is_bag_of_words and apply_chi2:
        n_keep = min(k, train_features.shape[1])
        selector = SelectKBest(chi2, k=n_keep)
        train_features = selector.fit_transform(train_features, y_train)
        test_features = selector.transform(test_features)

    forest = RandomForestClassifier(random_state=42)
    forest.fit(train_features, y_train)
    predictions = forest.predict(test_features)

    prec, rec, f1, _support = precision_recall_fscore_support(y_test, predictions, average='weighted')
    return prec, rec, f1

# Run all models
results = []

ngram_ranges = [(1,1), (1,2), (1,3)]
vectorizer_types = ["tf", "tfidf"]
chi2_options = [False, True]

# Bag-of-words grid: every n-gram range x vectorizer type x chi-squared on/off.
for ngram in ngram_ranges:
    for vec_type in vectorizer_types:
        # "tf" means raw term counts; anything else means TF-IDF weighting.
        vec_name, vec_cls = ("tf", CountVectorizer) if vec_type == "tf" else ("tf-idf", TfidfVectorizer)
        for use_chi2 in chi2_options:
            # A fresh, unfitted vectorizer for each configuration.
            vectorizer = vec_cls(ngram_range=ngram)
            vec_label = f"{vec_name} {'with' if use_chi2 else 'without'} chi-squared"
            precision, recall, fscore = evaluate_model(
                X_train, X_test, y_train, y_test, vectorizer, use_chi2
            )
            results.append(["Random Forest", ngram, "body", vec_label, precision, recall, fscore])

# Embedding-based featurizers; these have no n-gram setting.
embedding_transformers = [
    ("pre-trained embedding", PretrainedEmbeddingTransformer()),
    ("domain embedding", DomainEmbeddingTransformer()),
]
for emb_type, transformer in embedding_transformers:
    precision, recall, fscore = evaluate_model(X_train, X_test, y_train, y_test, transformer)
    results.append(["Random Forest", "N/A", "body", emb_type, precision, recall, fscore])

# Final dataframe
result_columns = ["Model", "n-gram", "column", "vectorizer", "precision", "recall", "fscore"]
results_df = pd.DataFrame(results, columns=result_columns)


In [50]:

# Order rows by vectorizer description, then by n-gram range, so related runs sit together.
# NOTE(review): this rebinds results_df; the original insertion order survives only in the index.
results_df = results_df.sort_values(by=['vectorizer', 'n-gram'])
print(results_df)

            Model  n-gram column                  vectorizer  precision  \
13  Random Forest     N/A   body            domain embedding   0.413603   
12  Random Forest     N/A   body       pre-trained embedding   0.382222   
1   Random Forest  (1, 1)   body         tf with chi-squared   0.758304   
5   Random Forest  (1, 2)   body         tf with chi-squared   0.794771   
9   Random Forest  (1, 3)   body         tf with chi-squared   0.744180   
0   Random Forest  (1, 1)   body      tf without chi-squared   0.864646   
4   Random Forest  (1, 2)   body      tf without chi-squared   0.864646   
8   Random Forest  (1, 3)   body      tf without chi-squared   0.864646   
3   Random Forest  (1, 1)   body     tf-idf with chi-squared   0.876190   
7   Random Forest  (1, 2)   body     tf-idf with chi-squared   0.876190   
11  Random Forest  (1, 3)   body     tf-idf with chi-squared   0.876190   
2   Random Forest  (1, 1)   body  tf-idf without chi-squared   0.862626   
6   Random Forest  (1, 2)

In [52]:
# Persist the results table as CSV, omitting the DataFrame index.
# NOTE(review): absolute, machine-specific output path; adjust it before running elsewhere.
results_df.to_csv('/Users/haleychen/Columbia_NLP/sorted_results.csv', index=False)